/* s2p - Serial to Parallel Bit Stream Transposition
   Copyright (c) 2007, 2008, 2010, Robert D. Cameron.
   Licensed to the public under the Open Software License 3.0.
*/

#ifndef S2P_H
#define S2P_H

/* Both aliases name the same SIMD register type; they are used below to
   document intent: BytePack for serial byte data, BitBlock for a block
   of one parallel bit stream.  SIMD_type must be defined before
   including this header. */
#define BytePack SIMD_type
#define BitBlock SIMD_type

/* Given a block of bytes in 8 consecutive registers s0, s1, ..., s7,
   s2p transposes the block into 8 parallel bitstream blocks p0, p1, ..., p7.

   The following header shows the intent, although a macro is used for
   speed.
   static inline void s2p(BytePack s0, BytePack s1, BytePack s2, BytePack s3,
                          BytePack s4, BytePack s5, BytePack s6, BytePack s7,
                          BitBlock& p0, BitBlock& p1, BitBlock& p2, BitBlock& p3,
                          BitBlock& p4, BitBlock& p5, BitBlock& p6, BitBlock& p7);
*/

/* 1. ALGORITHM Selection.
   Choice of 3 algorithms: s2p_ideal, s2p_movemask, s2p_bytepack.
   Default is s2p_bytepack.
   Compile with -DUSE_S2P_IDEAL or -DUSE_S2P_MOVEMASK to override.
*/

#ifdef USE_S2P_IDEAL
#define S2P_ALGORITHM s2p_ideal
#endif

/* NOTE(review): if both USE_S2P_IDEAL and USE_S2P_MOVEMASK are defined,
   this second #define redefines S2P_ALGORITHM (typically a compiler
   warning) and movemask wins. */
#ifdef USE_S2P_MOVEMASK
#define S2P_ALGORITHM s2p_movemask
#endif

/* Default algorithm when neither override macro is defined. */
#ifndef S2P_ALGORITHM
#define S2P_ALGORITHM s2p_bytepack
#endif



/* s2p: dispatch to the selected algorithm.  On little-endian machines
   the eight bytepack arguments are passed in reversed order; the output
   parameters p0..p7 are unchanged.  BYTE_ORDER / BIG_ENDIAN /
   LITTLE_ENDIAN must be defined (e.g. via <endian.h>) before this
   header is included, otherwise neither branch defines s2p. */
#if (BYTE_ORDER == BIG_ENDIAN)
#define s2p(s0, s1, s2, s3, s4, s5, s6, s7, p0, p1, p2, p3, p4, p5, p6, p7)\
  S2P_ALGORITHM(s0, s1, s2, s3, s4, s5, s6, s7, p0, p1, p2, p3, p4, p5, p6, p7)
#endif
#if (BYTE_ORDER == LITTLE_ENDIAN)
#define s2p(s0, s1, s2, s3, s4, s5, s6, s7, p0, p1, p2, p3, p4, p5, p6, p7)\
  S2P_ALGORITHM(s7, s6, s5, s4, s3, s2, s1, s0, p0, p1, p2, p3, p4, p5, p6, p7)
#endif

/* s2p_ideal is an ideal serial-to-parallel transposition algorithm,
   given an architecture with native support for the
   simd_pack_{8,4,2}_{hh,ll} operations; it achieves transposition
   of 8 serial bytepacks into 8 parallel bitblocks in only 24 pack
   operations.
*/

/* s2p_ideal: three stages of 8 pack operations each (24 total).
   Naming convention for temporaries: bitWXYZ_k interleaves bit streams
   W, X, Y, Z of the input bytes.  Each stage halves the number of
   interleaved streams per register until p0..p7 each hold one stream.
   NOTE(review): assumes _hh selects the high half-bits and _ll the low
   half-bits of each field, per the simd library naming — confirm
   against the simd_pack definitions. */
#define s2p_ideal(s0, s1, s2, s3, s4, s5, s6, s7, p0, p1, p2, p3, p4, p5, p6, p7) \
do {\
  BitBlock bit0123_0, bit0123_1, bit0123_2, bit0123_3,\
           bit4567_0, bit4567_1, bit4567_2, bit4567_3;\
  BitBlock bit01_0, bit01_1, bit23_0, bit23_1, bit45_0, bit45_1, bit67_0, bit67_1;\
  /* Stage 1: split each byte into its bit 0-3 and bit 4-7 halves. */\
  bit0123_0 = simd_pack_8_hh(s0, s1);\
  bit0123_1 = simd_pack_8_hh(s2, s3);\
  bit0123_2 = simd_pack_8_hh(s4, s5);\
  bit0123_3 = simd_pack_8_hh(s6, s7);\
  bit4567_0 = simd_pack_8_ll(s0, s1);\
  bit4567_1 = simd_pack_8_ll(s2, s3);\
  bit4567_2 = simd_pack_8_ll(s4, s5);\
  bit4567_3 = simd_pack_8_ll(s6, s7);\
  /* Stage 2: split 4-stream groups into bit pairs. */\
  bit01_0 = simd_pack_4_hh(bit0123_0, bit0123_1);\
  bit01_1 = simd_pack_4_hh(bit0123_2, bit0123_3);\
  bit23_0 = simd_pack_4_ll(bit0123_0, bit0123_1);\
  bit23_1 = simd_pack_4_ll(bit0123_2, bit0123_3);\
  bit45_0 = simd_pack_4_hh(bit4567_0, bit4567_1);\
  bit45_1 = simd_pack_4_hh(bit4567_2, bit4567_3);\
  bit67_0 = simd_pack_4_ll(bit4567_0, bit4567_1);\
  bit67_1 = simd_pack_4_ll(bit4567_2, bit4567_3);\
  /* Stage 3: split bit pairs into the final single bit streams. */\
  p0 = simd_pack_2_hh(bit01_0, bit01_1);\
  p1 = simd_pack_2_ll(bit01_0, bit01_1);\
  p2 = simd_pack_2_hh(bit23_0, bit23_1);\
  p3 = simd_pack_2_ll(bit23_0, bit23_1);\
  p4 = simd_pack_2_hh(bit45_0, bit45_1);\
  p5 = simd_pack_2_ll(bit45_0, bit45_1);\
  p6 = simd_pack_2_hh(bit67_0, bit67_1);\
  p7 = simd_pack_2_ll(bit67_0, bit67_1);\
} while(0)


/* s2p_bytepack is a fast serial-to-parallel transposition algorithm
   for architectures that provide simd_pack_16 operations, but no pack
   operations at smaller field widths (MMX, SSE, Altivec, ...).
*/

/* s2p_step: one binary-division stage of the bytepack transposition.
   Packs the hh and ll halves of the 16-bit fields of s0:s1, then merges
   the two packs under hi_mask, shifting by 'shift' to line up the bit
   positions: p0 collects the high-selected bits, p1 the low-selected
   bits.  Multi-evaluation hazard: s0 and s1 are each expanded twice. */
#ifndef USE_S2P_AVX
#define s2p_step(s0, s1, hi_mask, shift, p0, p1) \
do {\
  BitBlock lo_pack, hi_pack;\
  lo_pack = simd_pack_16_ll(s0, s1);\
  hi_pack = simd_pack_16_hh(s0, s1);\
  p0 = simd_if(hi_mask, hi_pack, simd_srli_16(lo_pack, shift));\
  p1 = simd_if(hi_mask, simd_slli_16(hi_pack, shift), lo_pack);\
} while(0)
#endif


/* For AVX, we use a modified s2p_step function to avoid a number of
   conversions from 128-bit mode to 256-bit mode just to immediately
   convert back. */
#ifdef USE_S2P_AVX
/* 128-bit SSE helpers used to build the AVX s2p_step. */
#define sse_andc(b1, b2) _mm_andnot_si128(b2, b1)
#define sse_himask_16 _mm_set1_epi32(0xFF00FF00)
#define sse_slli_16(r, shft) _mm_slli_epi16(r, shft)
#define sse_srli_16(r, shft) _mm_srli_epi16(r, shft)
/* Operands deliberately swapped: _mm_packus_epi16 places its first
   operand's fields in the low half of the result, so passing (b, a)
   puts a's fields in the high half. */
#define sse_packus_16(a, b) _mm_packus_epi16(b, a)
/* Pack the low bytes of each 16-bit field; the andc with the himask
   clears the high bytes so the unsigned-saturating pack cannot clip. */
#define sse_pack_16(a, b) \
_mm_packus_epi16(sse_andc(b, sse_himask_16), sse_andc(a, sse_himask_16))
#define sse_pack_16_ll(v1, v2) sse_pack_16(v1, v2)
/* Pack the high bytes of each 16-bit field, exposed via a shift right by 8. */
#define sse_pack_16_hh(v1, v2) sse_packus_16(sse_srli_16(v1, 8), sse_srli_16(v2, 8))

/* AVX variant of s2p_step: split each 256-bit operand into two 128-bit
   halves, perform the packs and shifts in 128-bit mode, and recombine
   to 256 bits only for the final simd_if merge under hi_mask. */
#define s2p_step(s0, s1, hi_mask, shift, p0, p1) \
do {\
  __m128i s00, s01, s10, s11, t00, t01, t10, t11;\
  __m128i t10shift, t11shift, t00shift, t01shift;\
  s00 = simd_hi128(s0);\
  s01 = simd_lo128(s0);\
  s10 = simd_hi128(s1);\
  s11 = simd_lo128(s1);\
  t00 = sse_pack_16_hh(s00, s01);\
  t10 = sse_pack_16_ll(s00, s01);\
  t01 = sse_pack_16_hh(s10, s11);\
  t11 = sse_pack_16_ll(s10, s11);\
  t10shift = sse_srli_16(t10, shift);\
  t11shift = sse_srli_16(t11, shift);\
  t00shift = sse_slli_16(t00, shift);\
  t01shift = sse_slli_16(t01, shift);\
  p0 = simd_if(hi_mask, simd_combine256(t00, t01), simd_combine256(t10shift, t11shift));\
  p1 = simd_if(hi_mask, simd_combine256(t00shift, t01shift), simd_combine256(t10, t11));\
} while(0)
#endif



/* s2p_bytepack: transposition via three rounds of s2p_step (12 steps).
   Temporary naming: the digit string lists the bit streams still
   interleaved in that register (e.g. bit00224466_0 interleaves streams
   0, 2, 4 and 6).  Round 1 (himask_2, shift 1) separates even from odd
   bit positions; round 2 (himask_4, shift 2) separates by pairs;
   round 3 (himask_8, shift 4) yields the single streams p0..p7. */
#define s2p_bytepack(s0, s1, s2, s3, s4, s5, s6, s7, p0, p1, p2, p3, p4, p5, p6, p7) \
do {\
  BitBlock bit00224466_0,bit00224466_1,bit00224466_2,bit00224466_3;\
  BitBlock bit11335577_0,bit11335577_1,bit11335577_2,bit11335577_3;\
  BitBlock bit00004444_0,bit22226666_0,bit00004444_1,bit22226666_1;\
  BitBlock bit11115555_0,bit33337777_0,bit11115555_1,bit33337777_1;\
  s2p_step(s0,s1,simd_himask_2,1,bit00224466_0,bit11335577_0);\
  s2p_step(s2,s3,simd_himask_2,1,bit00224466_1,bit11335577_1);\
  s2p_step(s4,s5,simd_himask_2,1,bit00224466_2,bit11335577_2);\
  s2p_step(s6,s7,simd_himask_2,1,bit00224466_3,bit11335577_3);\
  s2p_step(bit00224466_0,bit00224466_1,simd_himask_4,2,bit00004444_0,bit22226666_0);\
  s2p_step(bit00224466_2,bit00224466_3,simd_himask_4,2,bit00004444_1,bit22226666_1);\
  s2p_step(bit11335577_0,bit11335577_1,simd_himask_4,2,bit11115555_0,bit33337777_0);\
  s2p_step(bit11335577_2,bit11335577_3,simd_himask_4,2,bit11115555_1,bit33337777_1);\
  s2p_step(bit00004444_0,bit00004444_1,simd_himask_8,4,p0,p4);\
  s2p_step(bit11115555_0,bit11115555_1,simd_himask_8,4,p1,p5);\
  s2p_step(bit22226666_0,bit22226666_1,simd_himask_8,4,p2,p6);\
  s2p_step(bit33337777_0,bit33337777_1,simd_himask_8,4,p3,p7);\
} while(0)




/* BitPack holds one simd_movemask_8 result.  For sizeof(SIMD_type) = 16
   bytes, each movemask yields 16 bits, and the 8 BitPacks produced by
   movemask_step exactly fill one SIMD_type via the union below. */
typedef uint16_t BitPack;

/* movemask_step: extract one bit position from eight bytepacks into a
   single BitBlock p.  simd_movemask_8 gathers one bit per byte
   (presumably the per-byte high/sign bit, cf. pmovmskb — confirm in the
   simd library); eight such masks fill the union's bit_block.  The two
   variants differ only in parameter naming: under LITTLE_ENDIAN the
   caller's arguments bind in reversed order, so bit_pack[] is filled in
   the order required by the machine byte order. */
#if (BYTE_ORDER == BIG_ENDIAN)
#define movemask_step(s0, s1, s2, s3, s4, s5, s6, s7, p) \
do { \
  union { BitPack bit_pack[8];\
          SIMD_type bit_block;\
  } b;\
  b.bit_pack[0] = simd_movemask_8(s0);\
  b.bit_pack[1] = simd_movemask_8(s1);\
  b.bit_pack[2] = simd_movemask_8(s2);\
  b.bit_pack[3] = simd_movemask_8(s3);\
  b.bit_pack[4] = simd_movemask_8(s4);\
  b.bit_pack[5] = simd_movemask_8(s5);\
  b.bit_pack[6] = simd_movemask_8(s6);\
  b.bit_pack[7] = simd_movemask_8(s7);\
  p = b.bit_block;\
} while (0)
#endif
#if (BYTE_ORDER == LITTLE_ENDIAN)
#define movemask_step(s7, s6, s5, s4, s3, s2, s1, s0, p) \
do { \
  union { BitPack bit_pack[8];\
          SIMD_type bit_block;\
  } b;\
  b.bit_pack[0] = simd_movemask_8(s0);\
  b.bit_pack[1] = simd_movemask_8(s1);\
  b.bit_pack[2] = simd_movemask_8(s2);\
  b.bit_pack[3] = simd_movemask_8(s3);\
  b.bit_pack[4] = simd_movemask_8(s4);\
  b.bit_pack[5] = simd_movemask_8(s5);\
  b.bit_pack[6] = simd_movemask_8(s6);\
  b.bit_pack[7] = simd_movemask_8(s7);\
  p = b.bit_block;\
} while (0)
#endif


/* bitshift_step: t_i = s_i + s_i, i.e. shift every 8-bit field of each
   register left by one bit (doubling within a lane is a left shift;
   lane boundaries are preserved by the 8-bit add).  Used to bring the
   next bit position into extraction position for movemask_step. */
#define bitshift_step(s0, s1, s2, s3, s4, s5, s6, s7, t0, t1, t2, t3, t4, t5, t6, t7) \
do { \
  t0 = simd_add_8(s0, s0);\
  t1 = simd_add_8(s1, s1);\
  t2 = simd_add_8(s2, s2);\
  t3 = simd_add_8(s3, s3);\
  t4 = simd_add_8(s4, s4);\
  t5 = simd_add_8(s5, s5);\
  t6 = simd_add_8(s6, s6);\
  t7 = simd_add_8(s7, s7);\
} while (0)


/* s2p_movemask: transposition by successive bit extraction.
   Each movemask_step captures the currently-extractable bit of every
   byte into one output stream; each bitshift_step then shifts all bytes
   left one bit so the following pass extracts the next stream.
   Alternates 8 extractions with 7 shifts to produce p0..p7. */
#define s2p_movemask(s0, s1, s2, s3, s4, s5, s6, s7, p0, p1, p2, p3, p4, p5, p6, p7) \
do { \
  BitBlock t0, t1, t2, t3, t4, t5, t6, t7;\
  movemask_step(s0, s1, s2, s3, s4, s5, s6, s7, p0);\
  bitshift_step(s0, s1, s2, s3, s4, s5, s6, s7, t0, t1, t2, t3, t4, t5, t6, t7);\
  movemask_step(t0, t1, t2, t3, t4, t5, t6, t7, p1);\
  bitshift_step(t0, t1, t2, t3, t4, t5, t6, t7, t0, t1, t2, t3, t4, t5, t6, t7);\
  movemask_step(t0, t1, t2, t3, t4, t5, t6, t7, p2);\
  bitshift_step(t0, t1, t2, t3, t4, t5, t6, t7, t0, t1, t2, t3, t4, t5, t6, t7);\
  movemask_step(t0, t1, t2, t3, t4, t5, t6, t7, p3);\
  bitshift_step(t0, t1, t2, t3, t4, t5, t6, t7, t0, t1, t2, t3, t4, t5, t6, t7);\
  movemask_step(t0, t1, t2, t3, t4, t5, t6, t7, p4);\
  bitshift_step(t0, t1, t2, t3, t4, t5, t6, t7, t0, t1, t2, t3, t4, t5, t6, t7);\
  movemask_step(t0, t1, t2, t3, t4, t5, t6, t7, p5);\
  bitshift_step(t0, t1, t2, t3, t4, t5, t6, t7, t0, t1, t2, t3, t4, t5, t6, t7);\
  movemask_step(t0, t1, t2, t3, t4, t5, t6, t7, p6);\
  bitshift_step(t0, t1, t2, t3, t4, t5, t6, t7, t0, t1, t2, t3, t4, t5, t6, t7);\
  movemask_step(t0, t1, t2, t3, t4, t5, t6, t7, p7);\
} while (0)


#endif /* S2P_H */
