1 | /* s2p - Serial to Parallel Bit Stream Transposition |
---|
2 | Copyright (c) 2007, 2008, 2010, Robert D. Cameron. |
---|
3 | Licensed to the public under the Open Software License 3.0. |
---|
4 | */ |
---|
5 | |
---|
6 | #ifndef S2P_H |
---|
7 | #define S2P_H |
---|
8 | |
---|
9 | #define BytePack SIMD_type |
---|
10 | #define BitBlock SIMD_type |
---|
11 | |
---|
12 | /* Given a block of bytes in 8 consecutive registers s0, s1, ..., s7, |
---|
13 | s2p transposes the block into 8 parallel bitstream blocks p0, p1, ..., p7. |
---|
14 | |
---|
15 | The following header shows the intent, although a macro is used for |
---|
16 | speed. |
---|
17 | static inline void s2p(BytePack s0, BytePack s1, BytePack s2, BytePack s3, |
---|
18 | BytePack s5, BytePack s6, BytePack s7, BytePack s8, |
---|
19 | BitBlock& p0, BitBlock& p1, BitBlock& p2, BitBlock& p3, |
---|
20 | BitBlock& p4, BitBlock& p5, BitBlock& p6, BitBlock& p7); |
---|
21 | */ |
---|
22 | |
---|
23 | /* 1. ALGORITHM Selection. |
---|
24 | Choice of 3 algorithms: s2p_ideal, s2p_movemask, s2p_bytepack |
---|
25 | Default is s2p_bytepack. |
---|
26 | Compiling with -DUSE_S2P_IDEAL or -DUSE_S2P_MOVEMASK to override. |
---|
27 | */ |
---|
28 | |
---|
29 | #ifdef USE_S2P_IDEAL |
---|
30 | #define S2P_ALGORITHM s2p_ideal |
---|
31 | #endif |
---|
32 | |
---|
33 | #ifdef USE_S2P_MOVEMASK |
---|
34 | #define S2P_ALGORITHM s2p_movemask |
---|
35 | #endif |
---|
36 | |
---|
37 | #ifndef S2P_ALGORITHM |
---|
38 | #define S2P_ALGORITHM s2p_bytepack |
---|
39 | #endif |
---|
40 | |
---|
41 | |
---|
42 | |
---|
43 | #if (BYTE_ORDER == BIG_ENDIAN) |
---|
44 | #define s2p(s0, s1, s2, s3, s4, s5, s6, s7, p0, p1, p2, p3, p4, p5, p6, p7)\ |
---|
45 | S2P_ALGORITHM(s0, s1, s2, s3, s4, s5, s6, s7, p0, p1, p2, p3, p4, p5, p6, p7) |
---|
46 | #endif |
---|
47 | #if (BYTE_ORDER == LITTLE_ENDIAN) |
---|
48 | #define s2p(s0, s1, s2, s3, s4, s5, s6, s7, p0, p1, p2, p3, p4, p5, p6, p7)\ |
---|
49 | S2P_ALGORITHM(s7, s6, s5, s4, s3, s2, s1, s0, p0, p1, p2, p3, p4, p5, p6, p7) |
---|
50 | #endif |
---|
51 | |
---|
52 | |
---|
53 | /* s2p_ideal is an ideal serial to parallel transposition |
---|
54 | algorithm given an architecture with native support for |
---|
55 | simd_pack_{8,4,2}_{hh,ll} operations, achieving transposition |
---|
56 | of 8 serial bytepacks into 8 parallel bitblocks in only 24 pack |
---|
57 | operations. |
---|
58 | */ |
---|
59 | |
---|
60 | #define s2p_ideal(s0, s1, s2, s3, s4, s5, s6, s7, p0, p1, p2, p3, p4, p5, p6, p7) \ |
---|
61 | do {\ |
---|
62 | BitBlock bit0123_0, bit0123_1, bit0123_2, bit0123_3,\ |
---|
63 | bit4567_0, bit4567_1, bit4567_2, bit4567_3;\ |
---|
64 | BitBlock bit01_0, bit01_1, bit23_0, bit23_1, bit45_0, bit45_1, bit67_0, bit67_1;\ |
---|
65 | bit0123_0 = simd_pack_8_hh(s0, s1);\ |
---|
66 | bit0123_1 = simd_pack_8_hh(s2, s3);\ |
---|
67 | bit0123_2 = simd_pack_8_hh(s4, s5);\ |
---|
68 | bit0123_3 = simd_pack_8_hh(s6, s7);\ |
---|
69 | bit4567_0 = simd_pack_8_ll(s0, s1);\ |
---|
70 | bit4567_1 = simd_pack_8_ll(s2, s3);\ |
---|
71 | bit4567_2 = simd_pack_8_ll(s4, s5);\ |
---|
72 | bit4567_3 = simd_pack_8_ll(s6, s7);\ |
---|
73 | bit01_0 = simd_pack_4_hh(bit0123_0, bit0123_1);\ |
---|
74 | bit01_1 = simd_pack_4_hh(bit0123_2, bit0123_3);\ |
---|
75 | bit23_0 = simd_pack_4_ll(bit0123_0, bit0123_1);\ |
---|
76 | bit23_1 = simd_pack_4_ll(bit0123_2, bit0123_3);\ |
---|
77 | bit45_0 = simd_pack_4_hh(bit4567_0, bit4567_1);\ |
---|
78 | bit45_1 = simd_pack_4_hh(bit4567_2, bit4567_3);\ |
---|
79 | bit67_0 = simd_pack_4_ll(bit4567_0, bit4567_1);\ |
---|
80 | bit67_1 = simd_pack_4_ll(bit4567_2, bit4567_3);\ |
---|
81 | p0 = simd_pack_2_hh(bit01_0, bit01_1);\ |
---|
82 | p1 = simd_pack_2_ll(bit01_0, bit01_1);\ |
---|
83 | p2 = simd_pack_2_hh(bit23_0, bit23_1);\ |
---|
84 | p3 = simd_pack_2_ll(bit23_0, bit23_1);\ |
---|
85 | p4 = simd_pack_2_hh(bit45_0, bit45_1);\ |
---|
86 | p5 = simd_pack_2_ll(bit45_0, bit45_1);\ |
---|
87 | p6 = simd_pack_2_hh(bit67_0, bit67_1);\ |
---|
88 | p7 = simd_pack_2_ll(bit67_0, bit67_1);\ |
---|
89 | } while(0) |
---|
90 | |
---|
91 | |
---|
92 | /* s2p_bytepack is a fast serial to parallel transposition |
---|
93 | algorithm given an architecture with simd_pack_16 operations, |
---|
94 | but not at small field widths. |
---|
95 | MMX, SSE, Altivec ... |
---|
96 | */ |
---|
97 | |
---|
98 | #define s2p_step(s0, s1, hi_mask, shift, p0, p1) \ |
---|
99 | do {\ |
---|
100 | BitBlock t0,t1;\ |
---|
101 | t0 = simd_pack_16_hh(s0, s1);\ |
---|
102 | t1 = simd_pack_16_ll(s0, s1);\ |
---|
103 | p0 = simd_if(hi_mask, t0, simd_srli_16(t1, shift));\ |
---|
104 | p1 = simd_if(hi_mask, simd_slli_16(t0, shift), t1);\ |
---|
105 | } while(0) |
---|
106 | |
---|
107 | #define s2p_bytepack(s0, s1, s2, s3, s4, s5, s6, s7, p0, p1, p2, p3, p4, p5, p6, p7) \ |
---|
108 | do {\ |
---|
109 | BitBlock bit00224466_0,bit00224466_1,bit00224466_2,bit00224466_3;\ |
---|
110 | BitBlock bit11335577_0,bit11335577_1,bit11335577_2,bit11335577_3;\ |
---|
111 | BitBlock bit00004444_0,bit22226666_0,bit00004444_1,bit22226666_1;\ |
---|
112 | BitBlock bit11115555_0,bit33337777_0,bit11115555_1,bit33337777_1;\ |
---|
113 | s2p_step(s0,s1,simd_himask_2,1,bit00224466_0,bit11335577_0);\ |
---|
114 | s2p_step(s2,s3,simd_himask_2,1,bit00224466_1,bit11335577_1);\ |
---|
115 | s2p_step(s4,s5,simd_himask_2,1,bit00224466_2,bit11335577_2);\ |
---|
116 | s2p_step(s6,s7,simd_himask_2,1,bit00224466_3,bit11335577_3);\ |
---|
117 | s2p_step(bit00224466_0,bit00224466_1,simd_himask_4,2,bit00004444_0,bit22226666_0);\ |
---|
118 | s2p_step(bit00224466_2,bit00224466_3,simd_himask_4,2,bit00004444_1,bit22226666_1);\ |
---|
119 | s2p_step(bit11335577_0,bit11335577_1,simd_himask_4,2,bit11115555_0,bit33337777_0);\ |
---|
120 | s2p_step(bit11335577_2,bit11335577_3,simd_himask_4,2,bit11115555_1,bit33337777_1);\ |
---|
121 | s2p_step(bit00004444_0,bit00004444_1,simd_himask_8,4,p0,p4);\ |
---|
122 | s2p_step(bit11115555_0,bit11115555_1,simd_himask_8,4,p1,p5);\ |
---|
123 | s2p_step(bit22226666_0,bit22226666_1,simd_himask_8,4,p2,p6);\ |
---|
124 | s2p_step(bit33337777_0,bit33337777_1,simd_himask_8,4,p3,p7);\ |
---|
125 | } while(0) |
---|
126 | |
---|
127 | /* For sizeof(SIMD_type) = 16 */ |
---|
128 | typedef uint16_t BitPack; |
---|
129 | |
---|
130 | #if (BYTE_ORDER == BIG_ENDIAN) |
---|
131 | #define movemask_step(s0, s1, s2, s3, s4, s5, s6, s7, p) \ |
---|
132 | do { \ |
---|
133 | union { BitPack bit_pack[8];\ |
---|
134 | SIMD_type bit_block;\ |
---|
135 | } b;\ |
---|
136 | b.bit_pack[0] = simd_movemask_8(s0);\ |
---|
137 | b.bit_pack[1] = simd_movemask_8(s1);\ |
---|
138 | b.bit_pack[2] = simd_movemask_8(s2);\ |
---|
139 | b.bit_pack[3] = simd_movemask_8(s3);\ |
---|
140 | b.bit_pack[4] = simd_movemask_8(s4);\ |
---|
141 | b.bit_pack[5] = simd_movemask_8(s5);\ |
---|
142 | b.bit_pack[6] = simd_movemask_8(s6);\ |
---|
143 | b.bit_pack[7] = simd_movemask_8(s7);\ |
---|
144 | p = b.bit_block;\ |
---|
145 | } while (0) |
---|
146 | #endif |
---|
147 | #if (BYTE_ORDER == LITTLE_ENDIAN) |
---|
148 | #define movemask_step(s7, s6, s5, s4, s3, s2, s1, s0, p) \ |
---|
149 | do { \ |
---|
150 | union { BitPack bit_pack[8];\ |
---|
151 | SIMD_type bit_block;\ |
---|
152 | } b;\ |
---|
153 | b.bit_pack[0] = simd_movemask_8(s0);\ |
---|
154 | b.bit_pack[1] = simd_movemask_8(s1);\ |
---|
155 | b.bit_pack[2] = simd_movemask_8(s2);\ |
---|
156 | b.bit_pack[3] = simd_movemask_8(s3);\ |
---|
157 | b.bit_pack[4] = simd_movemask_8(s4);\ |
---|
158 | b.bit_pack[5] = simd_movemask_8(s5);\ |
---|
159 | b.bit_pack[6] = simd_movemask_8(s6);\ |
---|
160 | b.bit_pack[7] = simd_movemask_8(s7);\ |
---|
161 | p = b.bit_block;\ |
---|
162 | } while (0) |
---|
163 | #endif |
---|
164 | |
---|
165 | |
---|
166 | #define bitshift_step(s0, s1, s2, s3, s4, s5, s6, s7, t0, t1, t2, t3, t4, t5, t6, t7) \ |
---|
167 | do { \ |
---|
168 | t0 = simd_add_8(s0, s0);\ |
---|
169 | t1 = simd_add_8(s1, s1);\ |
---|
170 | t2 = simd_add_8(s2, s2);\ |
---|
171 | t3 = simd_add_8(s3, s3);\ |
---|
172 | t4 = simd_add_8(s4, s4);\ |
---|
173 | t5 = simd_add_8(s5, s5);\ |
---|
174 | t6 = simd_add_8(s6, s6);\ |
---|
175 | t7 = simd_add_8(s7, s7);\ |
---|
176 | } while (0) |
---|
177 | |
---|
178 | |
---|
179 | #define s2p_movemask(s0, s1, s2, s3, s4, s5, s6, s7, p0, p1, p2, p3, p4, p5, p6, p7) \ |
---|
180 | do { \ |
---|
181 | BitBlock t0, t1, t2, t3, t4, t5, t6, t7;\ |
---|
182 | movemask_step(s0, s1, s2, s3, s4, s5, s6, s7, p0);\ |
---|
183 | bitshift_step(s0, s1, s2, s3, s4, s5, s6, s7, t0, t1, t2, t3, t4, t5, t6, t7);\ |
---|
184 | movemask_step(t0, t1, t2, t3, t4, t5, t6, t7, p1);\ |
---|
185 | bitshift_step(t0, t1, t2, t3, t4, t5, t6, t7, t0, t1, t2, t3, t4, t5, t6, t7);\ |
---|
186 | movemask_step(t0, t1, t2, t3, t4, t5, t6, t7, p2);\ |
---|
187 | bitshift_step(t0, t1, t2, t3, t4, t5, t6, t7, t0, t1, t2, t3, t4, t5, t6, t7);\ |
---|
188 | movemask_step(t0, t1, t2, t3, t4, t5, t6, t7, p3);\ |
---|
189 | bitshift_step(t0, t1, t2, t3, t4, t5, t6, t7, t0, t1, t2, t3, t4, t5, t6, t7);\ |
---|
190 | movemask_step(t0, t1, t2, t3, t4, t5, t6, t7, p4);\ |
---|
191 | bitshift_step(t0, t1, t2, t3, t4, t5, t6, t7, t0, t1, t2, t3, t4, t5, t6, t7);\ |
---|
192 | movemask_step(t0, t1, t2, t3, t4, t5, t6, t7, p5);\ |
---|
193 | bitshift_step(t0, t1, t2, t3, t4, t5, t6, t7, t0, t1, t2, t3, t4, t5, t6, t7);\ |
---|
194 | movemask_step(t0, t1, t2, t3, t4, t5, t6, t7, p6);\ |
---|
195 | bitshift_step(t0, t1, t2, t3, t4, t5, t6, t7, t0, t1, t2, t3, t4, t5, t6, t7);\ |
---|
196 | movemask_step(t0, t1, t2, t3, t4, t5, t6, t7, p7);\ |
---|
197 | } while (0) |
---|
198 | |
---|
199 | |
---|
200 | #endif |
---|
201 | |
---|