source: trunk/lib/s2p.hpp @ 1551

Last change on this file since 1551 was 1551, checked in by cameron, 8 years ago

s2p.hpp initial check-in.

File size: 8.9 KB
Line 
1/*  s2p - Serial to Parallel Bit Stream Transposition
2    Copyright (c) 2007, 2008, 2010, 2011  Robert D. Cameron.
3    Licensed to the public under the Open Software License 3.0.
4*/
5
6#ifndef S2P_HPP
7#define S2P_HPP
8
9#include "idisa.hpp"
10
11#define BytePack BitBlock
12
/* Given a block of bytes in 8 consecutive registers s0, s1, ..., s7,
   s2p transposes the block into 8 parallel bitstream blocks p0, p1, ..., p7.

   The following header shows the intent, although a macro is used for
   speed.
static inline void s2p(BytePack s0, BytePack s1, BytePack s2, BytePack s3,
                       BytePack s4, BytePack s5, BytePack s6, BytePack s7,
                       BitBlock& p0, BitBlock& p1, BitBlock& p2, BitBlock& p3,
                       BitBlock& p4, BitBlock& p5, BitBlock& p6, BitBlock& p7);
*/
23
/*  1.  ALGORITHM selection.
        Three transposition algorithms are available: s2p_ideal,
        s2p_movemask and s2p_bytepack (the default).
        Compile with -DUSE_S2P_IDEAL or -DUSE_S2P_MOVEMASK to override the
        default.  When neither flag is given, an existing definition of
        S2P_ALGORITHM is respected.

        The choice is made in one #if/#elif chain so that this header
        defines S2P_ALGORITHM at most once.  If both flags are supplied,
        USE_S2P_MOVEMASK takes precedence.
*/

#if defined(USE_S2P_MOVEMASK)
#define S2P_ALGORITHM s2p_movemask
#elif defined(USE_S2P_IDEAL)
#define S2P_ALGORITHM s2p_ideal
#elif !defined(S2P_ALGORITHM)
#define S2P_ALGORITHM s2p_bytepack
#endif
41
42
43
/*  s2p: public entry point; forwards to the selected S2P_ALGORITHM.
    On little-endian targets the eight input registers are handed over in
    reverse order (s7 first, s0 last); on big-endian targets they are passed
    through unchanged.  The outputs p0..p7 keep their order in both cases.

    A single #if/#elif/#else chain guarantees exactly one definition of s2p.
    LITTLE_ENDIAN is tested first: if the byte-order macros are not defined
    at all, every identifier evaluates to 0 in #if and both comparisons
    would hold, so the little-endian form is the one chosen.  Any other
    byte order is rejected with #error instead of leaving s2p undefined.
*/
#if (BYTE_ORDER == LITTLE_ENDIAN)
#define s2p(s0, s1, s2, s3, s4, s5, s6, s7, p0, p1, p2, p3, p4, p5, p6, p7)\
  S2P_ALGORITHM(s7, s6, s5, s4, s3, s2, s1, s0, p0, p1, p2, p3, p4, p5, p6, p7)
#elif (BYTE_ORDER == BIG_ENDIAN)
#define s2p(s0, s1, s2, s3, s4, s5, s6, s7, p0, p1, p2, p3, p4, p5, p6, p7)\
  S2P_ALGORITHM(s0, s1, s2, s3, s4, s5, s6, s7, p0, p1, p2, p3, p4, p5, p6, p7)
#else
#error "s2p.hpp: BYTE_ORDER must be either BIG_ENDIAN or LITTLE_ENDIAN"
#endif
52
53
/*  s2p_ideal is the ideal serial to parallel transposition for an
    architecture with native simd_pack_{8,4,2}_{hh,ll} operations: the
    8 serial bytepacks s0..s7 become the 8 parallel bitblocks p0..p7 in
    only 24 pack operations, organised as three stages of 8 packs each.

    All inputs are read in the first stage and the outputs are written
    only in the last one.
*/

#define s2p_ideal(s0, s1, s2, s3, s4, s5, s6, s7, p0, p1, p2, p3, p4, p5, p6, p7) \
  do {\
        /* Stage 1: 8-bit fields -> high (bits 0-3) and low (bits 4-7) halves. */\
        BitBlock nyb_hi_0 = hsimd<8>::packh(s0, s1);\
        BitBlock nyb_lo_0 = hsimd<8>::packl(s0, s1);\
        BitBlock nyb_hi_1 = hsimd<8>::packh(s2, s3);\
        BitBlock nyb_lo_1 = hsimd<8>::packl(s2, s3);\
        BitBlock nyb_hi_2 = hsimd<8>::packh(s4, s5);\
        BitBlock nyb_lo_2 = hsimd<8>::packl(s4, s5);\
        BitBlock nyb_hi_3 = hsimd<8>::packh(s6, s7);\
        BitBlock nyb_lo_3 = hsimd<8>::packl(s6, s7);\
        /* Stage 2: 4-bit fields -> bit pairs 01, 23, 45 and 67. */\
        BitBlock pair01_0 = hsimd<4>::packh(nyb_hi_0, nyb_hi_1);\
        BitBlock pair23_0 = hsimd<4>::packl(nyb_hi_0, nyb_hi_1);\
        BitBlock pair01_1 = hsimd<4>::packh(nyb_hi_2, nyb_hi_3);\
        BitBlock pair23_1 = hsimd<4>::packl(nyb_hi_2, nyb_hi_3);\
        BitBlock pair45_0 = hsimd<4>::packh(nyb_lo_0, nyb_lo_1);\
        BitBlock pair67_0 = hsimd<4>::packl(nyb_lo_0, nyb_lo_1);\
        BitBlock pair45_1 = hsimd<4>::packh(nyb_lo_2, nyb_lo_3);\
        BitBlock pair67_1 = hsimd<4>::packl(nyb_lo_2, nyb_lo_3);\
        /* Stage 3: 2-bit fields -> the eight individual bit streams. */\
        p0 = hsimd<2>::packh(pair01_0, pair01_1);\
        p1 = hsimd<2>::packl(pair01_0, pair01_1);\
        p2 = hsimd<2>::packh(pair23_0, pair23_1);\
        p3 = hsimd<2>::packl(pair23_0, pair23_1);\
        p4 = hsimd<2>::packh(pair45_0, pair45_1);\
        p5 = hsimd<2>::packl(pair45_0, pair45_1);\
        p6 = hsimd<2>::packh(pair67_0, pair67_1);\
        p7 = hsimd<2>::packl(pair67_0, pair67_1);\
  } while(0)
91
92
/*  s2p_bytepack is a fast serial to parallel transposition
    algorithm given an architecture with simd_pack_16 operations,
    but not at small field widths.
    MMX, SSE, Altivec ...

    s2p_step is its building block.  It packs the high and the low halves
    of the 16-bit fields of s0 and s1, then merges the two packed results
    under hi_mask:
      p0 takes the high pack, and the low pack shifted right by `shift`;
      p1 takes the high pack shifted left by `shift`, and the low pack.
    `shift` is used as a template argument and must be a compile-time
    constant.  This generic form is skipped when USE_S2P_AVX is defined.
*/


#ifndef USE_S2P_AVX
#define s2p_step(s0, s1, hi_mask, shift, p0, p1)  \
  do {\
        BitBlock packed_hi = hsimd<16>::packh(s0, s1);\
        BitBlock packed_lo = hsimd<16>::packl(s0, s1);\
        p0 = simd<1>::ifh(hi_mask, packed_hi, simd<16>::srli<shift>(packed_lo));\
        p1 = simd<1>::ifh(hi_mask, simd<16>::slli<shift>(packed_hi), packed_lo);\
  } while(0)
#endif
110
111
/* For AVX, we use a modified s2p_step function to avoid a number
   of conversions from 128-bit mode to 256-bit mode just to
   immediately convert back.  Each 256-bit input is split into its two
   128-bit halves, the packing and shifting are done with SSE
   intrinsics, and only the final operands are recombined to 256 bits. */
#ifdef USE_S2P_AVX
/* b1 AND NOT b2 (note the swapped operand order of the intrinsic). */
#define sse_andc(b1, b2) _mm_andnot_si128(b2, b1)
/* Mask with the high byte of every 16-bit field set. */
#define sse_himask_16 _mm_set1_epi32(0xFF00FF00)
#define sse_slli_16(r, shft) _mm_slli_epi16(r, shft)
#define sse_srli_16(r, shft) _mm_srli_epi16(r, shft)
/* The operands are handed to the intrinsic in reverse order. */
#define sse_packus_16(a, b) _mm_packus_epi16(b, a)
/* Clear the high byte of each 16-bit field, then pack (b first, then a). */
#define sse_pack_16(a, b) \
  _mm_packus_epi16(sse_andc(b, sse_himask_16), sse_andc(a, sse_himask_16))
/* Pack the low bytes / the high bytes of the 16-bit fields of v1 and v2. */
#define sse_pack_16_ll(v1, v2) sse_pack_16(v1, v2)
#define sse_pack_16_hh(v1, v2) sse_packus_16(sse_srli_16(v1, 8), sse_srli_16(v2, 8))

#define s2p_step(s0, s1, hi_mask, shift, p0, p1)  \
  do {\
        /* 128-bit halves of the two inputs. */\
        __m128i in0_hi = simd_hi128(s0);\
        __m128i in0_lo = simd_lo128(s0);\
        __m128i in1_hi = simd_hi128(s1);\
        __m128i in1_lo = simd_lo128(s1);\
        /* High-byte and low-byte packs of each input. */\
        __m128i hi_from0 = sse_pack_16_hh(in0_hi, in0_lo);\
        __m128i lo_from0 = sse_pack_16_ll(in0_hi, in0_lo);\
        __m128i hi_from1 = sse_pack_16_hh(in1_hi, in1_lo);\
        __m128i lo_from1 = sse_pack_16_ll(in1_hi, in1_lo);\
        p0 = simd<1>::ifh(hi_mask,\
                          simd_combine256(hi_from0, hi_from1),\
                          simd_combine256(sse_srli_16(lo_from0, shift), sse_srli_16(lo_from1, shift)));\
        p1 = simd<1>::ifh(hi_mask,\
                          simd_combine256(sse_slli_16(hi_from0, shift), sse_slli_16(hi_from1, shift)),\
                          simd_combine256(lo_from0, lo_from1));\
  } while(0)
#endif
146
147
148
/*  s2p_bytepack: transposition of s0..s7 into p0..p7 built from twelve
    s2p_step operations in three stages (shift 1 with the simd<2> high
    mask, shift 2 with the simd<4> high mask, shift 4 with the simd<8>
    high mask).  The digits in a temporary's name list the bit positions
    it carries, e.g. b0246_k holds the even bits and b04_k bits 0 and 4.
    The inputs are read in stage 1 only; the outputs are written in
    stage 3 only.
*/
#define s2p_bytepack(s0, s1, s2, s3, s4, s5, s6, s7, p0, p1, p2, p3, p4, p5, p6, p7) \
  do {\
        /* Stage 1: separate even bits from odd bits. */\
        BitBlock b0246_0, b0246_1, b0246_2, b0246_3;\
        BitBlock b1357_0, b1357_1, b1357_2, b1357_3;\
        s2p_step(s0, s1, simd<2>::himask(), 1, b0246_0, b1357_0);\
        s2p_step(s2, s3, simd<2>::himask(), 1, b0246_1, b1357_1);\
        s2p_step(s4, s5, simd<2>::himask(), 1, b0246_2, b1357_2);\
        s2p_step(s6, s7, simd<2>::himask(), 1, b0246_3, b1357_3);\
        /* Stage 2: split each of those into two bit-position pairs. */\
        BitBlock b04_0, b04_1, b26_0, b26_1;\
        BitBlock b15_0, b15_1, b37_0, b37_1;\
        s2p_step(b0246_0, b0246_1, simd<4>::himask(), 2, b04_0, b26_0);\
        s2p_step(b0246_2, b0246_3, simd<4>::himask(), 2, b04_1, b26_1);\
        s2p_step(b1357_0, b1357_1, simd<4>::himask(), 2, b15_0, b37_0);\
        s2p_step(b1357_2, b1357_3, simd<4>::himask(), 2, b15_1, b37_1);\
        /* Stage 3: split each pair into the final single-bit streams. */\
        s2p_step(b04_0, b04_1, simd<8>::himask(), 4, p0, p4);\
        s2p_step(b15_0, b15_1, simd<8>::himask(), 4, p1, p5);\
        s2p_step(b26_0, b26_1, simd<8>::himask(), 4, p2, p6);\
        s2p_step(b37_0, b37_1, simd<8>::himask(), 4, p3, p7);\
  } while(0)
168
169
170
171
172
/* For sizeof(BitBlock) = 16: eight 16-bit sign masks fill one block. */
typedef uint16_t BitPack;

/*  movemask_store(m0, ..., m7, p)
    Takes the hsimd<8>::signmask of each of the eight registers, stores the
    mask of m<i> in 16-bit slot i of a block-sized union, and assigns the
    assembled block to p.  Every register is read before p is written.
    The union member is declared as BitBlock, the type used for blocks
    throughout this header.
*/
#define movemask_store(m0, m1, m2, m3, m4, m5, m6, m7, p) \
  do { \
        union { BitPack bit_pack[8];\
                BitBlock bit_block;\
              } b;\
        b.bit_pack[0] = hsimd<8>::signmask(m0);\
        b.bit_pack[1] = hsimd<8>::signmask(m1);\
        b.bit_pack[2] = hsimd<8>::signmask(m2);\
        b.bit_pack[3] = hsimd<8>::signmask(m3);\
        b.bit_pack[4] = hsimd<8>::signmask(m4);\
        b.bit_pack[5] = hsimd<8>::signmask(m5);\
        b.bit_pack[6] = hsimd<8>::signmask(m6);\
        b.bit_pack[7] = hsimd<8>::signmask(m7);\
        p = b.bit_block;\
   } while (0)

/*  movemask_step(r0, ..., r7, p)
    Big-endian: the k-th argument fills slot k.
    Little-endian: the k-th argument fills slot 7-k (the parameter list is
    declared in reverse, so the first argument is named s7).
    One #if/#elif/#else chain yields exactly one definition; LITTLE_ENDIAN
    is tested first so that it is the form selected when the byte-order
    macros are undefined and both comparisons would evaluate as 0 == 0.
*/
#if (BYTE_ORDER == LITTLE_ENDIAN)
#define movemask_step(s7, s6, s5, s4, s3, s2, s1, s0, p) \
  movemask_store(s0, s1, s2, s3, s4, s5, s6, s7, p)
#elif (BYTE_ORDER == BIG_ENDIAN)
#define movemask_step(s0, s1, s2, s3, s4, s5, s6, s7, p) \
  movemask_store(s0, s1, s2, s3, s4, s5, s6, s7, p)
#else
#error "s2p.hpp: BYTE_ORDER must be either BIG_ENDIAN or LITTLE_ENDIAN"
#endif
210
211
/*  bitshift_double(src, dst): dst receives simd<8>::add(src, src), i.e.
    src added to itself in 8-bit fields.
*/
#define bitshift_double(src, dst) dst = simd<8>::add(src, src)

/*  bitshift_step(s0..s7, t0..t7): applies bitshift_double to each of the
    eight registers in turn, s<i> -> t<i>, from index 0 to index 7.
    Output t<i> depends only on input s<i>, so the macro may be used with
    the same eight registers as both source and destination.
*/
#define bitshift_step(s0, s1, s2, s3, s4, s5, s6, s7, t0, t1, t2, t3, t4, t5, t6, t7) \
  do { \
        bitshift_double(s0, t0);\
        bitshift_double(s1, t1);\
        bitshift_double(s2, t2);\
        bitshift_double(s3, t3);\
        bitshift_double(s4, t4);\
        bitshift_double(s5, t5);\
        bitshift_double(s6, t6);\
        bitshift_double(s7, t7);\
  } while (0)
223
224
/*  s2p_movemask: transposition of s0..s7 into p0..p7 using sign masks.
    movemask_step collects one bit plane from the eight registers into an
    output block; bitshift_step then doubles every 8-bit field so that the
    following movemask_step collects the next plane.  Eight planes are
    collected in total: p0 from the inputs as given, p1..p7 after one to
    seven doublings.

    The first doubling (s -> t) is performed before p0 is stored, so every
    read of s0..s7 happens before any output is written.  The results are
    the same as storing p0 first, but the macro now also works when an
    output names the same variable as an input, as it would if s0..s7 were
    by-value function parameters.
*/
#define s2p_movemask(s0, s1, s2, s3, s4, s5, s6, s7, p0, p1, p2, p3, p4, p5, p6, p7) \
  do { \
        BitBlock t0, t1, t2, t3, t4, t5, t6, t7;\
        /* Last reads of the inputs: derive t, then take plane 0 from s. */\
        bitshift_step(s0, s1, s2, s3, s4, s5, s6, s7, t0, t1, t2, t3, t4, t5, t6, t7);\
        movemask_step(s0, s1, s2, s3, s4, s5, s6, s7, p0);\
        movemask_step(t0, t1, t2, t3, t4, t5, t6, t7, p1);\
        /* Remaining planes: double t in place, then collect. */\
        bitshift_step(t0, t1, t2, t3, t4, t5, t6, t7, t0, t1, t2, t3, t4, t5, t6, t7);\
        movemask_step(t0, t1, t2, t3, t4, t5, t6, t7, p2);\
        bitshift_step(t0, t1, t2, t3, t4, t5, t6, t7, t0, t1, t2, t3, t4, t5, t6, t7);\
        movemask_step(t0, t1, t2, t3, t4, t5, t6, t7, p3);\
        bitshift_step(t0, t1, t2, t3, t4, t5, t6, t7, t0, t1, t2, t3, t4, t5, t6, t7);\
        movemask_step(t0, t1, t2, t3, t4, t5, t6, t7, p4);\
        bitshift_step(t0, t1, t2, t3, t4, t5, t6, t7, t0, t1, t2, t3, t4, t5, t6, t7);\
        movemask_step(t0, t1, t2, t3, t4, t5, t6, t7, p5);\
        bitshift_step(t0, t1, t2, t3, t4, t5, t6, t7, t0, t1, t2, t3, t4, t5, t6, t7);\
        movemask_step(t0, t1, t2, t3, t4, t5, t6, t7, p6);\
        bitshift_step(t0, t1, t2, t3, t4, t5, t6, t7, t0, t1, t2, t3, t4, t5, t6, t7);\
        movemask_step(t0, t1, t2, t3, t4, t5, t6, t7, p7);\
  } while (0)
244
245
246#endif
247
Note: See TracBrowser for help on using the repository browser.