source: trunk/lib/s2p.h @ 890

Last change on this file since 890 was 712, checked in by cameron, 9 years ago

Avoid clobbering bytestream with s2p_movemask

File size: 7.3 KB
Line 
1/*  s2p - Serial to Parallel Bit Stream Transposition
2    Copyright (c) 2007, 2008, 2010, Robert D. Cameron.
3    Licensed to the public under the Open Software License 3.0.
4*/
5
6#ifndef S2P_H
7#define S2P_H
8
9#define BytePack SIMD_type
10#define BitBlock SIMD_type
11
12/* Given a block of bytes in 8 consecutive registers s0, s1, ..., s7,
13   s2p transposes the block into 8 parallel bitstream blocks p0, p1, ..., p7.
14
15   The following header shows the intent, although a macro is used for
16   speed.
17static inline void s2p(BytePack s0, BytePack s1, BytePack s2, BytePack s3,
18                       BytePack s5, BytePack s6, BytePack s7, BytePack s8,
19                       BitBlock& p0, BitBlock& p1, BitBlock& p2, BitBlock& p3,
20                       BitBlock& p4, BitBlock& p5, BitBlock& p6, BitBlock& p7);
21*/
22
23/*  1.  ALGORITHM Selection. 
24        Choice of 3 algorithms: s2p_ideal, s2p_movemask, s2p_bytepack
25        Default is s2p_bytepack.
26        Compiling with -DUSE_S2P_IDEAL or -DUSE_S2P_MOVEMASK to override.
27*/
28
29#ifdef USE_S2P_IDEAL
30#define S2P_ALGORITHM s2p_ideal
31#endif
32
33#ifdef USE_S2P_MOVEMASK
34#define S2P_ALGORITHM s2p_movemask
35#endif
36
37#ifndef S2P_ALGORITHM
38#define S2P_ALGORITHM s2p_bytepack
39#endif
40
41
42
43#if (BYTE_ORDER == BIG_ENDIAN)
44#define s2p(s0, s1, s2, s3, s4, s5, s6, s7, p0, p1, p2, p3, p4, p5, p6, p7)\
45  S2P_ALGORITHM(s0, s1, s2, s3, s4, s5, s6, s7, p0, p1, p2, p3, p4, p5, p6, p7)
46#endif
47#if (BYTE_ORDER == LITTLE_ENDIAN)
48#define s2p(s0, s1, s2, s3, s4, s5, s6, s7, p0, p1, p2, p3, p4, p5, p6, p7)\
49  S2P_ALGORITHM(s7, s6, s5, s4, s3, s2, s1, s0, p0, p1, p2, p3, p4, p5, p6, p7)
50#endif
51
52
53/*  s2p_ideal is an ideal serial to parallel transposition
54    algorithm given an architecture with native support for
55    simd_pack_{8,4,2}_{hh,ll} operations, achieving transposition
56    of 8 serial bytepacks into 8 parallel bitblocks in only 24 pack
57    operations.
58*/
59
60#define s2p_ideal(s0, s1, s2, s3, s4, s5, s6, s7, p0, p1, p2, p3, p4, p5, p6, p7) \
61  do {\
62        BitBlock bit0123_0, bit0123_1, bit0123_2, bit0123_3,\
63        bit4567_0, bit4567_1, bit4567_2, bit4567_3;\
64        BitBlock bit01_0, bit01_1, bit23_0, bit23_1, bit45_0, bit45_1, bit67_0, bit67_1;\
65        bit0123_0 = simd_pack_8_hh(s0, s1);\
66        bit0123_1 = simd_pack_8_hh(s2, s3);\
67        bit0123_2 = simd_pack_8_hh(s4, s5);\
68        bit0123_3 = simd_pack_8_hh(s6, s7);\
69        bit4567_0 = simd_pack_8_ll(s0, s1);\
70        bit4567_1 = simd_pack_8_ll(s2, s3);\
71        bit4567_2 = simd_pack_8_ll(s4, s5);\
72        bit4567_3 = simd_pack_8_ll(s6, s7);\
73        bit01_0 = simd_pack_4_hh(bit0123_0, bit0123_1);\
74        bit01_1 = simd_pack_4_hh(bit0123_2, bit0123_3);\
75        bit23_0 = simd_pack_4_ll(bit0123_0, bit0123_1);\
76        bit23_1 = simd_pack_4_ll(bit0123_2, bit0123_3);\
77        bit45_0 = simd_pack_4_hh(bit4567_0, bit4567_1);\
78        bit45_1 = simd_pack_4_hh(bit4567_2, bit4567_3);\
79        bit67_0 = simd_pack_4_ll(bit4567_0, bit4567_1);\
80        bit67_1 = simd_pack_4_ll(bit4567_2, bit4567_3);\
81        p0 = simd_pack_2_hh(bit01_0, bit01_1);\
82        p1 = simd_pack_2_ll(bit01_0, bit01_1);\
83        p2 = simd_pack_2_hh(bit23_0, bit23_1);\
84        p3 = simd_pack_2_ll(bit23_0, bit23_1);\
85        p4 = simd_pack_2_hh(bit45_0, bit45_1);\
86        p5 = simd_pack_2_ll(bit45_0, bit45_1);\
87        p6 = simd_pack_2_hh(bit67_0, bit67_1);\
88        p7 = simd_pack_2_ll(bit67_0, bit67_1);\
89  } while(0)
90
91
92/*  s2p_bytepack is a fast serial to parallel transposition
93    algorithm given an architecture with simd_pack_16 operations,
94    but not at small field widths.
95    MMX, SSE, Altivec ...
96*/
97
98#define s2p_step(s0, s1, hi_mask, shift, p0, p1)  \
99  do {\
100        BitBlock t0,t1;\
101        t0 = simd_pack_16_hh(s0, s1);\
102        t1 = simd_pack_16_ll(s0, s1);\
103        p0 = simd_if(hi_mask, t0, simd_srli_16(t1, shift));\
104        p1 = simd_if(hi_mask, simd_slli_16(t0, shift), t1);\
105  } while(0)
106
107#define s2p_bytepack(s0, s1, s2, s3, s4, s5, s6, s7, p0, p1, p2, p3, p4, p5, p6, p7) \
108  do {\
109        BitBlock bit00224466_0,bit00224466_1,bit00224466_2,bit00224466_3;\
110        BitBlock bit11335577_0,bit11335577_1,bit11335577_2,bit11335577_3;\
111        BitBlock bit00004444_0,bit22226666_0,bit00004444_1,bit22226666_1;\
112        BitBlock bit11115555_0,bit33337777_0,bit11115555_1,bit33337777_1;\
113        s2p_step(s0,s1,simd_himask_2,1,bit00224466_0,bit11335577_0);\
114        s2p_step(s2,s3,simd_himask_2,1,bit00224466_1,bit11335577_1);\
115        s2p_step(s4,s5,simd_himask_2,1,bit00224466_2,bit11335577_2);\
116        s2p_step(s6,s7,simd_himask_2,1,bit00224466_3,bit11335577_3);\
117        s2p_step(bit00224466_0,bit00224466_1,simd_himask_4,2,bit00004444_0,bit22226666_0);\
118        s2p_step(bit00224466_2,bit00224466_3,simd_himask_4,2,bit00004444_1,bit22226666_1);\
119        s2p_step(bit11335577_0,bit11335577_1,simd_himask_4,2,bit11115555_0,bit33337777_0);\
120        s2p_step(bit11335577_2,bit11335577_3,simd_himask_4,2,bit11115555_1,bit33337777_1);\
121        s2p_step(bit00004444_0,bit00004444_1,simd_himask_8,4,p0,p4);\
122        s2p_step(bit11115555_0,bit11115555_1,simd_himask_8,4,p1,p5);\
123        s2p_step(bit22226666_0,bit22226666_1,simd_himask_8,4,p2,p6);\
124        s2p_step(bit33337777_0,bit33337777_1,simd_himask_8,4,p3,p7);\
125  } while(0)
126
127/* For sizeof(SIMD_type) = 16 */ 
128typedef uint16_t BitPack;
129
130#if (BYTE_ORDER == BIG_ENDIAN)
131#define movemask_step(s0, s1, s2, s3, s4, s5, s6, s7, p) \
132  do { \
133        union { BitPack bit_pack[8];\
134                SIMD_type bit_block;\
135              } b;\
136        b.bit_pack[0] = simd_movemask_8(s0);\
137        b.bit_pack[1] = simd_movemask_8(s1);\
138        b.bit_pack[2] = simd_movemask_8(s2);\
139        b.bit_pack[3] = simd_movemask_8(s3);\
140        b.bit_pack[4] = simd_movemask_8(s4);\
141        b.bit_pack[5] = simd_movemask_8(s5);\
142        b.bit_pack[6] = simd_movemask_8(s6);\
143        b.bit_pack[7] = simd_movemask_8(s7);\
144        p = b.bit_block;\
145   } while (0)
146#endif
147#if (BYTE_ORDER == LITTLE_ENDIAN)
148#define movemask_step(s7, s6, s5, s4, s3, s2, s1, s0, p) \
149  do { \
150        union { BitPack bit_pack[8];\
151                SIMD_type bit_block;\
152              } b;\
153        b.bit_pack[0] = simd_movemask_8(s0);\
154        b.bit_pack[1] = simd_movemask_8(s1);\
155        b.bit_pack[2] = simd_movemask_8(s2);\
156        b.bit_pack[3] = simd_movemask_8(s3);\
157        b.bit_pack[4] = simd_movemask_8(s4);\
158        b.bit_pack[5] = simd_movemask_8(s5);\
159        b.bit_pack[6] = simd_movemask_8(s6);\
160        b.bit_pack[7] = simd_movemask_8(s7);\
161        p = b.bit_block;\
162   } while (0)
163#endif
164
165
166#define bitshift_step(s0, s1, s2, s3, s4, s5, s6, s7, t0, t1, t2, t3, t4, t5, t6, t7) \
167  do { \
168        t0 = simd_add_8(s0, s0);\
169        t1 = simd_add_8(s1, s1);\
170        t2 = simd_add_8(s2, s2);\
171        t3 = simd_add_8(s3, s3);\
172        t4 = simd_add_8(s4, s4);\
173        t5 = simd_add_8(s5, s5);\
174        t6 = simd_add_8(s6, s6);\
175        t7 = simd_add_8(s7, s7);\
176  } while (0)
177
178
179#define s2p_movemask(s0, s1, s2, s3, s4, s5, s6, s7, p0, p1, p2, p3, p4, p5, p6, p7) \
180  do { \
181        BitBlock t0, t1, t2, t3, t4, t5, t6, t7;\
182        movemask_step(s0, s1, s2, s3, s4, s5, s6, s7, p0);\
183        bitshift_step(s0, s1, s2, s3, s4, s5, s6, s7, t0, t1, t2, t3, t4, t5, t6, t7);\
184        movemask_step(t0, t1, t2, t3, t4, t5, t6, t7, p1);\
185        bitshift_step(t0, t1, t2, t3, t4, t5, t6, t7, t0, t1, t2, t3, t4, t5, t6, t7);\
186        movemask_step(t0, t1, t2, t3, t4, t5, t6, t7, p2);\
187        bitshift_step(t0, t1, t2, t3, t4, t5, t6, t7, t0, t1, t2, t3, t4, t5, t6, t7);\
188        movemask_step(t0, t1, t2, t3, t4, t5, t6, t7, p3);\
189        bitshift_step(t0, t1, t2, t3, t4, t5, t6, t7, t0, t1, t2, t3, t4, t5, t6, t7);\
190        movemask_step(t0, t1, t2, t3, t4, t5, t6, t7, p4);\
191        bitshift_step(t0, t1, t2, t3, t4, t5, t6, t7, t0, t1, t2, t3, t4, t5, t6, t7);\
192        movemask_step(t0, t1, t2, t3, t4, t5, t6, t7, p5);\
193        bitshift_step(t0, t1, t2, t3, t4, t5, t6, t7, t0, t1, t2, t3, t4, t5, t6, t7);\
194        movemask_step(t0, t1, t2, t3, t4, t5, t6, t7, p6);\
195        bitshift_step(t0, t1, t2, t3, t4, t5, t6, t7, t0, t1, t2, t3, t4, t5, t6, t7);\
196        movemask_step(t0, t1, t2, t3, t4, t5, t6, t7, p7);\
197  } while (0)
198
199
200#endif
201
Note: See TracBrowser for help on using the repository browser.