source: trunk/lib/s2p.hpp @ 1889

Last change on this file since 1889 was 1760, checked in by ksherdy, 8 years ago

Revert s2p, p2s, 128, 256-bit specific versions.

File size: 8.2 KB
Line 
1/*  s2p128 - Serial to Parallel Bit Stream Transposition
2    Copyright (c) 2007, 2008, 2010, 2011  Robert D. Cameron.
3    Licensed to the public under the Open Software License 3.0.
4    Licensed to International Characters Inc.
5       under the Academic Free License version 3.0.
6*/
7
8#ifndef S2P_HPP
9#define S2P_HPP
10
11#include "idisa128.hpp"
12
13#define BytePack BitBlock
14
/* Given a block of bytes in 8 consecutive registers s0, s1, ..., s7,
   s2p transposes the block into 8 parallel bitstream blocks p0, p1, ..., p7.

   The following header shows the intent, although a macro is used for
   speed.
static inline void s2p(BytePack s0, BytePack s1, BytePack s2, BytePack s3,
                       BytePack s4, BytePack s5, BytePack s6, BytePack s7,
                       BitBlock& p0, BitBlock& p1, BitBlock& p2, BitBlock& p3,
                       BitBlock& p4, BitBlock& p5, BitBlock& p6, BitBlock& p7);
*/
25
/*  1.  ALGORITHM Selection.
        Choice of 3 algorithms: s2p_ideal, s2p_movemask, s2p_bytepack.
        Default is s2p_bytepack.
        Compile with -DUSE_S2P_IDEAL or -DUSE_S2P_MOVEMASK to override.

        Exactly one definition of S2P_ALGORITHM is produced:
          - USE_S2P_MOVEMASK takes precedence over USE_S2P_IDEAL when both
            are given (previously this caused a macro redefinition in which
            the later, movemask, definition won);
          - a USE_S2P_* flag replaces any S2P_ALGORITHM supplied on the
            command line;
          - with neither flag, a pre-set S2P_ALGORITHM is kept, otherwise
            s2p_bytepack is used.
*/

#if defined(USE_S2P_MOVEMASK)
#undef S2P_ALGORITHM
#define S2P_ALGORITHM s2p_movemask
#elif defined(USE_S2P_IDEAL)
#undef S2P_ALGORITHM
#define S2P_ALGORITHM s2p_ideal
#elif !defined(S2P_ALGORITHM)
#define S2P_ALGORITHM s2p_bytepack
#endif
43
/* s2p: public entry point.  Expands to whichever algorithm macro
   S2P_ALGORITHM names.  The eight input packs are handed on in REVERSE
   order (s7 first, s0 last); the eight outputs keep their order.
   NOTE(review): the reversal presumably matches the operand convention of
   the pack/signmask primitives used by the algorithms - confirm before
   changing it. */
#define s2p(s0, s1, s2, s3, s4, s5, s6, s7, \
            p0, p1, p2, p3, p4, p5, p6, p7) \
  S2P_ALGORITHM(s7, s6, s5, s4, s3, s2, s1, s0, \
                p0, p1, p2, p3, p4, p5, p6, p7)
46
/*  s2p_ideal: serial to parallel transposition for an architecture with
    native support for simd_pack_{8,4,2}_{hh,ll} operations.

    Eight serial byte packs (s0..s7) become eight parallel bit blocks
    (p0..p7) in three rounds of eight pack operations each, 24 packs in all:
      round 1  hsimd<8>  packh/packl on pairs of inputs,
      round 2  hsimd<4>  packh/packl on pairs of round-1 results,
      round 3  hsimd<2>  packh/packl on pairs of round-2 results.
    Temporaries are named after the bit positions they are meant to carry
    (b0123_* = bits 0-3, b45_* = bits 4-5, ...), following the original
    naming.

    Every input argument is expanded twice (once for packh, once for packl)
    and all inputs are read before any output is written.
*/

#define s2p_ideal(s0, s1, s2, s3, s4, s5, s6, s7, p0, p1, p2, p3, p4, p5, p6, p7) \
  do { \
        /* Round 1: 8-bit fields. */ \
        BitBlock b0123_a = hsimd<8>::packh(s0, s1); \
        BitBlock b0123_b = hsimd<8>::packh(s2, s3); \
        BitBlock b0123_c = hsimd<8>::packh(s4, s5); \
        BitBlock b0123_d = hsimd<8>::packh(s6, s7); \
        BitBlock b4567_a = hsimd<8>::packl(s0, s1); \
        BitBlock b4567_b = hsimd<8>::packl(s2, s3); \
        BitBlock b4567_c = hsimd<8>::packl(s4, s5); \
        BitBlock b4567_d = hsimd<8>::packl(s6, s7); \
        /* Round 2: 4-bit fields. */ \
        BitBlock b01_a = hsimd<4>::packh(b0123_a, b0123_b); \
        BitBlock b01_b = hsimd<4>::packh(b0123_c, b0123_d); \
        BitBlock b23_a = hsimd<4>::packl(b0123_a, b0123_b); \
        BitBlock b23_b = hsimd<4>::packl(b0123_c, b0123_d); \
        BitBlock b45_a = hsimd<4>::packh(b4567_a, b4567_b); \
        BitBlock b45_b = hsimd<4>::packh(b4567_c, b4567_d); \
        BitBlock b67_a = hsimd<4>::packl(b4567_a, b4567_b); \
        BitBlock b67_b = hsimd<4>::packl(b4567_c, b4567_d); \
        /* Round 3: 2-bit fields, written straight to the outputs. */ \
        p0 = hsimd<2>::packh(b01_a, b01_b); \
        p1 = hsimd<2>::packl(b01_a, b01_b); \
        p2 = hsimd<2>::packh(b23_a, b23_b); \
        p3 = hsimd<2>::packl(b23_a, b23_b); \
        p4 = hsimd<2>::packh(b45_a, b45_b); \
        p5 = hsimd<2>::packl(b45_a, b45_b); \
        p6 = hsimd<2>::packh(b67_a, b67_b); \
        p7 = hsimd<2>::packl(b67_a, b67_b); \
  } while(0)
84
85
86/*  s2p_bytepack is a fast serial to parallel transposition
87    algorithm given an architecture with simd_pack_16 operations,
88    but not at small field widths.
89    MMX, SSE, Altivec ...
90*/
91
92
/* s2p_step, generic version (used unless USE_S2P_AVX is defined).

   One butterfly step of the bytepack transposition:
     - packs s0/s1 with hsimd<16>::packh and hsimd<16>::packl,
     - p0 = ifh(hi_mask, packh result, packl result shifted right by `shift`),
     - p1 = ifh(hi_mask, packh result shifted left by `shift`, packl result).
   `shift` is used as a template argument, so it must be a compile-time
   constant.  `hi_mask` is expanded twice.  Both pack results are held in
   temporaries, so p0/p1 are only written after s0/s1 have been read. */
#ifndef USE_S2P_AVX
#define s2p_step(s0, s1, hi_mask, shift, p0, p1) \
  do { \
        BitBlock step_hi = hsimd<16>::packh(s0, s1); \
        BitBlock step_lo = hsimd<16>::packl(s0, s1); \
        p0 = simd<1>::ifh(hi_mask, step_hi, simd<16>::srli<shift>(step_lo)); \
        p1 = simd<1>::ifh(hi_mask, simd<16>::slli<shift>(step_hi), step_lo); \
  } while(0)
#endif
103
104
/* For AVX, we use a modified s2p_step function to avoid a number
   of conversions from 128-bit mode to 256-bit mode just to
   immediately convert back: the packing and shifting are done on the
   128-bit halves with SSE intrinsics, and the halves are recombined only
   for the final simd<1>::ifh selection. */
#ifdef USE_S2P_AVX

/* 128-bit helpers.  Note the operand swaps: sse_andc(b1, b2) expands to
   _mm_andnot_si128(b2, b1), and both pack helpers pass their second
   argument first to _mm_packus_epi16. */
#define sse_andc(b1, b2)       _mm_andnot_si128(b2, b1)
#define sse_himask_16          _mm_set1_epi32(0xFF00FF00)
#define sse_slli_16(r, shft)   _mm_slli_epi16(r, shft)
#define sse_srli_16(r, shft)   _mm_srli_epi16(r, shft)
#define sse_packus_16(a, b)    _mm_packus_epi16(b, a)
/* Clear the bits selected by sse_himask_16 in each operand, then pack. */
#define sse_pack_16(a, b) \
  _mm_packus_epi16(sse_andc(b, sse_himask_16), sse_andc(a, sse_himask_16))
#define sse_pack_16_ll(v1, v2) sse_pack_16(v1, v2)
/* Shift each 16-bit field right by 8, then pack. */
#define sse_pack_16_hh(v1, v2) sse_packus_16(sse_srli_16(v1, 8), sse_srli_16(v2, 8))

/* s2p_step, AVX version: same parameters as the generic s2p_step.
   hb* are the sse_pack_16_hh results, lb* the sse_pack_16_ll results;
   index 0/1 refers to input s0/s1. */
#define s2p_step(s0, s1, hi_mask, shift, p0, p1) \
  do { \
        /* Split each 256-bit input into its 128-bit halves. */ \
        __m128i in0_hi = simd_hi128(s0); \
        __m128i in0_lo = simd_lo128(s0); \
        __m128i in1_hi = simd_hi128(s1); \
        __m128i in1_lo = simd_lo128(s1); \
        /* Pack each input's halves. */ \
        __m128i hb0 = sse_pack_16_hh(in0_hi, in0_lo); \
        __m128i lb0 = sse_pack_16_ll(in0_hi, in0_lo); \
        __m128i hb1 = sse_pack_16_hh(in1_hi, in1_lo); \
        __m128i lb1 = sse_pack_16_ll(in1_hi, in1_lo); \
        /* Shifted copies needed by the two selections below. */ \
        __m128i lb0_dn = sse_srli_16(lb0, shift); \
        __m128i lb1_dn = sse_srli_16(lb1, shift); \
        __m128i hb0_up = sse_slli_16(hb0, shift); \
        __m128i hb1_up = sse_slli_16(hb1, shift); \
        p0 = simd<1>::ifh(hi_mask, simd_combine256(hb0, hb1), simd_combine256(lb0_dn, lb1_dn)); \
        p1 = simd<1>::ifh(hi_mask, simd_combine256(hb0_up, hb1_up), simd_combine256(lb0, lb1)); \
  } while(0)
#endif
139
/* s2p_bytepack: transposition built from twelve s2p_step calls arranged in
   three stages of four:
     stage 1  mask simd<2>::himask(), shift 1, on pairs of inputs,
     stage 2  mask simd<4>::himask(), shift 2, on pairs of stage-1 results,
     stage 3  mask simd<8>::himask(), shift 4, writing the outputs in the
              pairs (p0,p4), (p1,p5), (p2,p6), (p3,p7).
   Temporaries are named after the bit positions they are meant to carry
   (b0246_* = bits 0,2,4,6; b15_* = bits 1,5; ...), following the original
   naming.  All inputs are consumed in stage 1, before any output is
   written. */
#define s2p_bytepack(s0, s1, s2, s3, s4, s5, s6, s7, p0, p1, p2, p3, p4, p5, p6, p7) \
  do { \
        /* Stage 1 */ \
        BitBlock b0246_a, b0246_b, b0246_c, b0246_d; \
        BitBlock b1357_a, b1357_b, b1357_c, b1357_d; \
        s2p_step(s0, s1, simd<2>::himask(), 1, b0246_a, b1357_a); \
        s2p_step(s2, s3, simd<2>::himask(), 1, b0246_b, b1357_b); \
        s2p_step(s4, s5, simd<2>::himask(), 1, b0246_c, b1357_c); \
        s2p_step(s6, s7, simd<2>::himask(), 1, b0246_d, b1357_d); \
        /* Stage 2 */ \
        BitBlock b04_a, b04_b, b26_a, b26_b; \
        BitBlock b15_a, b15_b, b37_a, b37_b; \
        s2p_step(b0246_a, b0246_b, simd<4>::himask(), 2, b04_a, b26_a); \
        s2p_step(b0246_c, b0246_d, simd<4>::himask(), 2, b04_b, b26_b); \
        s2p_step(b1357_a, b1357_b, simd<4>::himask(), 2, b15_a, b37_a); \
        s2p_step(b1357_c, b1357_d, simd<4>::himask(), 2, b15_b, b37_b); \
        /* Stage 3 */ \
        s2p_step(b04_a, b04_b, simd<8>::himask(), 4, p0, p4); \
        s2p_step(b15_a, b15_b, simd<8>::himask(), 4, p1, p5); \
        s2p_step(b26_a, b26_b, simd<8>::himask(), 4, p2, p6); \
        s2p_step(b37_a, b37_b, simd<8>::himask(), 4, p3, p7); \
  } while(0)
159
/* BitPack receives one hsimd<8>::signmask result.  Eight of them overlay a
   BitBlock exactly when sizeof(BitBlock) = 16, which is what
   movemask_step below assumes. */
typedef uint16_t BitPack;

/* movemask_step: stores hsimd<8>::signmask of each input into one 16-bit
   slot of a union and assigns the overlaid BitBlock to p.
   The parameters are declared in the order s7, s6, ..., s0, while slot i
   is filled from s<i>; i.e. the LAST input argument lands in slot 0. */
#define movemask_step(s7, s6, s5, s4, s3, s2, s1, s0, p) \
  do { \
        union { \
          BitPack  masks[8]; \
          BitBlock block; \
        } mm_overlay; \
        mm_overlay.masks[0] = hsimd<8>::signmask(s0); \
        mm_overlay.masks[1] = hsimd<8>::signmask(s1); \
        mm_overlay.masks[2] = hsimd<8>::signmask(s2); \
        mm_overlay.masks[3] = hsimd<8>::signmask(s3); \
        mm_overlay.masks[4] = hsimd<8>::signmask(s4); \
        mm_overlay.masks[5] = hsimd<8>::signmask(s5); \
        mm_overlay.masks[6] = hsimd<8>::signmask(s6); \
        mm_overlay.masks[7] = hsimd<8>::signmask(s7); \
        p = mm_overlay.block; \
  } while (0)
178
/* bitshift_step: t<i> = simd<8>::add(s<i>, s<i>) for i = 0..7, i.e. each
   input is added to itself in 8-bit fields (presumably a one-bit left
   shift of every byte, as the macro name suggests).  An output may name
   the same variable as its own input; s2p_movemask relies on this to
   update its temporaries in place. */
#define bitshift_step(s0, s1, s2, s3, s4, s5, s6, s7, \
                      t0, t1, t2, t3, t4, t5, t6, t7) \
  do { \
        t0 = simd<8>::add(s0, s0); \
        t1 = simd<8>::add(s1, s1); \
        t2 = simd<8>::add(s2, s2); \
        t3 = simd<8>::add(s3, s3); \
        t4 = simd<8>::add(s4, s4); \
        t5 = simd<8>::add(s5, s5); \
        t6 = simd<8>::add(s6, s6); \
        t7 = simd<8>::add(s7, s7); \
  } while (0)
190
191
/* s2p_movemask: transposition by repeated sign-mask extraction.
   p0 is taken from the inputs as given; for each further output the
   working copies sh0..sh7 are advanced by one bitshift_step and the sign
   masks are collected again, giving p1, p2, ..., p7 in turn
   (8 movemask_step calls and 7 bitshift_step calls in total).
   The inputs themselves are never modified: the first bitshift_step
   writes into the local working copies. */
#define s2p_movemask(s0, s1, s2, s3, s4, s5, s6, s7, p0, p1, p2, p3, p4, p5, p6, p7) \
  do { \
        BitBlock sh0, sh1, sh2, sh3, sh4, sh5, sh6, sh7; \
        movemask_step(s0, s1, s2, s3, s4, s5, s6, s7, p0); \
        bitshift_step(s0, s1, s2, s3, s4, s5, s6, s7, sh0, sh1, sh2, sh3, sh4, sh5, sh6, sh7); \
        movemask_step(sh0, sh1, sh2, sh3, sh4, sh5, sh6, sh7, p1); \
        bitshift_step(sh0, sh1, sh2, sh3, sh4, sh5, sh6, sh7, sh0, sh1, sh2, sh3, sh4, sh5, sh6, sh7); \
        movemask_step(sh0, sh1, sh2, sh3, sh4, sh5, sh6, sh7, p2); \
        bitshift_step(sh0, sh1, sh2, sh3, sh4, sh5, sh6, sh7, sh0, sh1, sh2, sh3, sh4, sh5, sh6, sh7); \
        movemask_step(sh0, sh1, sh2, sh3, sh4, sh5, sh6, sh7, p3); \
        bitshift_step(sh0, sh1, sh2, sh3, sh4, sh5, sh6, sh7, sh0, sh1, sh2, sh3, sh4, sh5, sh6, sh7); \
        movemask_step(sh0, sh1, sh2, sh3, sh4, sh5, sh6, sh7, p4); \
        bitshift_step(sh0, sh1, sh2, sh3, sh4, sh5, sh6, sh7, sh0, sh1, sh2, sh3, sh4, sh5, sh6, sh7); \
        movemask_step(sh0, sh1, sh2, sh3, sh4, sh5, sh6, sh7, p5); \
        bitshift_step(sh0, sh1, sh2, sh3, sh4, sh5, sh6, sh7, sh0, sh1, sh2, sh3, sh4, sh5, sh6, sh7); \
        movemask_step(sh0, sh1, sh2, sh3, sh4, sh5, sh6, sh7, p6); \
        bitshift_step(sh0, sh1, sh2, sh3, sh4, sh5, sh6, sh7, sh0, sh1, sh2, sh3, sh4, sh5, sh6, sh7); \
        movemask_step(sh0, sh1, sh2, sh3, sh4, sh5, sh6, sh7, p7); \
  } while (0)
211
212
213#endif // S2P_HPP
214
Note: See TracBrowser for help on using the repository browser.