source: trunk/lib_c/s2p.h @ 4004

Last change on this file since 4004 was 3391, checked in by linmengl, 6 years ago

check in IDISA C library and other support libraries. Some template features still remain.

File size: 8.0 KB
/* Generated by cpp2c.rb from ./s2p.hpp
 * Use IDISA C support
*/

/*  s2p - Serial to Parallel Bit Stream Transposition
    Copyright (c) 2007, 2008, 2010, 2011  Robert D. Cameron.
    Licensed to the public under the Open Software License 3.0.
    Licensed to International Characters Inc.
       under the Academic Free License version 3.0.
*/

#ifndef S2P_H
#define S2P_H

#include <stdint.h>   /* uint16_t, used for BitPack below */
#include "idisa128_c.h"

#define BytePack BitBlock

/* Given a block of bytes in 8 consecutive registers s0, s1, ..., s7,
   s2p transposes the block into 8 parallel bitstream blocks p0, p1, ..., p7.

   The following prototype shows the intent, although a macro is used for
   speed.
static inline void s2p(BytePack s0, BytePack s1, BytePack s2, BytePack s3,
                       BytePack s4, BytePack s5, BytePack s6, BytePack s7,
                       BitBlock& p0, BitBlock& p1, BitBlock& p2, BitBlock& p3,
                       BitBlock& p4, BitBlock& p5, BitBlock& p6, BitBlock& p7);
*/
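
/*  An illustrative sketch (not part of this header): a hypothetical
    wrapper showing how the s2p macro defined below might be invoked on
    eight already-loaded BytePack registers.

static inline void s2p_block(BytePack s[8], BitBlock p[8]) {
  s2p(s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7],
      p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7]);
}

    Conceptually, bit i of each input byte lands in output stream p_i,
    with bit 0 denoting the most significant bit of a byte; the ordering
    of byte positions within each output BitBlock is determined by the
    pack semantics of the target architecture.
*/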

/*  1.  ALGORITHM Selection.
        Choice of 3 algorithms: s2p_ideal, s2p_movemask, s2p_bytepack.
        The default is s2p_bytepack.
        Compile with -DUSE_S2P_IDEAL or -DUSE_S2P_MOVEMASK to override.
*/
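
/*  For example (the file name is illustrative only):
        cc -DUSE_S2P_MOVEMASK -c transposer.c
*/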

#ifdef USE_S2P_IDEAL
#define S2P_ALGORITHM s2p_ideal
#endif

#ifdef USE_S2P_MOVEMASK
#define S2P_ALGORITHM s2p_movemask
#endif

#ifndef S2P_ALGORITHM
#define S2P_ALGORITHM s2p_bytepack
#endif

#define s2p(s0, s1, s2, s3, s4, s5, s6, s7, p0, p1, p2, p3, p4, p5, p6, p7)\
  S2P_ALGORITHM(s7, s6, s5, s4, s3, s2, s1, s0, p0, p1, p2, p3, p4, p5, p6, p7)

/*  s2p_ideal is an ideal serial-to-parallel transposition algorithm
    for an architecture with native support for the hsimd_pack{h,l}_{8,4,2}
    operations, transposing 8 serial bytepacks into 8 parallel bitblocks
    in only 24 pack operations.
*/
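
/*  The intermediate names below record which original bit positions a
    register still carries (bit 0 being the most significant bit of each
    byte): the first pack stage separates the high nybbles (bits 0-3)
    from the low nybbles (bits 4-7), the second stage splits those into
    bit pairs, and the third stage isolates the individual bit streams,
    giving 8 + 8 + 8 = 24 pack operations in total.
*/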

#define s2p_ideal(s0, s1, s2, s3, s4, s5, s6, s7, p0, p1, p2, p3, p4, p5, p6, p7) \
  do {\
        BitBlock bit0123_0, bit0123_1, bit0123_2, bit0123_3,\
        bit4567_0, bit4567_1, bit4567_2, bit4567_3;\
        BitBlock bit01_0, bit01_1, bit23_0, bit23_1, bit45_0, bit45_1, bit67_0, bit67_1;\
        bit0123_0 = hsimd_packh_8(s0, s1);\
        bit0123_1 = hsimd_packh_8(s2, s3);\
        bit0123_2 = hsimd_packh_8(s4, s5);\
        bit0123_3 = hsimd_packh_8(s6, s7);\
        bit4567_0 = hsimd_packl_8(s0, s1);\
        bit4567_1 = hsimd_packl_8(s2, s3);\
        bit4567_2 = hsimd_packl_8(s4, s5);\
        bit4567_3 = hsimd_packl_8(s6, s7);\
        bit01_0 = hsimd_packh_4(bit0123_0, bit0123_1);\
        bit01_1 = hsimd_packh_4(bit0123_2, bit0123_3);\
        bit23_0 = hsimd_packl_4(bit0123_0, bit0123_1);\
        bit23_1 = hsimd_packl_4(bit0123_2, bit0123_3);\
        bit45_0 = hsimd_packh_4(bit4567_0, bit4567_1);\
        bit45_1 = hsimd_packh_4(bit4567_2, bit4567_3);\
        bit67_0 = hsimd_packl_4(bit4567_0, bit4567_1);\
        bit67_1 = hsimd_packl_4(bit4567_2, bit4567_3);\
        p0 = hsimd_packh_2(bit01_0, bit01_1);\
        p1 = hsimd_packl_2(bit01_0, bit01_1);\
        p2 = hsimd_packh_2(bit23_0, bit23_1);\
        p3 = hsimd_packl_2(bit23_0, bit23_1);\
        p4 = hsimd_packh_2(bit45_0, bit45_1);\
        p5 = hsimd_packl_2(bit45_0, bit45_1);\
        p6 = hsimd_packh_2(bit67_0, bit67_1);\
        p7 = hsimd_packl_2(bit67_0, bit67_1);\
  } while(0)


/*  s2p_bytepack is a fast serial-to-parallel transposition algorithm
    for architectures that provide packing at 16-bit field width
    (hsimd_pack{h,l}_16) but not at smaller field widths, such as
    MMX, SSE and Altivec.
*/
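
/*  The intermediate names again record the bit positions carried by
    each value (e.g. bit00224466_0 holds the even-numbered bits of its
    input bytes).  Stage 1 separates even from odd bit positions,
    stage 2 splits those into {0,4}, {2,6}, {1,5} and {3,7}, and
    stage 3 isolates the single-bit streams.  Each of the 12 s2p_step
    invocations below costs two packs, two shifts and two blends in
    the 128-bit version, i.e. 72 SIMD operations per block.
*/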


#ifndef USE_S2P_AVX
#define s2p_step(s0, s1, hi_mask, shift, p0, p1)  \
  do {\
        BitBlock t0,t1;\
        t0 = hsimd_packh_16(s0, s1);\
        t1 = hsimd_packl_16(s0, s1);\
        p0 = simd_ifh_1(hi_mask, t0, simd_srli_16(shift, t1));\
        p1 = simd_ifh_1(hi_mask, simd_slli_16(shift, t0), t1);\
  } while(0)
#endif
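
/*  In each s2p_step, t0 and t1 receive the high and low bytes of the
    16-bit fields of s0 and s1; the hi_mask/shift blend then routes the
    hi_mask-selected bit positions of both into p0 and the complementary
    positions into p1.
*/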


/* For AVX, we use a modified s2p_step macro to avoid a number
   of conversions from 128-bit mode to 256-bit mode just to
   immediately convert back. */
#ifdef USE_S2P_AVX
#include "idisa_cpp/idisa_sse2.cpp"
#define avx_select_lo128(x) \
        ((__m128i) _mm256_castps256_ps128(x))

#define avx_select_hi128(x) \
        ((__m128i)(_mm256_extractf128_ps(x, 1)))

#define avx_general_combine256(x, y) \
   (_mm256_insertf128_ps(_mm256_castps128_ps256((__m128) y), (__m128) x, 1))

#define s2p_step(s0, s1, hi_mask, shift, p0, p1)  \
  do {\
        bitblock128_t s00, s01, s10, s11, t00, t01, t10, t11;\
        bitblock128_t t10shift, t11shift, t00shift, t01shift;\
        s00 = avx_select_hi128(s0);\
        s01 = avx_select_lo128(s0);\
        s10 = avx_select_hi128(s1);\
        s11 = avx_select_lo128(s1);\
        t00 = hsimd_packh_16(s00, s01);\
        t10 = hsimd_packl_16(s00, s01);\
        t01 = hsimd_packh_16(s10, s11);\
        t11 = hsimd_packl_16(s10, s11);\
        t10shift = simd_srli_16(shift, t10);\
        t11shift = simd_srli_16(shift, t11);\
        t00shift = simd_slli_16(shift, t00);\
        t01shift = simd_slli_16(shift, t01);\
        p0 = simd_ifh_1(hi_mask, avx_general_combine256(t00, t01), avx_general_combine256(t10shift, t11shift));\
        p1 = simd_ifh_1(hi_mask, avx_general_combine256(t00shift, t01shift), avx_general_combine256(t10, t11));\
  } while(0)
#endif
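
/*  Under USE_S2P_AVX, the BitBlock arguments are 256-bit values: each
    is split into two 128-bit halves (bitblock128_t), packed and shifted
    with the 128-bit primitives from idisa_sse2.cpp, and recombined
    before the final blend.
*/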

#define s2p_bytepack(s0, s1, s2, s3, s4, s5, s6, s7, p0, p1, p2, p3, p4, p5, p6, p7) \
  do {\
        BitBlock bit00224466_0,bit00224466_1,bit00224466_2,bit00224466_3;\
        BitBlock bit11335577_0,bit11335577_1,bit11335577_2,bit11335577_3;\
        BitBlock bit00004444_0,bit22226666_0,bit00004444_1,bit22226666_1;\
        BitBlock bit11115555_0,bit33337777_0,bit11115555_1,bit33337777_1;\
        s2p_step(s0,s1,simd_himask_2(),1,bit00224466_0,bit11335577_0);\
        s2p_step(s2,s3,simd_himask_2(),1,bit00224466_1,bit11335577_1);\
        s2p_step(s4,s5,simd_himask_2(),1,bit00224466_2,bit11335577_2);\
        s2p_step(s6,s7,simd_himask_2(),1,bit00224466_3,bit11335577_3);\
        s2p_step(bit00224466_0,bit00224466_1,simd_himask_4(),2,bit00004444_0,bit22226666_0);\
        s2p_step(bit00224466_2,bit00224466_3,simd_himask_4(),2,bit00004444_1,bit22226666_1);\
        s2p_step(bit11335577_0,bit11335577_1,simd_himask_4(),2,bit11115555_0,bit33337777_0);\
        s2p_step(bit11335577_2,bit11335577_3,simd_himask_4(),2,bit11115555_1,bit33337777_1);\
        s2p_step(bit00004444_0,bit00004444_1,simd_himask_8(),4,p0,p4);\
        s2p_step(bit11115555_0,bit11115555_1,simd_himask_8(),4,p1,p5);\
        s2p_step(bit22226666_0,bit22226666_1,simd_himask_8(),4,p2,p6);\
        s2p_step(bit33337777_0,bit33337777_1,simd_himask_8(),4,p3,p7);\
  } while(0)

/* For sizeof(BitBlock) = 16: each hsimd_signmask_8 result is one
   16-bit BitPack, and eight of them fill a BitBlock. */
typedef uint16_t BitPack;

#define movemask_step(s7, s6, s5, s4, s3, s2, s1, s0, p) \
  do { \
        union { BitPack bit_pack[8];\
                BitBlock bit_block;\
              } b;\
        b.bit_pack[0] = hsimd_signmask_8(s0);\
        b.bit_pack[1] = hsimd_signmask_8(s1);\
        b.bit_pack[2] = hsimd_signmask_8(s2);\
        b.bit_pack[3] = hsimd_signmask_8(s3);\
        b.bit_pack[4] = hsimd_signmask_8(s4);\
        b.bit_pack[5] = hsimd_signmask_8(s5);\
        b.bit_pack[6] = hsimd_signmask_8(s6);\
        b.bit_pack[7] = hsimd_signmask_8(s7);\
        p = b.bit_block;\
   } while (0)

#define bitshift_step(s0, s1, s2, s3, s4, s5, s6, s7, t0, t1, t2, t3, t4, t5, t6, t7) \
  do { \
        t0 = simd_add_8(s0, s0);\
        t1 = simd_add_8(s1, s1);\
        t2 = simd_add_8(s2, s2);\
        t3 = simd_add_8(s3, s3);\
        t4 = simd_add_8(s4, s4);\
        t5 = simd_add_8(s5, s5);\
        t6 = simd_add_8(s6, s6);\
        t7 = simd_add_8(s7, s7);\
  } while (0)
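
/*  s2p_movemask extracts one bit stream per pass: hsimd_signmask_8
    gathers the sign (most significant) bit of every byte, and
    bitshift_step doubles each byte (simd_add_8(s, s), i.e. a left
    shift by one) so that the next bit position moves into the sign
    position for the following pass.  Eight passes produce p0 .. p7.
*/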


#define s2p_movemask(s0, s1, s2, s3, s4, s5, s6, s7, p0, p1, p2, p3, p4, p5, p6, p7) \
  do { \
        BitBlock t0, t1, t2, t3, t4, t5, t6, t7;\
        movemask_step(s0, s1, s2, s3, s4, s5, s6, s7, p0);\
        bitshift_step(s0, s1, s2, s3, s4, s5, s6, s7, t0, t1, t2, t3, t4, t5, t6, t7);\
        movemask_step(t0, t1, t2, t3, t4, t5, t6, t7, p1);\
        bitshift_step(t0, t1, t2, t3, t4, t5, t6, t7, t0, t1, t2, t3, t4, t5, t6, t7);\
        movemask_step(t0, t1, t2, t3, t4, t5, t6, t7, p2);\
        bitshift_step(t0, t1, t2, t3, t4, t5, t6, t7, t0, t1, t2, t3, t4, t5, t6, t7);\
        movemask_step(t0, t1, t2, t3, t4, t5, t6, t7, p3);\
        bitshift_step(t0, t1, t2, t3, t4, t5, t6, t7, t0, t1, t2, t3, t4, t5, t6, t7);\
        movemask_step(t0, t1, t2, t3, t4, t5, t6, t7, p4);\
        bitshift_step(t0, t1, t2, t3, t4, t5, t6, t7, t0, t1, t2, t3, t4, t5, t6, t7);\
        movemask_step(t0, t1, t2, t3, t4, t5, t6, t7, p5);\
        bitshift_step(t0, t1, t2, t3, t4, t5, t6, t7, t0, t1, t2, t3, t4, t5, t6, t7);\
        movemask_step(t0, t1, t2, t3, t4, t5, t6, t7, p6);\
        bitshift_step(t0, t1, t2, t3, t4, t5, t6, t7, t0, t1, t2, t3, t4, t5, t6, t7);\
        movemask_step(t0, t1, t2, t3, t4, t5, t6, t7, p7);\
  } while (0)


#endif // S2P_H