source: trunk/lib/p2s.h @ 613

Last change on this file since 613 was 613, checked in by lindanl, 9 years ago

Add p2s.h

File size: 4.6 KB
Line 
/*  p2s - Parallel to Serial Bit Stream Transposition
    Copyright (c) 2007, 2008, 2010, Robert D. Cameron.
    Licensed to the public under the Open Software License 3.0.
*/
5
6#ifndef P2S_H
7#define P2S_H
8
/* BitBlock and BytePack are both plain aliases for SIMD_type, which must be
   declared before this header is included.  The two names only document
   the role a value plays at each use site. */
#define BitBlock SIMD_type
#define BytePack SIMD_type
11
12static inline void p2s(BytePack s0, BytePack s1, BytePack s2, BytePack s3,
13                       BytePack s5, BytePack s6, BytePack s7, BytePack s8,
14                       BitBlock& p0, BitBlock& p1, BitBlock& p2, BitBlock& p3, 
15                       BitBlock& p4, BitBlock& p5, BitBlock& p6, BitBlock& p7);
16
17
/*  Algorithm selection.
    Define USE_P2S_IDEAL or USE_P2S_MOVEMASK before including this header to
    pick a specific transposition macro.  When neither is defined,
    P2S_ALGORITHM falls back to p2s_bytemerge.
    NOTE: no p2s_movemask macro is defined in this header. */
#if defined(USE_P2S_IDEAL)
#define P2S_ALGORITHM p2s_ideal
#endif

#if defined(USE_P2S_MOVEMASK)
#define P2S_ALGORITHM p2s_movemask  /* Not yet implemented. */
#endif

#if !defined(P2S_ALGORITHM)
#define P2S_ALGORITHM p2s_bytemerge
#endif
30
31
32
/*  p2s_ideal is an ideal parallel to serial transposition
    algorithm given an architecture with native support for
    simd_merge{h,l}_{1,2,4} operations, achieving transposition
    of 8 parallel bitblocks into 8 serial bytepacks in only 24 merge
    operations.
*/
39
40
/*  p2s_ideal(p0..p7, s0..s7)
    Reads the eight inputs p0..p7 and assigns the eight outputs s0..s7 using
    three rounds of eight merge operations each (field widths 1, 2 and 4),
    24 merges in total.  Every input is read before any output is written.
    The simd_merge{h,l}_{1,2,4} operations are not defined in this header and
    must be available wherever the macro is expanded. */
#define p2s_ideal(p0,p1,p2,p3,p4,p5,p6,p7,s0,s1,s2,s3,s4,s5,s6,s7)  \
  do { \
        /* Round 1: merge stream pairs (0,1) (2,3) (4,5) (6,7) at width 1. */ \
        BitBlock pair01_h = simd_mergeh_1(p0,p1); \
        BitBlock pair01_l = simd_mergel_1(p0,p1); \
        BitBlock pair23_h = simd_mergeh_1(p2,p3); \
        BitBlock pair23_l = simd_mergel_1(p2,p3); \
        BitBlock pair45_h = simd_mergeh_1(p4,p5); \
        BitBlock pair45_l = simd_mergel_1(p4,p5); \
        BitBlock pair67_h = simd_mergeh_1(p6,p7); \
        BitBlock pair67_l = simd_mergel_1(p6,p7); \
        /* Round 2: merge the pair results at width 2. */ \
        BitBlock quad0123_a = simd_mergeh_2(pair01_h,pair23_h); \
        BitBlock quad0123_b = simd_mergel_2(pair01_h,pair23_h); \
        BitBlock quad0123_c = simd_mergeh_2(pair01_l,pair23_l); \
        BitBlock quad0123_d = simd_mergel_2(pair01_l,pair23_l); \
        BitBlock quad4567_a = simd_mergeh_2(pair45_h,pair67_h); \
        BitBlock quad4567_b = simd_mergel_2(pair45_h,pair67_h); \
        BitBlock quad4567_c = simd_mergeh_2(pair45_l,pair67_l); \
        BitBlock quad4567_d = simd_mergel_2(pair45_l,pair67_l); \
        /* Round 3: merge the two halves at width 4 into the outputs. */ \
        s0 = simd_mergeh_4(quad0123_a,quad4567_a); \
        s1 = simd_mergel_4(quad0123_a,quad4567_a); \
        s2 = simd_mergeh_4(quad0123_b,quad4567_b); \
        s3 = simd_mergel_4(quad0123_b,quad4567_b); \
        s4 = simd_mergeh_4(quad0123_c,quad4567_c); \
        s5 = simd_mergel_4(quad0123_c,quad4567_c); \
        s6 = simd_mergeh_4(quad0123_d,quad4567_d); \
        s7 = simd_mergel_4(quad0123_d,quad4567_d); \
  } while(0)
71
/*  p2s_bytemerge is a fast parallel to serial transposition
    algorithm for architectures that provide simd_merge{h,l}_8
    operations but no merge operations at smaller field widths
    (e.g. MMX, SSE, Altivec).
*/
77 
/*  p2s_step(p0, p1, hi_mask, shift, s0, s1)
    One combining step used by p2s_bytemerge.  Two intermediate blocks are
    built with simd_if under hi_mask: the first from p0 and p1 shifted right
    by `shift`, the second from p0 shifted left by `shift` and p1 (shifts are
    the 16-bit-field forms).  The two blocks are then merged byte-wise: s0
    receives the mergeh_8 result and s1 the mergel_8 result.
    NOTE(review): simd_if is presumably a bitwise select on hi_mask; it is
    not defined in this header, so confirm against the SIMD library. */
#define p2s_step(p0,p1,hi_mask,shift,s0,s1)  \
  do { \
        BitBlock step_sel_a = simd_if(hi_mask,p0,simd_srli_16(p1,shift)); \
        BitBlock step_sel_b = simd_if(hi_mask,simd_slli_16(p0,shift),p1); \
        s0 = simd_mergeh_8(step_sel_a,step_sel_b); \
        s1 = simd_mergel_8(step_sel_a,step_sel_b); \
  } while(0)
86
/*  p2s_bytemerge(p0..p7, s0..s7)
    Reads the eight inputs p0..p7 and assigns the eight outputs s0..s7 using
    twelve p2s_step expansions in three rounds:
      round 1 combines streams (0,4) (1,5) (2,6) (3,7) with simd_himask_8, shift 4;
      round 2 combines the round-1 results with simd_himask_4, shift 2;
      round 3 combines the round-2 results with simd_himask_2, shift 1.
    All inputs are consumed in round 1, before any output is written. */
#define p2s_bytemerge(p0,p1,p2,p3,p4,p5,p6,p7,s0,s1,s2,s3,s4,s5,s6,s7)  \
  do { \
        BitBlock mix04_0,mix04_1; \
        BitBlock mix15_0,mix15_1; \
        BitBlock mix26_0,mix26_1; \
        BitBlock mix37_0,mix37_1; \
        BitBlock even_0,even_1,even_2,even_3; \
        BitBlock odd_0,odd_1,odd_2,odd_3; \
        /* Round 1 */ \
        p2s_step(p0,p4,simd_himask_8,4,mix04_0,mix04_1);  \
        p2s_step(p1,p5,simd_himask_8,4,mix15_0,mix15_1);  \
        p2s_step(p2,p6,simd_himask_8,4,mix26_0,mix26_1);  \
        p2s_step(p3,p7,simd_himask_8,4,mix37_0,mix37_1);  \
        /* Round 2 */ \
        p2s_step(mix04_0,mix26_0,simd_himask_4,2,even_0,even_1);  \
        p2s_step(mix15_0,mix37_0,simd_himask_4,2,odd_0,odd_1);  \
        p2s_step(mix04_1,mix26_1,simd_himask_4,2,even_2,even_3);  \
        p2s_step(mix15_1,mix37_1,simd_himask_4,2,odd_2,odd_3);  \
        /* Round 3 */ \
        p2s_step(even_0,odd_0,simd_himask_2,1,s0,s1);  \
        p2s_step(even_1,odd_1,simd_himask_2,1,s2,s3);  \
        p2s_step(even_2,odd_2,simd_himask_2,1,s4,s5);  \
        p2s_step(even_3,odd_3,simd_himask_2,1,s6,s7);  \
  } while(0)
106
107
108inline void p2s(BytePack s0, BytePack s1, BytePack s2, BytePack s3,
109                       BytePack s4, BytePack s5, BytePack s6, BytePack s7,
110                       BitBlock& p0, BitBlock& p1, BitBlock& p2, BitBlock& p3, 
111                       BitBlock& p4, BitBlock& p5, BitBlock& p6, BitBlock& p7) {
112
113#if (BYTE_ORDER == BIG_ENDIAN)
114  P2S_ALGORITHM(s0, s1, s2, s3, s4, s5, s6, s7, p0, p1, p2, p3, p4, p5, p6, p7);
115#endif
116#if (BYTE_ORDER == LITTLE_ENDIAN)
117  P2S_ALGORITHM(s7, s6, s5, s4, s3, s2, s1, s0, p0, p1, p2, p3, p4, p5, p6, p7);
118#endif
119}
120
121#endif
122
Note: See TracBrowser for help on using the repository browser.