1 | /* p2s - Parallel to Serial Bit Stream Transposition |
---|
2 | Copyright (c) 2007, 2008, 2010, Robert D. Cameron. |
---|
3 | Licensed to the public under the Open Software License 3.0. |
---|
4 | Licensed to International Characters Inc. |
---|
5 | under the Academic Free License version 3.0. |
---|
6 | |
---|
7 | */ |
---|
8 | |
---|
9 | #ifndef P2S_HPP |
---|
10 | #define P2S_HPP |
---|
11 | |
---|
12 | #include "idisa128.hpp" |
---|
13 | |
---|
/* BytePack is an alias for BitBlock.  This header uses the name for the
   serial (byte-oriented) outputs s0..s7 of the transposition, while the
   parallel bit stream inputs p0..p7 are written as BitBlock. */
#define BytePack BitBlock
---|
15 | |
---|
16 | /* |
---|
17 | Given 8 parallel bitstream blocks p0, p1, ..., p7, inverse transpose |
---|
18 | the data into a block of bytes in 8 consecutive registers s0, s1, ..., s7. |
---|
19 | |
---|
20 | The following header shows the intent, although a macro is used for |
---|
21 | speed. |
---|
22 | static inline void p2s(BitBlock p0, BitBlock p1, BitBlock p2, BitBlock p3, |
---|
23 | BitBlock p4, BitBlock p5, BitBlock p6, BitBlock p7, |
---|
24 | BytePack& s0, BytePack& s1, BytePack& s2, BytePack& s3, |
---|
25 | BytePack& s4, BytePack& s5, BytePack& s6, BytePack& s7 |
---|
26 | ); |
---|
27 | |
---|
28 | */ |
---|
29 | |
---|
/* Algorithm selection.
   Defining USE_P2S_IDEAL before this point selects p2s_ideal.
   Otherwise a P2S_ALGORITHM that is already defined is left untouched,
   and p2s_bytemerge is the default when none is defined. */
#if defined(USE_P2S_IDEAL)
#define P2S_ALGORITHM p2s_ideal
#endif

#if !defined(P2S_ALGORITHM)
#define P2S_ALGORITHM p2s_bytemerge
#endif
---|
38 | |
---|
39 | /* p2s_ideal is an ideal parallel to serial transposition |
---|
40 | algorithm given an architecture with native support for |
---|
41 | esimd<{4,2,1}>::merge{h,l} operations, achieving transposition |
---|
42 | of 8 parallel bitblocks into 8 serial bytepacks in only 24 merge |
---|
43 | operations. |
---|
44 | */ |
---|
45 | |
---|
/* p2s_ideal(p0..p7, s0..s7)
   Three rounds of eight esimd merges each (24 merges in total):
     round 1: esimd<1>::mergeh/mergel on the pairs (p0,p1), (p2,p3),
              (p4,p5), (p6,p7);
     round 2: esimd<2>::mergeh/mergel combining the 01 results with the 23
              results, and the 45 results with the 67 results;
     round 3: esimd<4>::mergeh/mergel combining the 0123 results with the
              4567 results, assigned to s0..s7.
   p0..p7 are only read (each twice, in round 1); s0..s7 are only assigned
   (in round 3).  Expands to a single do { ... } while(0) statement. */
#define p2s_ideal(p0,p1,p2,p3,p4,p5,p6,p7,s0,s1,s2,s3,s4,s5,s6,s7) \
  do { \
    /* round 1: field width 1 */ \
    BitBlock duo01_h = esimd<1>::mergeh(p0,p1); \
    BitBlock duo01_l = esimd<1>::mergel(p0,p1); \
    BitBlock duo23_h = esimd<1>::mergeh(p2,p3); \
    BitBlock duo23_l = esimd<1>::mergel(p2,p3); \
    BitBlock duo45_h = esimd<1>::mergeh(p4,p5); \
    BitBlock duo45_l = esimd<1>::mergel(p4,p5); \
    BitBlock duo67_h = esimd<1>::mergeh(p6,p7); \
    BitBlock duo67_l = esimd<1>::mergel(p6,p7); \
    /* round 2: field width 2 */ \
    BitBlock quad0123_hh = esimd<2>::mergeh(duo01_h,duo23_h); \
    BitBlock quad0123_hl = esimd<2>::mergel(duo01_h,duo23_h); \
    BitBlock quad0123_lh = esimd<2>::mergeh(duo01_l,duo23_l); \
    BitBlock quad0123_ll = esimd<2>::mergel(duo01_l,duo23_l); \
    BitBlock quad4567_hh = esimd<2>::mergeh(duo45_h,duo67_h); \
    BitBlock quad4567_hl = esimd<2>::mergel(duo45_h,duo67_h); \
    BitBlock quad4567_lh = esimd<2>::mergeh(duo45_l,duo67_l); \
    BitBlock quad4567_ll = esimd<2>::mergel(duo45_l,duo67_l); \
    /* round 3: field width 4, results go to the caller's outputs */ \
    s0 = esimd<4>::mergeh(quad0123_hh,quad4567_hh); \
    s1 = esimd<4>::mergel(quad0123_hh,quad4567_hh); \
    s2 = esimd<4>::mergeh(quad0123_hl,quad4567_hl); \
    s3 = esimd<4>::mergel(quad0123_hl,quad4567_hl); \
    s4 = esimd<4>::mergeh(quad0123_lh,quad4567_lh); \
    s5 = esimd<4>::mergel(quad0123_lh,quad4567_lh); \
    s6 = esimd<4>::mergeh(quad0123_ll,quad4567_ll); \
    s7 = esimd<4>::mergel(quad0123_ll,quad4567_ll); \
  } while(0)
---|
76 | |
---|
77 | /* p2s_bytemerge is a fast parallel to serial transposition |
---|
78 | algorithm for architectures that provide esimd<8>::merge{h,l} |
---|
79 | but no merge operations at smaller field widths |
---|
80 | (e.g. MMX, SSE, Altivec). |
---|
81 | */ |
---|
82 | |
---|
/* p2s_step(p0, p1, hi_mask, shift, s0, s1)
   Helper step for p2s_bytemerge.  Builds two temporaries
     t0 = simd<1>::ifh(hi_mask, p0, simd<16>::srli<shift>(p1))
     t1 = simd<1>::ifh(hi_mask, simd<16>::slli<shift>(p0), p1)
   and then assigns
     s0 = esimd<8>::mergeh(t0, t1)
     s1 = esimd<8>::mergel(t0, t1)

   shift is used as a template argument, so it must be a compile-time
   constant.  p0, p1 and hi_mask each appear twice in the expansion, so
   pass expressions without side effects.

   The temporaries carry a p2s_step_ prefix on purpose: with short names
   such as t0/t1, a caller passing its own variable called t0 or t1 as any
   argument would have it captured by the macro's locals, and the caller's
   variable would silently be read or written incorrectly. */
#define p2s_step(p0,p1,hi_mask,shift,s0,s1) \
  do { \
    BitBlock p2s_step_t0_ = simd<1>::ifh(hi_mask,p0,simd<16>::srli<shift>(p1)); \
    BitBlock p2s_step_t1_ = simd<1>::ifh(hi_mask,simd<16>::slli<shift>(p0),p1); \
    s0 = esimd<8>::mergeh(p2s_step_t0_,p2s_step_t1_); \
    s1 = esimd<8>::mergel(p2s_step_t0_,p2s_step_t1_); \
  } while(0)
---|
91 | |
---|
/* p2s_bytemerge(p0..p7, s0..s7)
   Twelve p2s_step invocations arranged in three stages of four:
     stage 1: mask simd<8>::himask(), shift 4, on the pairs (p0,p4),
              (p1,p5), (p2,p6), (p3,p7);
     stage 2: mask simd<4>::himask(), shift 2, pairing the 04 results with
              the 26 results and the 15 results with the 37 results;
     stage 3: mask simd<2>::himask(), shift 1, pairing the 0246 results
              with the 1357 results and writing s0..s7.
   p0..p7 are read only in stage 1; s0..s7 are assigned only in stage 3.
   Expands to a single do { ... } while(0) statement. */
#define p2s_bytemerge(p0,p1,p2,p3,p4,p5,p6,p7,s0,s1,s2,s3,s4,s5,s6,s7) \
  do { \
    /* stage 1 outputs */ \
    BitBlock mix04_0, mix04_1; \
    BitBlock mix15_0, mix15_1; \
    BitBlock mix26_0, mix26_1; \
    BitBlock mix37_0, mix37_1; \
    /* stage 2 outputs */ \
    BitBlock mix0246_0, mix0246_1, mix0246_2, mix0246_3; \
    BitBlock mix1357_0, mix1357_1, mix1357_2, mix1357_3; \
    \
    p2s_step(p0, p4, simd<8>::himask(), 4, mix04_0, mix04_1); \
    p2s_step(p1, p5, simd<8>::himask(), 4, mix15_0, mix15_1); \
    p2s_step(p2, p6, simd<8>::himask(), 4, mix26_0, mix26_1); \
    p2s_step(p3, p7, simd<8>::himask(), 4, mix37_0, mix37_1); \
    \
    p2s_step(mix04_0, mix26_0, simd<4>::himask(), 2, mix0246_0, mix0246_1); \
    p2s_step(mix15_0, mix37_0, simd<4>::himask(), 2, mix1357_0, mix1357_1); \
    p2s_step(mix04_1, mix26_1, simd<4>::himask(), 2, mix0246_2, mix0246_3); \
    p2s_step(mix15_1, mix37_1, simd<4>::himask(), 2, mix1357_2, mix1357_3); \
    \
    p2s_step(mix0246_0, mix1357_0, simd<2>::himask(), 1, s0, s1); \
    p2s_step(mix0246_1, mix1357_1, simd<2>::himask(), 1, s2, s3); \
    p2s_step(mix0246_2, mix1357_2, simd<2>::himask(), 1, s4, s5); \
    p2s_step(mix0246_3, mix1357_3, simd<2>::himask(), 1, s6, s7); \
  } while(0)
---|
111 | |
---|
/* p2s(p0..p7, s0..s7)
   Public entry point: forwards to whichever macro P2S_ALGORITHM names.
   The inputs p0..p7 are passed through in order; the outputs are passed
   in REVERSE order, so the algorithm's first output lands in s7 and its
   last output lands in s0.
   NOTE(review): the reversal presumably compensates for byte/register
   ordering on the target platform -- confirm before changing it. */
#define p2s(p0, p1, p2, p3, p4, p5, p6, p7, \
            s0, s1, s2, s3, s4, s5, s6, s7) \
  P2S_ALGORITHM(p0, p1, p2, p3, p4, p5, p6, p7, \
                s7, s6, s5, s4, s3, s2, s1, s0)
---|
114 | |
---|
115 | #endif // P2S_HPP |
---|
116 | |
---|