[2719] | 1 | /* p2s - Parallel to Serial Bit Stream Transposition |
---|
[1589] | 2 | Copyright (c) 2007, 2008, 2010, Robert D. Cameron. |
---|
| 3 | Licensed to the public under the Open Software License 3.0. |
---|
[1753] | 4 | Licensed to International Characters Inc. |
---|
| 5 | under the Academic Free License version 3.0. |
---|
| 6 | |
---|
[1589] | 7 | */ |
---|
| 8 | |
---|
[1760] | 9 | #ifndef P2S_HPP |
---|
| 10 | #define P2S_HPP |
---|
[1589] | 11 | |
---|
[1753] | 12 | #include "idisa128.hpp" |
---|
[1589] | 13 | |
---|
| 14 | #define BytePack BitBlock |
---|
| 15 | |
---|
| 16 | /* |
---|
| 17 | Given 8 parallel bitstream blocks p0, p1, ..., p7, inverse transpose |
---|
| 18 | the data into a block of bytes in 8 consecutive registers s0, s1, ..., s7. |
---|
| 19 | |
---|
| 20 | The following header shows the intent, although a macro is used for |
---|
| 21 | speed. |
---|
| 22 | static inline void p2s(BitBlock p0, BitBlock p1, BitBlock p2, BitBlock p3, |
---|
| 23 | BitBlock p4, BitBlock p5, BitBlock p6, BitBlock p7, |
---|
| 24 | BytePack& s0, BytePack& s1, BytePack& s2, BytePack& s3, |
---|
| 25 | BytePack& s4, BytePack& s5, BytePack& s6, BytePack& s7 |
---|
| 26 | ); |
---|
| 27 | |
---|
| 28 | */ |
---|
| 29 | |
---|
/* Pick the transposition macro that p2s expands to.
   Defining USE_P2S_IDEAL selects p2s_ideal.  Otherwise a P2S_ALGORITHM
   already supplied by the includer is left alone, and p2s_bytemerge is
   used when nothing was supplied. */
#if defined(USE_P2S_IDEAL)
#define P2S_ALGORITHM p2s_ideal
#elif !defined(P2S_ALGORITHM)
#define P2S_ALGORITHM p2s_bytemerge
#endif
---|
| 38 | |
---|
/* p2s_ideal: parallel-to-serial transposition expressed purely as merges.
   It is the ideal algorithm for an architecture with native support for
   esimd<{4,2,1}>::merge{h,l}: the 8 parallel bit blocks p0..p7 are turned
   into the 8 serial byte packs s0..s7 with three rounds of 8 merges each,
   i.e. only 24 merge operations.
   Every intermediate value is computed before any of s0..s7 is assigned. */
#define p2s_ideal(p0,p1,p2,p3,p4,p5,p6,p7,s0,s1,s2,s3,s4,s5,s6,s7) \
  do { \
    /* Round 1: 1-bit merges of neighbouring input streams. */ \
    BitBlock duo01_mh = esimd<1>::mergeh(p0,p1); \
    BitBlock duo01_ml = esimd<1>::mergel(p0,p1); \
    BitBlock duo23_mh = esimd<1>::mergeh(p2,p3); \
    BitBlock duo23_ml = esimd<1>::mergel(p2,p3); \
    BitBlock duo45_mh = esimd<1>::mergeh(p4,p5); \
    BitBlock duo45_ml = esimd<1>::mergel(p4,p5); \
    BitBlock duo67_mh = esimd<1>::mergeh(p6,p7); \
    BitBlock duo67_ml = esimd<1>::mergel(p6,p7); \
    /* Round 2: 2-bit merges of the round-1 results. */ \
    BitBlock quad0123_a = esimd<2>::mergeh(duo01_mh,duo23_mh); \
    BitBlock quad0123_b = esimd<2>::mergel(duo01_mh,duo23_mh); \
    BitBlock quad0123_c = esimd<2>::mergeh(duo01_ml,duo23_ml); \
    BitBlock quad0123_d = esimd<2>::mergel(duo01_ml,duo23_ml); \
    BitBlock quad4567_a = esimd<2>::mergeh(duo45_mh,duo67_mh); \
    BitBlock quad4567_b = esimd<2>::mergel(duo45_mh,duo67_mh); \
    BitBlock quad4567_c = esimd<2>::mergeh(duo45_ml,duo67_ml); \
    BitBlock quad4567_d = esimd<2>::mergel(duo45_ml,duo67_ml); \
    /* Round 3: 4-bit merges produce the eight outputs. */ \
    s0 = esimd<4>::mergeh(quad0123_a,quad4567_a); \
    s1 = esimd<4>::mergel(quad0123_a,quad4567_a); \
    s2 = esimd<4>::mergeh(quad0123_b,quad4567_b); \
    s3 = esimd<4>::mergel(quad0123_b,quad4567_b); \
    s4 = esimd<4>::mergeh(quad0123_c,quad4567_c); \
    s5 = esimd<4>::mergel(quad0123_c,quad4567_c); \
    s6 = esimd<4>::mergeh(quad0123_d,quad4567_d); \
    s7 = esimd<4>::mergel(quad0123_d,quad4567_d); \
  } while(0)
---|
| 76 | |
---|
| 77 | /* p2s_bytemerge is a fast parallel to serial transposition |
---|
| 78 | algorithm given an architecture with esimd<8>::merge{h,l}, |
---|
| 79 | but not at small field widths. |
---|
| 80 | MMX, SSE, Altivec ... |
---|
| 81 | */ |
---|
| 82 | |
---|
/* p2s_step: one stage of the byte-merge transposition.
   Builds two blends with simd<1>::ifh under hi_mask:
     - p0 combined with p1 shifted right by `shift` (simd<16>::srli),
     - p0 shifted left by `shift` (simd<16>::slli) combined with p1,
   then stores esimd<8>::mergeh of the two blends in s0 and
   esimd<8>::mergel of them in s1.

   p0, p1      input blocks; each is expanded twice, so pass plain values
               without side effects.
   hi_mask     selection mask handed to simd<1>::ifh; also expanded twice.
   shift       shift amount; it is used as a template argument and must
               therefore be a compile-time constant.
   s0, s1      assignable outputs, written only after both blends exist.

   The temporaries carry a p2s_step_ prefix so that an argument spelled
   t0 or t1 (the previous temporary names) is no longer captured by the
   macro's own locals. */
#define p2s_step(p0,p1,hi_mask,shift,s0,s1) \
  do { \
    BitBlock p2s_step_blend_a_, p2s_step_blend_b_; \
    p2s_step_blend_a_ = simd<1>::ifh(hi_mask,p0,simd<16>::srli<shift>(p1)); \
    p2s_step_blend_b_ = simd<1>::ifh(hi_mask,simd<16>::slli<shift>(p0),p1); \
    s0 = esimd<8>::mergeh(p2s_step_blend_a_,p2s_step_blend_b_); \
    s1 = esimd<8>::mergel(p2s_step_blend_a_,p2s_step_blend_b_); \
  } while(0)
---|
| 91 | |
---|
/* p2s_bytemerge: parallel-to-serial transposition of p0..p7 into s0..s7
   built from twelve p2s_step invocations in three stages:
     stage 1 pairs pN with pN+4 (shift 4, simd<8>::himask()),
     stage 2 pairs the stage-1 results (shift 2, simd<4>::himask()),
     stage 3 pairs the stage-2 results (shift 1, simd<2>::himask())
             and writes the outputs.
   All inputs are consumed in stage 1, before any output is written. */
#define p2s_bytemerge(p0,p1,p2,p3,p4,p5,p6,p7,s0,s1,s2,s3,s4,s5,s6,s7) \
  do { \
    BitBlock quad04_mh, quad04_ml; \
    BitBlock quad15_mh, quad15_ml; \
    BitBlock quad26_mh, quad26_ml; \
    BitBlock quad37_mh, quad37_ml; \
    BitBlock pair0246_0, pair0246_1, pair0246_2, pair0246_3; \
    BitBlock pair1357_0, pair1357_1, pair1357_2, pair1357_3; \
    /* Stage 1 */ \
    p2s_step(p0,p4,simd<8>::himask(),4,quad04_mh,quad04_ml); \
    p2s_step(p1,p5,simd<8>::himask(),4,quad15_mh,quad15_ml); \
    p2s_step(p2,p6,simd<8>::himask(),4,quad26_mh,quad26_ml); \
    p2s_step(p3,p7,simd<8>::himask(),4,quad37_mh,quad37_ml); \
    /* Stage 2 */ \
    p2s_step(quad04_mh,quad26_mh,simd<4>::himask(),2,pair0246_0,pair0246_1); \
    p2s_step(quad15_mh,quad37_mh,simd<4>::himask(),2,pair1357_0,pair1357_1); \
    p2s_step(quad04_ml,quad26_ml,simd<4>::himask(),2,pair0246_2,pair0246_3); \
    p2s_step(quad15_ml,quad37_ml,simd<4>::himask(),2,pair1357_2,pair1357_3); \
    /* Stage 3 */ \
    p2s_step(pair0246_0,pair1357_0,simd<2>::himask(),1,s0,s1); \
    p2s_step(pair0246_1,pair1357_1,simd<2>::himask(),1,s2,s3); \
    p2s_step(pair0246_2,pair1357_2,simd<2>::himask(),1,s4,s5); \
    p2s_step(pair0246_3,pair1357_3,simd<2>::himask(),1,s6,s7); \
  } while(0)
---|
| 111 | |
---|
/* p2s: entry point.  Expands to the macro named by P2S_ALGORITHM, passing
   the eight inputs through unchanged and the eight outputs in reverse
   order, so the algorithm's first output lands in the caller's last
   output argument and vice versa.
   NOTE(review): the reversal presumably compensates for byte/lane order
   of the target -- confirm before changing it. */
#define p2s(in0, in1, in2, in3, in4, in5, in6, in7, \
            out0, out1, out2, out3, out4, out5, out6, out7) \
  P2S_ALGORITHM(in0, in1, in2, in3, in4, in5, in6, in7, \
                out7, out6, out5, out4, out3, out2, out1, out0)
---|
| 114 | |
---|
[1760] | 115 | #endif // P2S_HPP |
---|
[1589] | 116 | |
---|