source: trunk/lib/s2p.h @ 619

Last change on this file since 619 was 619, checked in by cameron, 9 years ago

Extra } deleted

File size: 4.6 KB
Line 
1/*  s2p - Serial to Parallel Bit Stream Transposition
2    Copyright (c) 2007, 2008, 2010, Robert D. Cameron.
3    Licensed to the public under the Open Software License 3.0.
4*/
5
6#ifndef S2P_H
7#define S2P_H
8
/* BitBlock and BytePack are two names for the same SIMD_type register type.
   In this header BytePack labels the serial byte inputs of s2p, while
   BitBlock labels the transposed bit stream outputs and the temporaries
   declared inside the macros.  They are object-like macros, so SIMD_type
   (not defined in the visible part of this header) only has to be known at
   the point where the names are actually used. */
#define BitBlock SIMD_type
#define BytePack SIMD_type
11
/* Given a block of bytes in 8 consecutive registers s0, s1, ..., s7,
   s2p transposes the block into 8 parallel bitstream blocks p0, p1, ..., p7.

   The following header shows the intent, although a macro is used for
   speed.
static inline void s2p(BytePack s0, BytePack s1, BytePack s2, BytePack s3,
                       BytePack s4, BytePack s5, BytePack s6, BytePack s7,
                       BitBlock& p0, BitBlock& p1, BitBlock& p2, BitBlock& p3,
                       BitBlock& p4, BitBlock& p5, BitBlock& p6, BitBlock& p7);
*/
22
/* Algorithm selection.

   S2P_ALGORITHM names the transposition macro to use:
     USE_S2P_IDEAL     -> s2p_ideal
     USE_S2P_MOVEMASK  -> s2p_movemask  (not yet implemented)
     neither           -> s2p_bytepack, unless S2P_ALGORITHM is already
                          defined when this point is reached

   The two USE_S2P_* switches are mutually exclusive.  Requesting both is
   rejected here rather than redefining S2P_ALGORITHM with a conflicting
   replacement. */
#if defined(USE_S2P_IDEAL) && defined(USE_S2P_MOVEMASK)
#error "USE_S2P_IDEAL and USE_S2P_MOVEMASK are mutually exclusive"
#elif defined(USE_S2P_IDEAL)
#define S2P_ALGORITHM s2p_ideal
#elif defined(USE_S2P_MOVEMASK)
#define S2P_ALGORITHM s2p_movemask  /* Not yet implemented. */
#endif

#ifndef S2P_ALGORITHM
#define S2P_ALGORITHM s2p_bytepack
#endif
35
36
37
/*  s2p_ideal: serial to parallel transposition for an architecture with
    native simd_pack_{8,4,2}_{hh,ll} operations.

    s0..s7  - the 8 serial BytePack inputs
    p0..p7  - lvalues that receive the 8 parallel BitBlock outputs

    Three rounds of 8 pack operations each (field widths 8, 4, then 2)
    give 24 pack operations in total.  Every input is read during the
    first round and the outputs are assigned only in the last round.
    Each input argument is expanded twice, so arguments should be free of
    side effects.  Temporaries are named after the bit positions they
    carry (e.g. b0123 holds bits 0-3).
*/

#define s2p_ideal(s0, s1, s2, s3, s4, s5, s6, s7, p0, p1, p2, p3, p4, p5, p6, p7) \
  do {\
        /* Round 1: field width 8, hh results first, then ll results. */\
        BitBlock s2p_b0123_a = simd_pack_8_hh(s0, s1);\
        BitBlock s2p_b0123_b = simd_pack_8_hh(s2, s3);\
        BitBlock s2p_b0123_c = simd_pack_8_hh(s4, s5);\
        BitBlock s2p_b0123_d = simd_pack_8_hh(s6, s7);\
        BitBlock s2p_b4567_a = simd_pack_8_ll(s0, s1);\
        BitBlock s2p_b4567_b = simd_pack_8_ll(s2, s3);\
        BitBlock s2p_b4567_c = simd_pack_8_ll(s4, s5);\
        BitBlock s2p_b4567_d = simd_pack_8_ll(s6, s7);\
        /* Round 2: field width 4. */\
        BitBlock s2p_b01_x = simd_pack_4_hh(s2p_b0123_a, s2p_b0123_b);\
        BitBlock s2p_b01_y = simd_pack_4_hh(s2p_b0123_c, s2p_b0123_d);\
        BitBlock s2p_b23_x = simd_pack_4_ll(s2p_b0123_a, s2p_b0123_b);\
        BitBlock s2p_b23_y = simd_pack_4_ll(s2p_b0123_c, s2p_b0123_d);\
        BitBlock s2p_b45_x = simd_pack_4_hh(s2p_b4567_a, s2p_b4567_b);\
        BitBlock s2p_b45_y = simd_pack_4_hh(s2p_b4567_c, s2p_b4567_d);\
        BitBlock s2p_b67_x = simd_pack_4_ll(s2p_b4567_a, s2p_b4567_b);\
        BitBlock s2p_b67_y = simd_pack_4_ll(s2p_b4567_c, s2p_b4567_d);\
        /* Round 3: field width 2, writing the outputs. */\
        p0 = simd_pack_2_hh(s2p_b01_x, s2p_b01_y);\
        p1 = simd_pack_2_ll(s2p_b01_x, s2p_b01_y);\
        p2 = simd_pack_2_hh(s2p_b23_x, s2p_b23_y);\
        p3 = simd_pack_2_ll(s2p_b23_x, s2p_b23_y);\
        p4 = simd_pack_2_hh(s2p_b45_x, s2p_b45_y);\
        p5 = simd_pack_2_ll(s2p_b45_x, s2p_b45_y);\
        p6 = simd_pack_2_hh(s2p_b67_x, s2p_b67_y);\
        p7 = simd_pack_2_ll(s2p_b67_x, s2p_b67_y);\
  } while(0)
75
76
/*  s2p_bytepack is a fast serial to parallel transposition
    algorithm for architectures that provide simd_pack_16 operations
    but no pack operations at smaller field widths
    (e.g., MMX, SSE, Altivec).
*/
82
/*  s2p_step: one stage of the simd_pack_16 based transposition.

    s0, s1   - input registers (each is expanded twice, so they should be
               free of side effects)
    hi_mask  - selector handed to simd_if as its first argument
    shift    - bit count handed to simd_srli_16 / simd_slli_16
    p0, p1   - lvalues that receive the two results

    Both inputs are read into temporaries before either output is
    assigned.  The temporaries carry an s2p_step_ prefix so that arguments
    spelled t0 or t1 (common names for scratch registers) are not captured
    by the macro's own locals; with such a capture an output would be
    written to the local instead of the caller's variable.
*/
#define s2p_step(s0, s1, hi_mask, shift, p0, p1)  \
  do {\
        BitBlock s2p_step_hh, s2p_step_ll;\
        s2p_step_hh = simd_pack_16_hh(s0, s1);\
        s2p_step_ll = simd_pack_16_ll(s0, s1);\
        p0 = simd_if(hi_mask, s2p_step_hh, simd_srli_16(s2p_step_ll, shift));\
        p1 = simd_if(hi_mask, simd_slli_16(s2p_step_hh, shift), s2p_step_ll);\
  } while(0)
91
/*  s2p_bytepack: transposition built from 12 s2p_step invocations in
    three stages of four.

    s0..s7  - the 8 serial BytePack inputs
    p0..p7  - lvalues that receive the 8 parallel BitBlock outputs

    Stage 1 uses simd_himask_2 with shift 1, stage 2 simd_himask_4 with
    shift 2, and stage 3 simd_himask_8 with shift 4.  The inputs are
    consumed in stage 1 and the outputs are written only in stage 3.
    Temporaries are named after the bit positions they carry (e.g. b04
    holds bits 0 and 4).
*/
#define s2p_bytepack(s0, s1, s2, s3, s4, s5, s6, s7, p0, p1, p2, p3, p4, p5, p6, p7) \
  do {\
        /* Stage 1: even-numbered bits (0,2,4,6) vs odd-numbered bits (1,3,5,7). */\
        BitBlock s2p_even_0, s2p_even_1, s2p_even_2, s2p_even_3;\
        BitBlock s2p_odd_0, s2p_odd_1, s2p_odd_2, s2p_odd_3;\
        s2p_step(s0, s1, simd_himask_2, 1, s2p_even_0, s2p_odd_0);\
        s2p_step(s2, s3, simd_himask_2, 1, s2p_even_1, s2p_odd_1);\
        s2p_step(s4, s5, simd_himask_2, 1, s2p_even_2, s2p_odd_2);\
        s2p_step(s6, s7, simd_himask_2, 1, s2p_even_3, s2p_odd_3);\
        /* Stage 2: bit pairs {0,4} / {2,6} and {1,5} / {3,7}. */\
        BitBlock s2p_b04_0, s2p_b04_1, s2p_b26_0, s2p_b26_1;\
        BitBlock s2p_b15_0, s2p_b15_1, s2p_b37_0, s2p_b37_1;\
        s2p_step(s2p_even_0, s2p_even_1, simd_himask_4, 2, s2p_b04_0, s2p_b26_0);\
        s2p_step(s2p_even_2, s2p_even_3, simd_himask_4, 2, s2p_b04_1, s2p_b26_1);\
        s2p_step(s2p_odd_0, s2p_odd_1, simd_himask_4, 2, s2p_b15_0, s2p_b37_0);\
        s2p_step(s2p_odd_2, s2p_odd_3, simd_himask_4, 2, s2p_b15_1, s2p_b37_1);\
        /* Stage 3: single bit streams, written to the outputs. */\
        s2p_step(s2p_b04_0, s2p_b04_1, simd_himask_8, 4, p0, p4);\
        s2p_step(s2p_b15_0, s2p_b15_1, simd_himask_8, 4, p1, p5);\
        s2p_step(s2p_b26_0, s2p_b26_1, simd_himask_8, 4, p2, p6);\
        s2p_step(s2p_b37_0, s2p_b37_1, simd_himask_8, 4, p3, p7);\
  } while(0)
111
/*  s2p(s0..s7, p0..p7) forwards to S2P_ALGORITHM.

    On big-endian targets the inputs are passed through in order; on
    little-endian targets s0..s7 are passed in reverse order (s7 first).
    The outputs p0..p7 keep their order in both cases.

    Byte order is taken from BYTE_ORDER / BIG_ENDIAN / LITTLE_ENDIAN when
    all three are defined, otherwise from the compiler's predefined
    __BYTE_ORDER__ macros.  The defined() checks matter: an undefined name
    in an #if expression evaluates to 0, so comparing undefined macros with
    == would make both byte orders appear to match and define s2p twice.
    If the byte order cannot be identified as big or little, compilation
    stops instead of leaving s2p undefined or wrongly ordered.
*/
#if defined(BYTE_ORDER) && defined(BIG_ENDIAN) && defined(LITTLE_ENDIAN)
#  if (BYTE_ORDER == BIG_ENDIAN)
#    define S2P_TARGET_IS_BIG_ENDIAN 1
#  elif (BYTE_ORDER == LITTLE_ENDIAN)
#    define S2P_TARGET_IS_BIG_ENDIAN 0
#  endif
#elif defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && defined(__ORDER_LITTLE_ENDIAN__)
#  if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
#    define S2P_TARGET_IS_BIG_ENDIAN 1
#  elif (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
#    define S2P_TARGET_IS_BIG_ENDIAN 0
#  endif
#endif

#if !defined(S2P_TARGET_IS_BIG_ENDIAN)
#error "s2p.h: cannot determine byte order; define BYTE_ORDER, BIG_ENDIAN and LITTLE_ENDIAN"
#elif S2P_TARGET_IS_BIG_ENDIAN
#define s2p(s0, s1, s2, s3, s4, s5, s6, s7, p0, p1, p2, p3, p4, p5, p6, p7)\
  S2P_ALGORITHM(s0, s1, s2, s3, s4, s5, s6, s7, p0, p1, p2, p3, p4, p5, p6, p7)
#else
#define s2p(s0, s1, s2, s3, s4, s5, s6, s7, p0, p1, p2, p3, p4, p5, p6, p7)\
  S2P_ALGORITHM(s7, s6, s5, s4, s3, s2, s1, s0, p0, p1, p2, p3, p4, p5, p6, p7)
#endif
#undef S2P_TARGET_IS_BIG_ENDIAN
120
121#endif
122
Note: See TracBrowser for help on using the repository browser.