source: trunk/lib_c/p2s.h @ 3391

Last change on this file since 3391 was 3391, checked in by linmengl, 6 years ago

check in IDISA C library and other support libraries. Some template features still remain.

File size: 4.6 KB
Line 
1/* Generated by cpp2c.rb from ./p2s.hpp
2 * Use IDISA C support
3*/
4
/*  p2s - Parallel to Serial Bit Stream Transposition
    Copyright (c) 2007, 2008, 2010, Robert D. Cameron.
    Licensed to the public under the Open Software License 3.0.
    Licensed to International Characters Inc.
       under the Academic Free License version 3.0.

*/
12
13#ifndef P2S_H
14#define P2S_H
15
16#include "idisa128_c.h"
17
/* BytePack is a textual alias for BitBlock (same underlying type). */
#define BytePack BitBlock
19
/* Given 8 parallel bitstream blocks p0, p1, ..., p7, inverse transpose
   the data into a block of bytes in 8 consecutive registers s0, s1, ..., s7.

   The following header shows the intent, although a macro is used for
   speed.
static inline void p2s(BitBlock p0, BitBlock p1, BitBlock p2, BitBlock p3,
                       BitBlock p4, BitBlock p5, BitBlock p6, BitBlock p7,
                       BytePack& s0, BytePack& s1, BytePack& s2, BytePack& s3,
                       BytePack& s4, BytePack& s5, BytePack& s6, BytePack& s7);

*/
33
/* Algorithm selection.
   Defining USE_P2S_IDEAL before this point sets P2S_ALGORITHM to p2s_ideal.
   If P2S_ALGORITHM is still undefined after that check, it falls back to
   p2s_bytemerge. */
#if defined(USE_P2S_IDEAL)
#define P2S_ALGORITHM p2s_ideal
#endif

#if !defined(P2S_ALGORITHM)
#define P2S_ALGORITHM p2s_bytemerge
#endif
42
/*  p2s_ideal: parallel to serial transposition for an architecture with
    native esimd merge{h,l} operations at field widths 1, 2 and 4.

    Eight parallel bit blocks p0..p7 are turned into eight serial byte
    packs s0..s7 using 24 merge operations, in three stages of eight:
      stage 1  esimd_merge{h,l}_1 on (p0,p1), (p2,p3), (p4,p5), (p6,p7)
      stage 2  esimd_merge{h,l}_2 on the stage-1 results
      stage 3  esimd_merge{h,l}_4 on the stage-2 results, written to s0..s7

    s0..s7 must be assignable BytePack/BitBlock lvalues.  Every p operand
    is read in stage 1, before any s operand is written.
*/
#define p2s_ideal(p0,p1,p2,p3,p4,p5,p6,p7,s0,s1,s2,s3,s4,s5,s6,s7)  \
  do { \
        /* Stage 1: merge adjacent bit streams at field width 1. */ \
        BitBlock pair01_h = esimd_mergeh_1(p0, p1); \
        BitBlock pair01_l = esimd_mergel_1(p0, p1); \
        BitBlock pair23_h = esimd_mergeh_1(p2, p3); \
        BitBlock pair23_l = esimd_mergel_1(p2, p3); \
        BitBlock pair45_h = esimd_mergeh_1(p4, p5); \
        BitBlock pair45_l = esimd_mergel_1(p4, p5); \
        BitBlock pair67_h = esimd_mergeh_1(p6, p7); \
        BitBlock pair67_l = esimd_mergel_1(p6, p7); \
        /* Stage 2: merge the pairs at field width 2. */ \
        BitBlock quad0123_a = esimd_mergeh_2(pair01_h, pair23_h); \
        BitBlock quad0123_b = esimd_mergel_2(pair01_h, pair23_h); \
        BitBlock quad0123_c = esimd_mergeh_2(pair01_l, pair23_l); \
        BitBlock quad0123_d = esimd_mergel_2(pair01_l, pair23_l); \
        BitBlock quad4567_a = esimd_mergeh_2(pair45_h, pair67_h); \
        BitBlock quad4567_b = esimd_mergel_2(pair45_h, pair67_h); \
        BitBlock quad4567_c = esimd_mergeh_2(pair45_l, pair67_l); \
        BitBlock quad4567_d = esimd_mergel_2(pair45_l, pair67_l); \
        /* Stage 3: merge the quads at field width 4 into the outputs. */ \
        s0 = esimd_mergeh_4(quad0123_a, quad4567_a); \
        s1 = esimd_mergel_4(quad0123_a, quad4567_a); \
        s2 = esimd_mergeh_4(quad0123_b, quad4567_b); \
        s3 = esimd_mergel_4(quad0123_b, quad4567_b); \
        s4 = esimd_mergeh_4(quad0123_c, quad4567_c); \
        s5 = esimd_mergel_4(quad0123_c, quad4567_c); \
        s6 = esimd_mergeh_4(quad0123_d, quad4567_d); \
        s7 = esimd_mergel_4(quad0123_d, quad4567_d); \
  } while(0)
80
81/*  p2s_bytemerge is a fast parallel to serial transposition
82    algorithm given an architecture with esimd<8>::merge{h,l},
83    but not at small field widths.
84    MMX, SSE, Altivec ...
85*/
86 
/*  p2s_step: one merge stage of the byte-merge transposition.

    Inputs
      p0, p1    BitBlock values to combine.
      hi_mask   BitBlock passed as the first (selector) argument of
                simd_ifh_1; callers in this file pass simd_himask_N().
      shift     shift amount, substituted textually as the first argument
                of simd_srli_16 / simd_slli_16.
    Outputs (assigned in place; must be BitBlock lvalues)
      s0        esimd_mergeh_8 of the two intermediate blocks
      s1        esimd_mergel_8 of the two intermediate blocks

    Hygiene: p0, p1 and hi_mask are each evaluated exactly once and copied
    into p2s_step_-prefixed temporaries before any output is written.
    This keeps the macro correct when the caller's operands happen to be
    named t0/t1 (the former temporary names) or when an output aliases an
    input.
*/
#define p2s_step(p0,p1,hi_mask,shift,s0,s1)  \
  do { \
        const BitBlock p2s_step_in0_ = (p0); \
        const BitBlock p2s_step_in1_ = (p1); \
        const BitBlock p2s_step_mask_ = (hi_mask); \
        const BitBlock p2s_step_t0_ = \
            simd_ifh_1(p2s_step_mask_, p2s_step_in0_, \
                       simd_srli_16(shift, p2s_step_in1_)); \
        const BitBlock p2s_step_t1_ = \
            simd_ifh_1(p2s_step_mask_, \
                       simd_slli_16(shift, p2s_step_in0_), p2s_step_in1_); \
        (s0) = esimd_mergeh_8(p2s_step_t0_, p2s_step_t1_); \
        (s1) = esimd_mergel_8(p2s_step_t0_, p2s_step_t1_); \
  } while(0)
95
/*  p2s_bytemerge: transpose parallel bit blocks p0..p7 into byte packs
    s0..s7 using twelve p2s_step invocations in three stages of four:
      stage 1  pairs (p0,p4), (p1,p5), (p2,p6), (p3,p7)
               with simd_himask_8() and shift 4
      stage 2  combines the stage-1 results with simd_himask_4() and shift 2
      stage 3  combines the stage-2 results with simd_himask_2() and shift 1,
               writing s0..s7

    s0..s7 must be assignable BytePack/BitBlock lvalues.  Every p operand
    is read in stage 1, before any s operand is written.
*/
#define p2s_bytemerge(p0,p1,p2,p3,p4,p5,p6,p7,s0,s1,s2,s3,s4,s5,s6,s7)  \
  do { \
        BitBlock mix04_a, mix04_b, mix15_a, mix15_b; \
        BitBlock mix26_a, mix26_b, mix37_a, mix37_b; \
        BitBlock even_a, even_b, even_c, even_d; \
        BitBlock odd_a, odd_b, odd_c, odd_d; \
        /* Stage 1 */ \
        p2s_step(p0, p4, simd_himask_8(), 4, mix04_a, mix04_b); \
        p2s_step(p1, p5, simd_himask_8(), 4, mix15_a, mix15_b); \
        p2s_step(p2, p6, simd_himask_8(), 4, mix26_a, mix26_b); \
        p2s_step(p3, p7, simd_himask_8(), 4, mix37_a, mix37_b); \
        /* Stage 2 */ \
        p2s_step(mix04_a, mix26_a, simd_himask_4(), 2, even_a, even_b); \
        p2s_step(mix04_b, mix26_b, simd_himask_4(), 2, even_c, even_d); \
        p2s_step(mix15_a, mix37_a, simd_himask_4(), 2, odd_a, odd_b); \
        p2s_step(mix15_b, mix37_b, simd_himask_4(), 2, odd_c, odd_d); \
        /* Stage 3 */ \
        p2s_step(even_a, odd_a, simd_himask_2(), 1, s0, s1); \
        p2s_step(even_b, odd_b, simd_himask_2(), 1, s2, s3); \
        p2s_step(even_c, odd_c, simd_himask_2(), 1, s4, s5); \
        p2s_step(even_d, odd_d, simd_himask_2(), 1, s6, s7); \
  } while(0)
115
/*  p2s: public entry point.  Expands to the macro named by P2S_ALGORITHM,
    passing p0..p7 unchanged and the eight output operands in reverse
    order, so the algorithm's first output slot is bound to s7 and its
    last to s0.
    NOTE(review): the reversal looks like a byte-order adjustment; confirm
    against the matching s2p header before changing it.
*/
#define p2s(p0, p1, p2, p3, p4, p5, p6, p7, s0, s1, s2, s3, s4, s5, s6, s7) \
  P2S_ALGORITHM(p0, p1, p2, p3, p4, p5, p6, p7, \
                s7, s6, s5, s4, s3, s2, s1, s0)
118
119#endif // P2S_H
120
121
Note: See TracBrowser for help on using the repository browser.