source: trunk/lib_ir/p2s.ll @ 4067

Last change on this file since 4067 was 4051, checked in by linmengl, 5 years ago

Add a pure IR version of p2s; it now gets the same performance on SSE2.

File size: 6.2 KB
Line 
; ---------------------------------------------------------------------------
; External helpers. Only the signatures are visible in this file; the bodies
; have to be provided by whatever module this one is linked against.
; ---------------------------------------------------------------------------

; Three-operand helper; @p2s_step_ir always passes a mask as the first operand.
; NOTE(review): presumably a bitwise select between %b and %c under %cond -
; confirm against the definition.
declare <4 x i32> @ifh_1(<4 x i32> %cond, <4 x i32> %b, <4 x i32> %c)

; Shift helpers taking the value and a vector shift operand.
; NOTE(review): names suggest logical right/left shifts within 16-bit fields -
; confirm against the definitions.
declare <4 x i32> @srli_16(<4 x i32> %a, <8 x i16> %shift_mask)
declare <4 x i32> @slli_16(<4 x i32> %a, <8 x i16> %shift_mask)

; Zero-argument producers of the <8 x i16> value passed as %shift_mask.
declare <8 x i16> @const16_1()
declare <8 x i16> @const16_2()
declare <8 x i16> @const16_4()

; Zero-argument producers of the <4 x i32> value passed as %hi_mask.
declare <4 x i32> @himask_2()
declare <4 x i32> @himask_4()
declare <4 x i32> @himask_8()
11
; mergeh_8(a, b): byte-interleave the upper halves of the two operands.
;
; Both operands are viewed as 16 x i8. Shuffle indices 0..15 address the first
; shuffle operand (%b) and 16..31 the second (%a), so the result bytes are
;   b[8], a[8], b[9], a[9], ..., b[15], a[15]
; which is then handed back as <4 x i32>.
define <4 x i32> @mergeh_8(<4 x i32> %a, <4 x i32> %b) alwaysinline {
entry:
  %a.bytes = bitcast <4 x i32> %a to <16 x i8>
  %b.bytes = bitcast <4 x i32> %b to <16 x i8>
  %hi.zip = shufflevector <16 x i8> %b.bytes, <16 x i8> %a.bytes, <16 x i32> <
      i32 8,  i32 24,
      i32 9,  i32 25,
      i32 10, i32 26,
      i32 11, i32 27,
      i32 12, i32 28,
      i32 13, i32 29,
      i32 14, i32 30,
      i32 15, i32 31>
  %hi.words = bitcast <16 x i8> %hi.zip to <4 x i32>
  ret <4 x i32> %hi.words
}
21
; mergel_8(a, b): byte-interleave the lower halves of the two operands.
;
; Both operands are viewed as 16 x i8. Shuffle indices 0..15 address the first
; shuffle operand (%b) and 16..31 the second (%a), so the result bytes are
;   b[0], a[0], b[1], a[1], ..., b[7], a[7]
; which is then handed back as <4 x i32>.
define <4 x i32> @mergel_8(<4 x i32> %a, <4 x i32> %b) alwaysinline {
entry:
  %a.bytes = bitcast <4 x i32> %a to <16 x i8>
  %b.bytes = bitcast <4 x i32> %b to <16 x i8>
  %lo.zip = shufflevector <16 x i8> %b.bytes, <16 x i8> %a.bytes, <16 x i32> <
      i32 0, i32 16,
      i32 1, i32 17,
      i32 2, i32 18,
      i32 3, i32 19,
      i32 4, i32 20,
      i32 5, i32 21,
      i32 6, i32 22,
      i32 7, i32 23>
  %lo.words = bitcast <16 x i8> %lo.zip to <4 x i32>
  ret <4 x i32> %lo.words
}
31
; p2s_step_ir: one combine-and-interleave step.
;
;   %p0, %p1      the two input vectors
;   %hi_mask      passed as the first operand of both @ifh_1 calls
;   %shift_mask   passed as the shift operand of @srli_16 / @slli_16
;   %s0           receives @mergeh_8 of the two combined vectors
;   %s1           receives @mergel_8 of the two combined vectors
;
; The helper calls and the two stores are issued in a fixed order: the helpers
; are opaque from this file, and nothing here rules out %s0 aliasing %s1.
define void @p2s_step_ir(<4 x i32> %p0, <4 x i32> %p1, <4 x i32> %hi_mask, <8 x i16> %shift_mask, <4 x i32>* %s0, <4 x i32>* %s1) alwaysinline {
entry:
  ; First combined vector: %p0 together with the right-shifted %p1.
  %p1.shr = call <4 x i32> @srli_16(<4 x i32> %p1, <8 x i16> %shift_mask)
  %mix.first = call <4 x i32> @ifh_1(<4 x i32> %hi_mask, <4 x i32> %p0, <4 x i32> %p1.shr)

  ; Second combined vector: the left-shifted %p0 together with %p1.
  %p0.shl = call <4 x i32> @slli_16(<4 x i32> %p0, <8 x i16> %shift_mask)
  %mix.second = call <4 x i32> @ifh_1(<4 x i32> %hi_mask, <4 x i32> %p0.shl, <4 x i32> %p1)

  ; Byte-interleave the pair: upper halves go to %s0, lower halves to %s1.
  %zip.hi = call <4 x i32> @mergeh_8(<4 x i32> %mix.first, <4 x i32> %mix.second)
  %zip.lo = call <4 x i32> @mergel_8(<4 x i32> %mix.first, <4 x i32> %mix.second)

  store <4 x i32> %zip.hi, <4 x i32>* %s0
  store <4 x i32> %zip.lo, <4 x i32>* %s1

  ret void
}
47
; p2s_bytemerge_ir: combine eight input vectors %p0..%p7 into eight output
; vectors written through %s0..%s7, using three rounds of @p2s_step_ir.
;
;   round 1: (p0,p4) (p1,p5) (p2,p6) (p3,p7)   with @himask_8 / @const16_4
;   round 2: pairs of round-1 results          with @himask_4 / @const16_2
;   round 3: pairs of round-2 results          with @himask_2 / @const16_1
;
; Intermediate results travel through 16-byte-aligned stack slots because
; @p2s_step_ir returns its two results through pointers. In the slot names,
; ".h" is the slot passed as the step's first output pointer (its @mergeh_8
; result) and ".l" the second (its @mergel_8 result).
define void @p2s_bytemerge_ir(<4 x i32> %p0, <4 x i32> %p1, <4 x i32> %p2, <4 x i32> %p3, <4 x i32> %p4, <4 x i32> %p5, <4 x i32> %p6, <4 x i32> %p7, <4 x i32>* %s0, <4 x i32>* %s1, <4 x i32>* %s2, <4 x i32>* %s3, <4 x i32>* %s4, <4 x i32>* %s5, <4 x i32>* %s6, <4 x i32>* %s7) {
entry:
  ; Round-1 result slots.
  %pair04.h = alloca <4 x i32>, align 16
  %pair26.h = alloca <4 x i32>, align 16
  %pair04.l = alloca <4 x i32>, align 16
  %pair26.l = alloca <4 x i32>, align 16
  %pair15.h = alloca <4 x i32>, align 16
  %pair37.h = alloca <4 x i32>, align 16
  %pair15.l = alloca <4 x i32>, align 16
  %pair37.l = alloca <4 x i32>, align 16
  ; Round-2 result slots.
  %quad0246.0 = alloca <4 x i32>, align 16
  %quad0246.1 = alloca <4 x i32>, align 16
  %quad0246.2 = alloca <4 x i32>, align 16
  %quad0246.3 = alloca <4 x i32>, align 16
  %quad1357.0 = alloca <4 x i32>, align 16
  %quad1357.1 = alloca <4 x i32>, align 16
  %quad1357.2 = alloca <4 x i32>, align 16
  %quad1357.3 = alloca <4 x i32>, align 16

  ; ---- Round 1: pair input k with input k+4 --------------------------------
  %mask8.a = call <4 x i32> @himask_8()
  %shift4.a = call <8 x i16> @const16_4()
  call void @p2s_step_ir(<4 x i32> %p0, <4 x i32> %p4, <4 x i32> %mask8.a, <8 x i16> %shift4.a, <4 x i32>* %pair04.h, <4 x i32>* %pair04.l)

  %mask8.b = call <4 x i32> @himask_8()
  %shift4.b = call <8 x i16> @const16_4()
  call void @p2s_step_ir(<4 x i32> %p1, <4 x i32> %p5, <4 x i32> %mask8.b, <8 x i16> %shift4.b, <4 x i32>* %pair15.h, <4 x i32>* %pair15.l)

  %mask8.c = call <4 x i32> @himask_8()
  %shift4.c = call <8 x i16> @const16_4()
  call void @p2s_step_ir(<4 x i32> %p2, <4 x i32> %p6, <4 x i32> %mask8.c, <8 x i16> %shift4.c, <4 x i32>* %pair26.h, <4 x i32>* %pair26.l)

  %mask8.d = call <4 x i32> @himask_8()
  %shift4.d = call <8 x i16> @const16_4()
  call void @p2s_step_ir(<4 x i32> %p3, <4 x i32> %p7, <4 x i32> %mask8.d, <8 x i16> %shift4.d, <4 x i32>* %pair37.h, <4 x i32>* %pair37.l)

  ; ---- Round 2: pair 04 with 26 and 15 with 37, ".h" slots first -----------
  %in04.h = load <4 x i32>* %pair04.h, align 16
  %in26.h = load <4 x i32>* %pair26.h, align 16
  %mask4.a = call <4 x i32> @himask_4()
  %shift2.a = call <8 x i16> @const16_2()
  call void @p2s_step_ir(<4 x i32> %in04.h, <4 x i32> %in26.h, <4 x i32> %mask4.a, <8 x i16> %shift2.a, <4 x i32>* %quad0246.0, <4 x i32>* %quad0246.1)

  %in15.h = load <4 x i32>* %pair15.h, align 16
  %in37.h = load <4 x i32>* %pair37.h, align 16
  %mask4.b = call <4 x i32> @himask_4()
  %shift2.b = call <8 x i16> @const16_2()
  call void @p2s_step_ir(<4 x i32> %in15.h, <4 x i32> %in37.h, <4 x i32> %mask4.b, <8 x i16> %shift2.b, <4 x i32>* %quad1357.0, <4 x i32>* %quad1357.1)

  %in04.l = load <4 x i32>* %pair04.l, align 16
  %in26.l = load <4 x i32>* %pair26.l, align 16
  %mask4.c = call <4 x i32> @himask_4()
  %shift2.c = call <8 x i16> @const16_2()
  call void @p2s_step_ir(<4 x i32> %in04.l, <4 x i32> %in26.l, <4 x i32> %mask4.c, <8 x i16> %shift2.c, <4 x i32>* %quad0246.2, <4 x i32>* %quad0246.3)

  %in15.l = load <4 x i32>* %pair15.l, align 16
  %in37.l = load <4 x i32>* %pair37.l, align 16
  %mask4.d = call <4 x i32> @himask_4()
  %shift2.d = call <8 x i16> @const16_2()
  call void @p2s_step_ir(<4 x i32> %in15.l, <4 x i32> %in37.l, <4 x i32> %mask4.d, <8 x i16> %shift2.d, <4 x i32>* %quad1357.2, <4 x i32>* %quad1357.3)

  ; ---- Round 3: pair 0246.k with 1357.k, writing to %s(2k) and %s(2k+1) ----
  %in0246.0 = load <4 x i32>* %quad0246.0, align 16
  %in1357.0 = load <4 x i32>* %quad1357.0, align 16
  %mask2.a = call <4 x i32> @himask_2()
  %shift1.a = call <8 x i16> @const16_1()
  call void @p2s_step_ir(<4 x i32> %in0246.0, <4 x i32> %in1357.0, <4 x i32> %mask2.a, <8 x i16> %shift1.a, <4 x i32>* %s0, <4 x i32>* %s1)

  %in0246.1 = load <4 x i32>* %quad0246.1, align 16
  %in1357.1 = load <4 x i32>* %quad1357.1, align 16
  %mask2.b = call <4 x i32> @himask_2()
  %shift1.b = call <8 x i16> @const16_1()
  call void @p2s_step_ir(<4 x i32> %in0246.1, <4 x i32> %in1357.1, <4 x i32> %mask2.b, <8 x i16> %shift1.b, <4 x i32>* %s2, <4 x i32>* %s3)

  %in0246.2 = load <4 x i32>* %quad0246.2, align 16
  %in1357.2 = load <4 x i32>* %quad1357.2, align 16
  %mask2.c = call <4 x i32> @himask_2()
  %shift1.c = call <8 x i16> @const16_1()
  call void @p2s_step_ir(<4 x i32> %in0246.2, <4 x i32> %in1357.2, <4 x i32> %mask2.c, <8 x i16> %shift1.c, <4 x i32>* %s4, <4 x i32>* %s5)

  %in0246.3 = load <4 x i32>* %quad0246.3, align 16
  %in1357.3 = load <4 x i32>* %quad1357.3, align 16
  %mask2.d = call <4 x i32> @himask_2()
  %shift1.d = call <8 x i16> @const16_1()
  call void @p2s_step_ir(<4 x i32> %in0246.3, <4 x i32> %in1357.3, <4 x i32> %mask2.d, <8 x i16> %shift1.d, <4 x i32>* %s6, <4 x i32>* %s7)

  ret void
}
Note: See TracBrowser for help on using the repository browser.