Changeset 3933 for trunk/lib_ir/s2p.ll


Ignore:
Timestamp:
Jul 28, 2014, 1:03:33 PM (5 years ago)
Author:
linmengl
Message:

Inline s2p_bytepack in the IR implementation; its performance now matches the SSE2 version.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/lib_ir/s2p.ll

    r3927 r3933  
    105105}
    106106
    107 
     ; s2p_bytepack_ir: runs three rounds of s2p_step_ir calls over eight
     ; <4 x i32> inputs.
     ;   %s0..%s7 - input vectors, consumed pairwise in round 1.
     ;   %p0..%p7 - output pointers, handed to s2p_step_ir in round 3.
     ; s2p_step_ir, himask_N and const16_N are defined outside this excerpt.
     ; NOTE(review): s2p_step_ir is presumed to store its two results through
     ; its last two pointer arguments, since the stack slots below are only
     ; loaded after being passed to it - confirm against its definition.
     107define void @s2p_bytepack_ir(<4 x i32> %s0, <4 x i32> %s1, <4 x i32> %s2, <4 x i32> %s3, <4 x i32> %s4, <4 x i32> %s5, <4 x i32> %s6, <4 x i32> %s7, <4 x i32>* %p0, <4 x i32>* %p1, <4 x i32>* %p2, <4 x i32>* %p3, <4 x i32>* %p4, <4 x i32>* %p5, <4 x i32>* %p6, <4 x i32>* %p7) {
     108entry:
     ; Stack slots passed as output pointers in round 1 (four per name).
     109  %bit00224466_0 = alloca <4 x i32>, align 16
     110  %bit00224466_1 = alloca <4 x i32>, align 16
     111  %bit00224466_2 = alloca <4 x i32>, align 16
     112  %bit00224466_3 = alloca <4 x i32>, align 16
     113  %bit11335577_0 = alloca <4 x i32>, align 16
     114  %bit11335577_1 = alloca <4 x i32>, align 16
     115  %bit11335577_2 = alloca <4 x i32>, align 16
     116  %bit11335577_3 = alloca <4 x i32>, align 16
     ; Stack slots passed as output pointers in round 2 (two per name).
     117  %bit00004444_0 = alloca <4 x i32>, align 16
     118  %bit22226666_0 = alloca <4 x i32>, align 16
     119  %bit00004444_1 = alloca <4 x i32>, align 16
     120  %bit22226666_1 = alloca <4 x i32>, align 16
     121  %bit11115555_0 = alloca <4 x i32>, align 16
     122  %bit33337777_0 = alloca <4 x i32>, align 16
     123  %bit11115555_1 = alloca <4 x i32>, align 16
     124  %bit33337777_1 = alloca <4 x i32>, align 16
     125
     ; Round 1: step over (s0,s1), (s2,s3), (s4,s5), (s6,s7) using
     ; himask_2 / const16_1; outputs go to %bit00224466_k / %bit11335577_k.
     ; The mask and constant are re-fetched by a fresh call before every step.
     126  %call10 = call <4 x i32> @himask_2()
     127  %call11 = call <8 x i16> @const16_1()
     128  call void @s2p_step_ir(<4 x i32> %s0, <4 x i32> %s1, <4 x i32> %call10, <8 x i16> %call11, <4 x i32>* %bit00224466_0, <4 x i32>* %bit11335577_0)
     129  %call14 = call <4 x i32> @himask_2()
     130  %call15 = call <8 x i16> @const16_1()
     131  call void @s2p_step_ir(<4 x i32> %s2, <4 x i32> %s3, <4 x i32> %call14, <8 x i16> %call15, <4 x i32>* %bit00224466_1, <4 x i32>* %bit11335577_1)
     132  %call18 = call <4 x i32> @himask_2()
     133  %call19 = call <8 x i16> @const16_1()
     134  call void @s2p_step_ir(<4 x i32> %s4, <4 x i32> %s5, <4 x i32> %call18, <8 x i16> %call19, <4 x i32>* %bit00224466_2, <4 x i32>* %bit11335577_2)
     135  %call22 = call <4 x i32> @himask_2()
     136  %call23 = call <8 x i16> @const16_1()
     137  call void @s2p_step_ir(<4 x i32> %s6, <4 x i32> %s7, <4 x i32> %call22, <8 x i16> %call23, <4 x i32>* %bit00224466_3, <4 x i32>* %bit11335577_3)
     ; Round 2: load the round-1 slots in pairs (_0,_1) and (_2,_3) and step
     ; them with himask_4 / const16_2. The %bit00224466 pairs feed
     ; %bit00004444_k / %bit22226666_k; the %bit11335577 pairs feed
     ; %bit11115555_k / %bit33337777_k.
     138  %p23 = load <4 x i32>* %bit00224466_0, align 16
     139  %p24 = load <4 x i32>* %bit00224466_1, align 16
     140  %call24 = call <4 x i32> @himask_4()
     141  %call25 = call <8 x i16> @const16_2()
     142  call void @s2p_step_ir(<4 x i32> %p23, <4 x i32> %p24, <4 x i32> %call24, <8 x i16> %call25, <4 x i32>* %bit00004444_0, <4 x i32>* %bit22226666_0)
     143  %p25 = load <4 x i32>* %bit00224466_2, align 16
     144  %p26 = load <4 x i32>* %bit00224466_3, align 16
     145  %call26 = call <4 x i32> @himask_4()
     146  %call27 = call <8 x i16> @const16_2()
     147  call void @s2p_step_ir(<4 x i32> %p25, <4 x i32> %p26, <4 x i32> %call26, <8 x i16> %call27, <4 x i32>* %bit00004444_1, <4 x i32>* %bit22226666_1)
     148  %p27 = load <4 x i32>* %bit11335577_0, align 16
     149  %p28 = load <4 x i32>* %bit11335577_1, align 16
     150  %call28 = call <4 x i32> @himask_4()
     151  %call29 = call <8 x i16> @const16_2()
     152  call void @s2p_step_ir(<4 x i32> %p27, <4 x i32> %p28, <4 x i32> %call28, <8 x i16> %call29, <4 x i32>* %bit11115555_0, <4 x i32>* %bit33337777_0)
     153  %p29 = load <4 x i32>* %bit11335577_2, align 16
     154  %p30 = load <4 x i32>* %bit11335577_3, align 16
     155  %call30 = call <4 x i32> @himask_4()
     156  %call31 = call <8 x i16> @const16_2()
     157  call void @s2p_step_ir(<4 x i32> %p29, <4 x i32> %p30, <4 x i32> %call30, <8 x i16> %call31, <4 x i32>* %bit11115555_1, <4 x i32>* %bit33337777_1)
     158
     ; Round 3: load each round-2 slot pair (_0,_1) and step it with
     ; himask_8 / const16_4, passing the caller's pointers as outputs:
     ;   %bit00004444 -> (%p0, %p4)    %bit11115555 -> (%p1, %p5)
     ;   %bit22226666 -> (%p2, %p6)    %bit33337777 -> (%p3, %p7)
     159  %p31 = load <4 x i32>* %bit00004444_0, align 16
     160  %p32 = load <4 x i32>* %bit00004444_1, align 16
     161  %call32 = call <4 x i32> @himask_8()
     162  %call33 = call <8 x i16> @const16_4()
     163  call void @s2p_step_ir(<4 x i32> %p31, <4 x i32> %p32, <4 x i32> %call32, <8 x i16> %call33, <4 x i32>* %p0, <4 x i32>* %p4)
     164  %p33 = load <4 x i32>* %bit11115555_0, align 16
     165  %p34 = load <4 x i32>* %bit11115555_1, align 16
     166  %call36 = call <4 x i32> @himask_8()
     167  %call37 = call <8 x i16> @const16_4()
     168  call void @s2p_step_ir(<4 x i32> %p33, <4 x i32> %p34, <4 x i32> %call36, <8 x i16> %call37, <4 x i32>* %p1, <4 x i32>* %p5)
     169  %p35 = load <4 x i32>* %bit22226666_0, align 16
     170  %p36 = load <4 x i32>* %bit22226666_1, align 16
     171  %call40 = call <4 x i32> @himask_8()
     172  %call41 = call <8 x i16> @const16_4()
     173  call void @s2p_step_ir(<4 x i32> %p35, <4 x i32> %p36, <4 x i32> %call40, <8 x i16> %call41, <4 x i32>* %p2, <4 x i32>* %p6)
     174  %p37 = load <4 x i32>* %bit33337777_0, align 16
     175  %p38 = load <4 x i32>* %bit33337777_1, align 16
     176  %call44 = call <4 x i32> @himask_8()
     177  %call45 = call <8 x i16> @const16_4()
     178  call void @s2p_step_ir(<4 x i32> %p37, <4 x i32> %p38, <4 x i32> %call44, <8 x i16> %call45, <4 x i32>* %p3, <4 x i32>* %p7)
     179
     180  ret void
     181}
    108182;TODO: all the packh/packl functions below need aa and bb swapped because of endianness.
    109183;define <4 x i32> @packh_8(<4 x i32> %a, <4 x i32> %b) alwaysinline {
Note: See TracChangeset for help on using the changeset viewer.