Changeset 4051 for trunk/lib_ir/p2s.ll


Ignore:
Timestamp:
Aug 19, 2014, 9:15:30 PM (5 years ago)
Author:
linmengl
Message:

Add a pure-IR implementation of p2s; it now gets the same performance on SSE2.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/lib_ir/p2s.ll

    r4050 r4051  
    22declare <4 x i32> @srli_16(<4 x i32> %a, <8 x i16> %shift_mask)
    33declare <4 x i32> @slli_16(<4 x i32> %a, <8 x i16> %shift_mask)
     4
     ; External helpers: declared here, bodies not visible in this file view.
     ; Each takes no arguments. The @const16_* helpers return <8 x i16>; the
     ; @himask_* helpers return <4 x i32>.
     ; @p2s_bytemerge_ir passes them to @p2s_step_ir in fixed pairs:
     ;   @himask_8 with @const16_4, @himask_4 with @const16_2,
     ;   @himask_2 with @const16_1.
     ; NOTE(review): the names suggest "16-bit-lane constant N" and "high-half
     ; mask for N-bit fields" -- confirm against the definitions.
     5declare <8 x i16> @const16_1()
     6declare <8 x i16> @const16_2()
     7declare <8 x i16> @const16_4()
     8declare <4 x i32> @himask_2()
     9declare <4 x i32> @himask_4()
     10declare <4 x i32> @himask_8()
    411
    512define <4 x i32> @mergeh_8(<4 x i32> %a, <4 x i32> %b) alwaysinline {
     
    3845  ret void
    3946}
     47
     48define void @p2s_bytemerge_ir(<4 x i32> %p0, <4 x i32> %p1, <4 x i32> %p2, <4 x i32> %p3, <4 x i32> %p4, <4 x i32> %p5, <4 x i32> %p6, <4 x i32> %p7, <4 x i32>* %s0, <4 x i32>* %s1, <4 x i32>* %s2, <4 x i32>* %s3, <4 x i32>* %s4, <4 x i32>* %s5, <4 x i32>* %s6, <4 x i32>* %s7) {
     49entry:
     50  %bit00004444_0 = alloca <4 x i32>, align 16
     51  %bit22226666_0 = alloca <4 x i32>, align 16
     52  %bit00004444_1 = alloca <4 x i32>, align 16
     53  %bit22226666_1 = alloca <4 x i32>, align 16
     54  %bit11115555_0 = alloca <4 x i32>, align 16
     55  %bit33337777_0 = alloca <4 x i32>, align 16
     56  %bit11115555_1 = alloca <4 x i32>, align 16
     57  %bit33337777_1 = alloca <4 x i32>, align 16
     58  %bit00224466_0 = alloca <4 x i32>, align 16
     59  %bit00224466_1 = alloca <4 x i32>, align 16
     60  %bit00224466_2 = alloca <4 x i32>, align 16
     61  %bit00224466_3 = alloca <4 x i32>, align 16
     62  %bit11335577_0 = alloca <4 x i32>, align 16
     63  %bit11335577_1 = alloca <4 x i32>, align 16
     64  %bit11335577_2 = alloca <4 x i32>, align 16
     65  %bit11335577_3 = alloca <4 x i32>, align 16
     66
     67  %call10 = call <4 x i32> @himask_8()
     68  %call11 = call <8 x i16> @const16_4()
     69  call void @p2s_step_ir(<4 x i32> %p0, <4 x i32> %p4, <4 x i32> %call10, <8 x i16> %call11, <4 x i32>* %bit00004444_0, <4 x i32>* %bit00004444_1)
     70
     71  %call14 = call <4 x i32> @himask_8()
     72  %call15 = call <8 x i16> @const16_4()
     73  call void @p2s_step_ir(<4 x i32> %p1, <4 x i32> %p5, <4 x i32> %call14, <8 x i16> %call15, <4 x i32>* %bit11115555_0, <4 x i32>* %bit11115555_1)
     74
     75  %call18 = call <4 x i32> @himask_8()
     76  %call19 = call <8 x i16> @const16_4()
     77  call void @p2s_step_ir(<4 x i32> %p2, <4 x i32> %p6, <4 x i32> %call18, <8 x i16> %call19, <4 x i32>* %bit22226666_0, <4 x i32>* %bit22226666_1)
     78
     79  %call22 = call <4 x i32> @himask_8()
     80  %call23 = call <8 x i16> @const16_4()
     81  call void @p2s_step_ir(<4 x i32> %p3, <4 x i32> %p7, <4 x i32> %call22, <8 x i16> %call23, <4 x i32>* %bit33337777_0, <4 x i32>* %bit33337777_1)
     82
     83  %p23 = load <4 x i32>* %bit00004444_0, align 16
     84  %p24 = load <4 x i32>* %bit22226666_0, align 16
     85  %call24 = call <4 x i32> @himask_4()
     86  %call25 = call <8 x i16> @const16_2()
     87  call void @p2s_step_ir(<4 x i32> %p23, <4 x i32> %p24, <4 x i32> %call24, <8 x i16> %call25, <4 x i32>* %bit00224466_0, <4 x i32>* %bit00224466_1)
     88
     89  %p25 = load <4 x i32>* %bit11115555_0, align 16
     90  %p26 = load <4 x i32>* %bit33337777_0, align 16
     91  %call26 = call <4 x i32> @himask_4()
     92  %call27 = call <8 x i16> @const16_2()
     93  call void @p2s_step_ir(<4 x i32> %p25, <4 x i32> %p26, <4 x i32> %call26, <8 x i16> %call27, <4 x i32>* %bit11335577_0, <4 x i32>* %bit11335577_1)
     94
     95  %p27 = load <4 x i32>* %bit00004444_1, align 16
     96  %p28 = load <4 x i32>* %bit22226666_1, align 16
     97  %call28 = call <4 x i32> @himask_4()
     98  %call29 = call <8 x i16> @const16_2()
     99  call void @p2s_step_ir(<4 x i32> %p27, <4 x i32> %p28, <4 x i32> %call28, <8 x i16> %call29, <4 x i32>* %bit00224466_2, <4 x i32>* %bit00224466_3)
     100
     101  %p29 = load <4 x i32>* %bit11115555_1, align 16
     102  %p30 = load <4 x i32>* %bit33337777_1, align 16
     103  %call30 = call <4 x i32> @himask_4()
     104  %call31 = call <8 x i16> @const16_2()
     105  call void @p2s_step_ir(<4 x i32> %p29, <4 x i32> %p30, <4 x i32> %call30, <8 x i16> %call31, <4 x i32>* %bit11335577_2, <4 x i32>* %bit11335577_3)
     106
     107  %p31 = load <4 x i32>* %bit00224466_0, align 16
     108  %p32 = load <4 x i32>* %bit11335577_0, align 16
     109  %call32 = call <4 x i32> @himask_2()
     110  %call33 = call <8 x i16> @const16_1()
     111  call void @p2s_step_ir(<4 x i32> %p31, <4 x i32> %p32, <4 x i32> %call32, <8 x i16> %call33, <4 x i32>* %s0, <4 x i32>* %s1)
     112
     113  %p33 = load <4 x i32>* %bit00224466_1, align 16
     114  %p34 = load <4 x i32>* %bit11335577_1, align 16
     115  %call36 = call <4 x i32> @himask_2()
     116  %call37 = call <8 x i16> @const16_1()
     117  call void @p2s_step_ir(<4 x i32> %p33, <4 x i32> %p34, <4 x i32> %call36, <8 x i16> %call37, <4 x i32>* %s2, <4 x i32>* %s3)
     118
     119  %p35 = load <4 x i32>* %bit00224466_2, align 16
     120  %p36 = load <4 x i32>* %bit11335577_2, align 16
     121  %call40 = call <4 x i32> @himask_2()
     122  %call41 = call <8 x i16> @const16_1()
     123  call void @p2s_step_ir(<4 x i32> %p35, <4 x i32> %p36, <4 x i32> %call40, <8 x i16> %call41, <4 x i32>* %s4, <4 x i32>* %s5)
     124
     125  %p37 = load <4 x i32>* %bit00224466_3, align 16
     126  %p38 = load <4 x i32>* %bit11335577_3, align 16
     127  %call44 = call <4 x i32> @himask_2()
     128  %call45 = call <8 x i16> @const16_1()
     129  call void @p2s_step_ir(<4 x i32> %p37, <4 x i32> %p38, <4 x i32> %call44, <8 x i16> %call45, <4 x i32>* %s6, <4 x i32>* %s7)
     130
     131  ret void
     132}
Note: See TracChangeset for help on using the changeset viewer.