Changeset 3933 for trunk


Timestamp: Jul 28, 2014, 1:03:33 PM
Author: linmengl
Message: Inline s2p_bytepack in IR; it now gets the same perf results as SSE2.

Location: trunk/lib_ir
Files: 4 edited

  • trunk/lib_ir/s2p.h

    r3922 r3933
      BitBlock slli_16(BitBlock a, BitBlock shift_mask);

    - BitBlock s2p_step_ir(BitBlock s0, BitBlock s1, BitBlock hi_mask, BitBlock shift_mask, BitBlock *p0, BitBlock *p1);
    + void s2p_step_ir(BitBlock s0, BitBlock s1, BitBlock hi_mask, BitBlock shift_mask, BitBlock *p0, BitBlock *p1);
    +
    + void s2p_bytepack_ir(BitBlock s0, BitBlock s1, BitBlock s2, BitBlock s3, BitBlock s4, BitBlock s5, BitBlock s6, BitBlock s7, BitBlock* p0, BitBlock* p1, BitBlock* p2, BitBlock* p3, BitBlock* p4, BitBlock* p5, BitBlock* p6, BitBlock* p7);

      BitBlock const16_1();
    …
      }

    - #define S2P_ALGORITHM s2p_bytepack
    + //S2P_ALGORITHM: s2p_bytepack_inline, written in pure IR in order to use
    + //immediate constants in shifting.
    + #define S2P_ALGORITHM s2p_bytepack_inline

    + #define s2p_bytepack_inline(s0, s1, s2, s3, s4, s5, s6, s7, p0, p1, p2, p3, p4, p5, p6, p7) \
    +   s2p_bytepack_ir(s0, s1, s2, s3, s4, s5, s6, s7, &p0, &p1, &p2, &p3, &p4, &p5, &p6, &p7)
    +
    + //This macro is deprecated. Use inline version.
      #define s2p_bytepack(s0, s1, s2, s3, s4, s5, s6, s7, p0, p1, p2, p3, p4, p5, p6, p7) \
        do {\
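
    The new inline path builds the whole transposition out of one primitive, s2p_step_ir(s0, s1, hi_mask, shift_mask, p0, p1), which gathers the high half of every small bit field from both inputs into *p0 and the low halves into *p1. A minimal scalar sketch of that bit-gathering on a single 16-bit lane (a hypothetical helper, not the library's code; it ignores the byte-level pack the real SIMD step performs first, and assumes hi_mask is 0xAAAA/0xCCCC/0xF0F0 per lane for himask_2/4/8 with a shift of half the field width):

        #include <cstdint>

        // Sketch only: the bit-gathering of one s2p step, modeled on a
        // single 16-bit lane. hi_mask selects the high half of every 2-,
        // 4-, or 8-bit field; shift is half the field width (1, 2, 4).
        static void s2p_step_lane(uint16_t s0, uint16_t s1, uint16_t hi_mask,
                                  int shift, uint16_t* p0, uint16_t* p1) {
          // *p0 collects the high half of every field from both inputs;
          // *p1 collects the low halves.
          *p0 = (uint16_t)((s0 & hi_mask) | ((s1 & hi_mask) >> shift));
          *p1 = (uint16_t)(((s0 & (uint16_t)~hi_mask) << shift)
                           | (s1 & (uint16_t)~hi_mask));
        }

    For example, s2p_step_lane(a, b, 0xAAAA, 1, &even, &odd) leaves the even-numbered bits of both inputs (bit 0 being the most significant, as in the bitNN temporaries below) in even and the odd-numbered bits in odd.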
  • trunk/lib_ir/s2p.ll

    r3927 r3933
      }

    -
    + define void @s2p_bytepack_ir(<4 x i32> %s0, <4 x i32> %s1, <4 x i32> %s2, <4 x i32> %s3, <4 x i32> %s4, <4 x i32> %s5, <4 x i32> %s6, <4 x i32> %s7, <4 x i32>* %p0, <4 x i32>* %p1, <4 x i32>* %p2, <4 x i32>* %p3, <4 x i32>* %p4, <4 x i32>* %p5, <4 x i32>* %p6, <4 x i32>* %p7) {
    + entry:
    +   %bit00224466_0 = alloca <4 x i32>, align 16
    +   %bit00224466_1 = alloca <4 x i32>, align 16
    +   %bit00224466_2 = alloca <4 x i32>, align 16
    +   %bit00224466_3 = alloca <4 x i32>, align 16
    +   %bit11335577_0 = alloca <4 x i32>, align 16
    +   %bit11335577_1 = alloca <4 x i32>, align 16
    +   %bit11335577_2 = alloca <4 x i32>, align 16
    +   %bit11335577_3 = alloca <4 x i32>, align 16
    +   %bit00004444_0 = alloca <4 x i32>, align 16
    +   %bit22226666_0 = alloca <4 x i32>, align 16
    +   %bit00004444_1 = alloca <4 x i32>, align 16
    +   %bit22226666_1 = alloca <4 x i32>, align 16
    +   %bit11115555_0 = alloca <4 x i32>, align 16
    +   %bit33337777_0 = alloca <4 x i32>, align 16
    +   %bit11115555_1 = alloca <4 x i32>, align 16
    +   %bit33337777_1 = alloca <4 x i32>, align 16
    +
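    +   ; Stage 1 (himask_2 / const16_1): split each adjacent pair of byte
    +   ; streams into its even-numbered bits (0,2,4,6) and its odd-numbered
    +   ; bits (1,3,5,7).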
    +   %call10 = call <4 x i32> @himask_2()
    +   %call11 = call <8 x i16> @const16_1()
    +   call void @s2p_step_ir(<4 x i32> %s0, <4 x i32> %s1, <4 x i32> %call10, <8 x i16> %call11, <4 x i32>* %bit00224466_0, <4 x i32>* %bit11335577_0)
    +   %call14 = call <4 x i32> @himask_2()
    +   %call15 = call <8 x i16> @const16_1()
    +   call void @s2p_step_ir(<4 x i32> %s2, <4 x i32> %s3, <4 x i32> %call14, <8 x i16> %call15, <4 x i32>* %bit00224466_1, <4 x i32>* %bit11335577_1)
    +   %call18 = call <4 x i32> @himask_2()
    +   %call19 = call <8 x i16> @const16_1()
    +   call void @s2p_step_ir(<4 x i32> %s4, <4 x i32> %s5, <4 x i32> %call18, <8 x i16> %call19, <4 x i32>* %bit00224466_2, <4 x i32>* %bit11335577_2)
    +   %call22 = call <4 x i32> @himask_2()
    +   %call23 = call <8 x i16> @const16_1()
    +   call void @s2p_step_ir(<4 x i32> %s6, <4 x i32> %s7, <4 x i32> %call22, <8 x i16> %call23, <4 x i32>* %bit00224466_3, <4 x i32>* %bit11335577_3)
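    +   ; Stage 2 (himask_4 / const16_2): split the stage-1 streams again,
    +   ; giving the {0,4}, {2,6}, {1,5} and {3,7} bit streams.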
    +   %p23 = load <4 x i32>* %bit00224466_0, align 16
    +   %p24 = load <4 x i32>* %bit00224466_1, align 16
    +   %call24 = call <4 x i32> @himask_4()
    +   %call25 = call <8 x i16> @const16_2()
    +   call void @s2p_step_ir(<4 x i32> %p23, <4 x i32> %p24, <4 x i32> %call24, <8 x i16> %call25, <4 x i32>* %bit00004444_0, <4 x i32>* %bit22226666_0)
    +   %p25 = load <4 x i32>* %bit00224466_2, align 16
    +   %p26 = load <4 x i32>* %bit00224466_3, align 16
    +   %call26 = call <4 x i32> @himask_4()
    +   %call27 = call <8 x i16> @const16_2()
    +   call void @s2p_step_ir(<4 x i32> %p25, <4 x i32> %p26, <4 x i32> %call26, <8 x i16> %call27, <4 x i32>* %bit00004444_1, <4 x i32>* %bit22226666_1)
    +   %p27 = load <4 x i32>* %bit11335577_0, align 16
    +   %p28 = load <4 x i32>* %bit11335577_1, align 16
    +   %call28 = call <4 x i32> @himask_4()
    +   %call29 = call <8 x i16> @const16_2()
    +   call void @s2p_step_ir(<4 x i32> %p27, <4 x i32> %p28, <4 x i32> %call28, <8 x i16> %call29, <4 x i32>* %bit11115555_0, <4 x i32>* %bit33337777_0)
    +   %p29 = load <4 x i32>* %bit11335577_2, align 16
    +   %p30 = load <4 x i32>* %bit11335577_3, align 16
    +   %call30 = call <4 x i32> @himask_4()
    +   %call31 = call <8 x i16> @const16_2()
    +   call void @s2p_step_ir(<4 x i32> %p29, <4 x i32> %p30, <4 x i32> %call30, <8 x i16> %call31, <4 x i32>* %bit11115555_1, <4 x i32>* %bit33337777_1)
    +
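    +   ; Stage 3 (himask_8 / const16_4): final split; write the eight
    +   ; single-bit streams through the output pointers %p0..%p7.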
    +   %p31 = load <4 x i32>* %bit00004444_0, align 16
    +   %p32 = load <4 x i32>* %bit00004444_1, align 16
    +   %call32 = call <4 x i32> @himask_8()
    +   %call33 = call <8 x i16> @const16_4()
    +   call void @s2p_step_ir(<4 x i32> %p31, <4 x i32> %p32, <4 x i32> %call32, <8 x i16> %call33, <4 x i32>* %p0, <4 x i32>* %p4)
    +   %p33 = load <4 x i32>* %bit11115555_0, align 16
    +   %p34 = load <4 x i32>* %bit11115555_1, align 16
    +   %call36 = call <4 x i32> @himask_8()
    +   %call37 = call <8 x i16> @const16_4()
    +   call void @s2p_step_ir(<4 x i32> %p33, <4 x i32> %p34, <4 x i32> %call36, <8 x i16> %call37, <4 x i32>* %p1, <4 x i32>* %p5)
    +   %p35 = load <4 x i32>* %bit22226666_0, align 16
    +   %p36 = load <4 x i32>* %bit22226666_1, align 16
    +   %call40 = call <4 x i32> @himask_8()
    +   %call41 = call <8 x i16> @const16_4()
    +   call void @s2p_step_ir(<4 x i32> %p35, <4 x i32> %p36, <4 x i32> %call40, <8 x i16> %call41, <4 x i32>* %p2, <4 x i32>* %p6)
    +   %p37 = load <4 x i32>* %bit33337777_0, align 16
    +   %p38 = load <4 x i32>* %bit33337777_1, align 16
    +   %call44 = call <4 x i32> @himask_8()
    +   %call45 = call <8 x i16> @const16_4()
    +   call void @s2p_step_ir(<4 x i32> %p37, <4 x i32> %p38, <4 x i32> %call44, <8 x i16> %call45, <4 x i32>* %p3, <4 x i32>* %p7)
    +
    +   ret void
    + }
      ;TODO: all the packh/l below need to swap aa and bb, because of the endianness.
      ;define <4 x i32> @packh_8(<4 x i32> %a, <4 x i32> %b) alwaysinline {
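
    Read top to bottom, this is the classic three-stage divide-and-conquer transposition: the himask_2/const16_1 steps split each adjacent pair of input registers into even- and odd-bit streams, the himask_4/const16_2 steps split those again, and the himask_8/const16_4 steps emit the eight single-bit streams through %p0..%p7. A self-contained scalar model of the same dataflow (illustration only: one uint8_t stands in for a whole <4 x i32> register, and the helper names are hypothetical):

        #include <cstdint>
        #include <cstdio>

        // One transposition step on one-byte "registers" (cf. s2p_step_ir):
        // gather the high half of every field from both inputs into *hi and
        // the low half into *lo.
        static void step(uint8_t a, uint8_t b, uint8_t hi_mask, int shift,
                         uint8_t* hi, uint8_t* lo) {
          *hi = (uint8_t)((a & hi_mask) | ((b & hi_mask) >> shift));
          *lo = (uint8_t)(((a & (uint8_t)~hi_mask) << shift)
                          | (b & (uint8_t)~hi_mask));
        }

        // Scalar model of s2p_bytepack_ir: s[0..7] are the eight input
        // "registers"; afterwards p[k] holds bit k (bit 0 = most significant,
        // matching the bitNN names above) of every input byte.
        static void s2p_bytepack_model(const uint8_t s[8], uint8_t p[8]) {
          uint8_t e[4], o[4];                      // bit00224466_*, bit11335577_*
          for (int i = 0; i < 4; ++i)
            step(s[2 * i], s[2 * i + 1], 0xAA, 1, &e[i], &o[i]);     // stage 1
          uint8_t e04[2], e26[2], o15[2], o37[2];  // bit00004444_*, bit22226666_*, ...
          for (int i = 0; i < 2; ++i) {
            step(e[2 * i], e[2 * i + 1], 0xCC, 2, &e04[i], &e26[i]); // stage 2
            step(o[2 * i], o[2 * i + 1], 0xCC, 2, &o15[i], &o37[i]);
          }
          step(e04[0], e04[1], 0xF0, 4, &p[0], &p[4]);               // stage 3
          step(o15[0], o15[1], 0xF0, 4, &p[1], &p[5]);
          step(e26[0], e26[1], 0xF0, 4, &p[2], &p[6]);
          step(o37[0], o37[1], 0xF0, 4, &p[3], &p[7]);
        }

        int main() {
          const uint8_t s[8] = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
          uint8_t p[8];
          s2p_bytepack_model(s, p);
          printf("p0 = 0x%02X\n", p[0]);  // 0xFF: bit 0 is set in every byte
          return 0;
        }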
  • trunk/lib_ir/test_s2p.cpp

    r3920 r3933
      for (int i = 0; i < 8; ++i) s[i] = mvmd<32>::fill(x);

    - s2p_bytepack(s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7],
    + s2p(s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7],
          p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7]);

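
    The test now calls the generic s2p entry point, so it automatically follows whatever S2P_ALGORITHM selects in s2p.h. The forwarding definition itself is not shown in this changeset; a hypothetical sketch of that indirection:

        // Hypothetical forwarding macro (the real definition lives in s2p.h,
        // outside this diff); after r3933, S2P_ALGORITHM is s2p_bytepack_inline.
        #define s2p(s0, s1, s2, s3, s4, s5, s6, s7, p0, p1, p2, p3, p4, p5, p6, p7) \
          S2P_ALGORITHM(s0, s1, s2, s3, s4, s5, s6, s7, p0, p1, p2, p3, p4, p5, p6, p7)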
  • trunk/lib_ir/xmlwf/perf.txt

    r3929 r3933

      ===========================================
    + Optimised shifts (use inline IR for immediate shift amounts)
    + xmlwf_perf   &  4.544   & 7.787   & 7.196   & 5.727   & 11.386 \\ \hline
    +
    + ===========================================