| 68 | == Horizontal packing operations ==
| 69 | As a simple example of a horizontal operation that extracts fields from 2 input vectors, Hughes defines |
| 70 | the {{{veven}}} instruction to select only the even elements from each vector. The LLVM shufflevector |
| 71 | is straightforward. |
| 72 | |
| 73 | {{{ |
| 74 | define <8 x i16> @veven(<8 x i16> %a, <8 x i16> %b) { |
| 75 | entry: |
| 76 | %t0 = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> |
| 77 | ret <8 x i16> %t0 |
| 78 | } |
| 79 | }}} |
| 80 | |
| 81 | Compiling to SSE2 code, we get the following. |
| 82 | |
| 83 | {{{ |
| 84 | veven: # @veven |
| 85 | .cfi_startproc |
| 86 | # BB#0: # %entry |
| 87 | pshuflw $-24, %xmm1, %xmm1 # xmm1 = xmm1[0,2,2,3,4,5,6,7] |
| 88 | pshufhw $-24, %xmm1, %xmm1 # xmm1 = xmm1[0,1,2,3,4,6,6,7] |
| 89 | pshufd $-24, %xmm1, %xmm1 # xmm1 = xmm1[0,2,2,3] |
| 90 | pshuflw $-24, %xmm0, %xmm0 # xmm0 = xmm0[0,2,2,3,4,5,6,7] |
| 91 | pshufhw $-24, %xmm0, %xmm0 # xmm0 = xmm0[0,1,2,3,4,6,6,7] |
| 92 | pshufd $-24, %xmm0, %xmm0 # xmm0 = xmm0[0,2,2,3] |
| 93 | punpcklqdq %xmm1, %xmm0 # xmm0 = xmm0[0],xmm1[0] |
| 94 | retq |
| 95 | }}} |
| 96 | |
| 97 | But there is a better implementation using the SSE2 {{{packuswb}}} instruction!
| 98 | |
| 99 | {{{ |
| 100 | declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>) #1 |
| 101 | |
| 102 | define <8 x i16> @veven_sse2(<8 x i16> %a, <8 x i16> %b) { |
| 103 | entry: |
| 104 | %a0 = and <8 x i16> %a, bitcast (<1 x i128> <i128 1324055902416102970674609367438786815> to <8 x i16>) |
| 105 | %b0 = and <8 x i16> %b, bitcast (<1 x i128> <i128 1324055902416102970674609367438786815> to <8 x i16>) |
| 106 | %r0 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a0, <8 x i16> %b0) |
| 107 | %r1 = bitcast <16 x i8> %r0 to <8 x i16> |
| 108 | ret <8 x i16> %r1 |
| 109 | } |
| 110 | }}} |
| 111 | |
| 112 | By masking off the high byte of each 16-bit field, we avoid the "saturation" of
| 113 | values when converting from 16 bits to 8 bits. The generated SSE2 code is:
| 114 | {{{ |
| 115 | .LCPI0_0: |
| 116 | .short 255 # 0xff |
| 117 | .short 255 # 0xff |
| 118 | .short 255 # 0xff |
| 119 | .short 255 # 0xff |
| 120 | .short 255 # 0xff |
| 121 | .short 255 # 0xff |
| 122 | .short 255 # 0xff |
| 123 | .short 255 # 0xff |
| 124 | .text |
| 125 | .globl veven_sse2 |
| 126 | .align 16, 0x90 |
| 127 | .type veven_sse2,@function |
| 128 | veven_sse2: # @veven_sse2 |
| 129 | .cfi_startproc |
| 130 | # BB#0: # %entry |
| 131 | movdqa .LCPI0_0(%rip), %xmm2 # xmm2 = [255,255,255,255,255,255,255,255] |
| 132 | pand %xmm2, %xmm0 |
| 133 | pand %xmm2, %xmm1 |
| 134 | packuswb %xmm1, %xmm0 |
| 135 | retq |
| 136 | }}} |