| 175 | The {{{simd<16>::packh}}} and {{{simd<16>::packl}}} operations on 64-bit vectors |
| 176 | arranged into 16-bit fields are illustrated by the following examples. |
| 177 | |
| 178 | || s0 || 0x0123 || 0x4567 || 0x89AB || 0xCDEF || |
| 179 | || s1 || 0xaabb || 0xccdd || 0xeeff || 0x0011 || |
| 180 | || simd<16>::packh(s0, s1) || 0x0145 || 0x89CD || 0xaacc || 0xee00 || |
| 181 | || simd<16>::packl(s0, s1) || 0x2367 || 0xABEF || 0xbbdd || 0xff11 || |
| 182 | |
| 183 | === Three-Stage Transposition Using Byte Packing === |
| 184 | |
| 185 | The following library code of the Parabix system implements three-stage |
| 186 | transposition using byte packing. |
| 187 | |
| 188 | {{{ |
| 189 | #define s2p_step(s0, s1, hi_mask, shift, p0, p1) /* one transposition step: packs s0,s1 and writes p0,p1 */ \ |
| 190 | do {\ |
| 191 | BitBlock t0,t1;\ |
| 192 | t0 = hsimd<16>::packh(s0, s1); /* high bytes of the 16-bit fields of s0, then of s1 */ \ |
| 193 | t1 = hsimd<16>::packl(s0, s1); /* low bytes of the 16-bit fields of s0, then of s1 */ \ |
| 194 | p0 = simd<1>::ifh(hi_mask, t0, simd<16>::srli<shift>(t1)); /* presumably hi_mask bits from t0, other bits from t1 shifted right - confirm ifh semantics */ \ |
| 195 | p1 = simd<1>::ifh(hi_mask, simd<16>::slli<shift>(t0), t1); /* presumably hi_mask bits from t0 shifted left, other bits from t1 - confirm ifh semantics */ \ |
| 196 | } while(0) |
| 197 | |
| 198 | #define s2p_bytepack(s0, s1, s2, s3, s4, s5, s6, s7, p0, p1, p2, p3, p4, p5, p6, p7) /* three stages of s2p_step: inputs s0..s7, outputs p0..p7 */ \ |
| 199 | do {\ |
| 200 | BitBlock bit00224466_0,bit00224466_1,bit00224466_2,bit00224466_3; /* written by stage 1 */ \ |
| 201 | BitBlock bit11335577_0,bit11335577_1,bit11335577_2,bit11335577_3; /* written by stage 1 */ \ |
| 202 | BitBlock bit00004444_0,bit22226666_0,bit00004444_1,bit22226666_1; /* written by stage 2 */ \ |
| 203 | BitBlock bit11115555_0,bit33337777_0,bit11115555_1,bit33337777_1; /* written by stage 2 */ \ |
| 204 | s2p_step(s0,s1,simd<2>::himask(),1,bit00224466_0,bit11335577_0); /* stage 1: input pairs, simd<2>::himask(), shift 1 */ \ |
| 205 | s2p_step(s2,s3,simd<2>::himask(),1,bit00224466_1,bit11335577_1);\ |
| 206 | s2p_step(s4,s5,simd<2>::himask(),1,bit00224466_2,bit11335577_2);\ |
| 207 | s2p_step(s6,s7,simd<2>::himask(),1,bit00224466_3,bit11335577_3);\ |
| 208 | s2p_step(bit00224466_0,bit00224466_1,simd<4>::himask(),2,bit00004444_0,bit22226666_0); /* stage 2: pairs of stage-1 results, simd<4>::himask(), shift 2 */ \ |
| 209 | s2p_step(bit00224466_2,bit00224466_3,simd<4>::himask(),2,bit00004444_1,bit22226666_1);\ |
| 210 | s2p_step(bit11335577_0,bit11335577_1,simd<4>::himask(),2,bit11115555_0,bit33337777_0);\ |
| 211 | s2p_step(bit11335577_2,bit11335577_3,simd<4>::himask(),2,bit11115555_1,bit33337777_1);\ |
| 212 | s2p_step(bit00004444_0,bit00004444_1,simd<8>::himask(),4,p0,p4); /* stage 3: pairs of stage-2 results, simd<8>::himask(), shift 4, writes p0..p7 */ \ |
| 213 | s2p_step(bit11115555_0,bit11115555_1,simd<8>::himask(),4,p1,p5);\ |
| 214 | s2p_step(bit22226666_0,bit22226666_1,simd<8>::himask(),4,p2,p6);\ |
| 215 | s2p_step(bit33337777_0,bit33337777_1,simd<8>::himask(),4,p3,p7);\ |
| 216 | } while(0) |
| 217 | |
| 218 | }}} |
| 219 | |
| 220 | |
| 221 | === Byte Pack Using SSE2 {{{packuswb}}} === |
| 222 | |
193 | | |
| 241 | === Byte Pack Using Byte Shuffling === |
| 242 | |
| 243 | Byte pack operations can also be conveniently modeled as byte shuffle operations, |
| 244 | so LLVM versions of pack follow directly from the {{{shufflevector}}} instruction. |
| 245 | {{{ |
| 246 | define <8 x i16> @hsimd_packh_16(<8 x i16> %x, <8 x i16> %y) { ; packh expressed as a two-source shuffle |
| 247 | %result = shufflevector <8 x i16> %x, <8 x i16> %y, ; mask indices 0-7 select from %x, 8-15 from %y |
| 248 | <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> ; odd-numbered elements; NOTE(review): these are i16 elements, not bytes - confirm intended granularity |
| 249 | ret <8 x i16> %result ; return the shuffled vector |
| 250 | } |
| 251 | define <8 x i16> @hsimd_packl_16(<8 x i16> %x, <8 x i16> %y) { ; packl expressed as a two-source shuffle |
| 252 | %result = shufflevector <8 x i16> %x, <8 x i16> %y, ; mask indices 0-7 select from %x, 8-15 from %y |
| 253 | <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> ; even-numbered elements; NOTE(review): these are i16 elements, not bytes - confirm intended granularity |
| 254 | ret <8 x i16> %result ; return the shuffled vector |
| 255 | } |
| 256 | }}} |
| 257 | |
| 258 | Given {{{shufflevector}}} operations satisfying the byte pack model, |
| 259 | an LLVM code generator can conceivably produce the SSE2-based implementations |
| 260 | shown previously, while an SSSE3-based implementation might directly use |
| 261 | the byte shuffle operation {{{pshufb}}}. |
| 262 | |