Changes between Version 8 and Version 9 of ParabixTransform


Ignore:
Timestamp:
Apr 22, 2014, 6:42:28 PM (4 years ago)
Author:
cameron
Comment:

--

Legend:

Unmodified
Added
Removed
Modified
  • ParabixTransform

    v8 v9  
    169169}}}
    170170
    171 == Byte Pack Implementations ==
     171== Byte Pack Implementation of the Parabix Transform ==
    172172
    173173Although SIMD units typically do not provide direct bit packing implementations,
    174174''byte packing'' operations that extract bytes from 16-bit fields are common.
     175The {{{hsimd<16>::packh}}} and {{{hsimd<16>::packl}}} operations on 64-bit vectors
     176arranged into 16-bit fields are illustrated by the following examples.
     177
     178|| s0 || 0x0123 || 0x4567 || 0x89AB || 0xCDEF ||
     179|| s1 || 0xaabb || 0xccdd || 0xeeff || 0x0011 ||
     180|| hsimd<16>::packh(s0, s1) || 0x0145 || 0x89CD || 0xaacc || 0xee00 ||
     181|| hsimd<16>::packl(s0, s1) || 0x2367 || 0xABEF || 0xbbdd || 0xff11 ||
     182 
     183=== Three-Stage Transposition Using Byte Packing ===
     184
     185The following library code of the Parabix system implements three-stage
     186transposition using byte packing.
     187
     188{{{
     189#define s2p_step(s0, s1, hi_mask, shift, p0, p1)  \
     190  do {\
     191        BitBlock t0,t1;\
     192        t0 = hsimd<16>::packh(s0, s1);\
     193        t1 = hsimd<16>::packl(s0, s1);\
     194        p0 = simd<1>::ifh(hi_mask, t0, simd<16>::srli<shift>(t1));\
     195        p1 = simd<1>::ifh(hi_mask, simd<16>::slli<shift>(t0), t1);\
     196  } while(0)
     197
     198#define s2p_bytepack(s0, s1, s2, s3, s4, s5, s6, s7, p0, p1, p2, p3, p4, p5, p6, p7) \
     199  do {\
     200        BitBlock bit00224466_0,bit00224466_1,bit00224466_2,bit00224466_3;\
     201        BitBlock bit11335577_0,bit11335577_1,bit11335577_2,bit11335577_3;\
     202        BitBlock bit00004444_0,bit22226666_0,bit00004444_1,bit22226666_1;\
     203        BitBlock bit11115555_0,bit33337777_0,bit11115555_1,bit33337777_1;\
     204        s2p_step(s0,s1,simd<2>::himask(),1,bit00224466_0,bit11335577_0);\
     205        s2p_step(s2,s3,simd<2>::himask(),1,bit00224466_1,bit11335577_1);\
     206        s2p_step(s4,s5,simd<2>::himask(),1,bit00224466_2,bit11335577_2);\
     207        s2p_step(s6,s7,simd<2>::himask(),1,bit00224466_3,bit11335577_3);\
     208        s2p_step(bit00224466_0,bit00224466_1,simd<4>::himask(),2,bit00004444_0,bit22226666_0);\
     209        s2p_step(bit00224466_2,bit00224466_3,simd<4>::himask(),2,bit00004444_1,bit22226666_1);\
     210        s2p_step(bit11335577_0,bit11335577_1,simd<4>::himask(),2,bit11115555_0,bit33337777_0);\
     211        s2p_step(bit11335577_2,bit11335577_3,simd<4>::himask(),2,bit11115555_1,bit33337777_1);\
     212        s2p_step(bit00004444_0,bit00004444_1,simd<8>::himask(),4,p0,p4);\
     213        s2p_step(bit11115555_0,bit11115555_1,simd<8>::himask(),4,p1,p5);\
     214        s2p_step(bit22226666_0,bit22226666_1,simd<8>::himask(),4,p2,p6);\
     215        s2p_step(bit33337777_0,bit33337777_1,simd<8>::himask(),4,p3,p7);\
     216  } while(0)
     217
     218}}}
     219
     220
     221=== Byte Pack Using SSE2 {{{packuswb}}} ===
     222
    175223For example, the SSE2 instruction {{{packuswb}}} packs 16-bit integers into
    176224bytes using unsigned saturation.   By clearing the high 8 bits of the 16-bit
     
    191239}}}
    192240
    193 
     241=== Byte Pack Using Byte Shuffling ===
     242
     243Byte pack operations are also conveniently modeled as byte shuffle operations.
     244Thus, LLVM versions of pack are straightforward. 
     245{{{
     246define <8 x i16> @hsimd_packh_16(<8 x i16> %x, <8 x i16> %y) {
     247   %result = shufflevector <8 x i16> %x, <8 x i16> %y,
     248                <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
     249   ret <8 x i16> %result
     250}
     251define <8 x i16> @hsimd_packl_16(<8 x i16> %x, <8 x i16> %y) {
     252   %result = shufflevector <8 x i16> %x, <8 x i16> %y,
     253                <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
     254   ret <8 x i16> %result
     255}
     256}}}
     257
     258Given {{{shufflevector}}} operations satisfying the byte pack model,
     259an LLVM code generator can conceivably produce the SSE2-based implementations
     260shown previously, while an SSSE3-based implementation might directly use
     261the byte shuffle operation {{{pshufb}}}.
     262