 Timestamp:
 Sep 15, 2013, 3:16:08 PM (6 years ago)
 File:

 1 edited
Legend:
 Unmodified
 Added
 Removed

docs/Working/re/avx2.tex
r3498 r3500 1 1 \section{SIMD Scalability}\label{sec:AVX2} 2 2 3 4 Although commodity processors have provided 128bit SIMD operations 5 more than a decade, the extension to 256bit integer SIMD operations 6 has just recently taken place with the availability of AVX2 7 instructions in Intel Haswell architecture chips as of mid 2013. 8 This provides an excellent opportunity to assess the scalability 9 of the bitwise dataparallel approach to regular expression matching. 10 11 For the most part, adapting the Parabix tool chain to the new AVX2 12 instructions was straightforward. This mostly involved regenerating 13 library functions using the new AVX2 intrinsics. There were minor 14 issues in the core transposition algorithm because the doublebytetobyte 15 pack instructions are confined to independent operation within two 16 128bit lanes. 17 18 19 \subsection{AVX Stream Addition} 20 \begin{figure*}[tbh] 21 \begin{center} 22 \begin{verbatim} 23 void add_ci_co(bitblock256_t x, bitblock256_t y, carry_t carry_in, carry_t & carry_out, bitblock256_t & sum) { 24 bitblock256_t all_ones = simd256<1>::constant<1>(); 25 bitblock256_t gen = simd_and(x, y); 26 bitblock256_t prop = simd_xor(x, y); 27 bitblock256_t partial_sum = simd256<64>::add(x, y); 28 bitblock256_t carry = simd_or(gen, simd_andc(prop, partial_sum)); 29 bitblock256_t bubble = simd256<64>::eq(partial_sum, all_ones); 30 uint64_t carry_mask = hsimd256<64>::signmask(carry) * 2 + convert(carry_in); 31 uint64_t bubble_mask = hsimd256<64>::signmask(bubble); 32 uint64_t carry_scan_thru_bubbles = (carry_mask + bubble_mask) &~ bubble_mask; 33 uint64_t increments = carry_scan_thru_bubbles  (carry_scan_thru_bubbles  carry_mask); 34 carry_out = convert(increments >> 4); 35 uint64_t spread = 0x0000200040008001 * increments & 0x0001000100010001; 36 sum = simd256<64>::add(partial_sum, _mm256_cvtepu16_epi64(avx_select_lo128(convert(spread)))); 37 } 38 39 \end{verbatim} 40 41 \end{center} 42 \caption{AVX2 256bit Addition} 43 \label{fig:AVX2add} 44 45 \end{figure*} 46 47 Bitstream addition at the 256bit block size was implemented using the 48 longstream addition technique. Figure \ref{fig:AVX2add} shows our 49 implementation. Spreading bits from the calculated increments mask 50 was achieved somewhat awkwardly with a 64bit multiply to spread 51 into 16bit fields followed by SIMD zero extend of the 16bit fields 52 to 64bits each. 53 54 We also compiled new versions of the {\tt grep} and {\tt nrgrep} programs 55 using the {\tt march=coreavx2} flag in case the compiler is able 56 to vectorize some of the code. 57 58 \begin{figure} 59 \begin{center} 60 \begin{tikzpicture} 61 \begin{axis}[ 62 xtick=data, 63 ylabel=AVX Instruction Reduction, 64 xticklabels={@,Date,Email,URIorEmail,xquote}, 65 tick label style={font=\tiny}, 66 enlarge x limits=0.15, 67 enlarge y limits={0.15, upper}, 68 ymin=0, 69 legend style={at={(0.5,0.15)}, 70 anchor=north,legend columns=1}, 71 ybar, 72 bar width=7pt, 73 ] 74 \addplot[fill=black] 75 file {data/avxinstructions1.dat}; 76 \addplot[fill=gray] 77 file {data/avxinstructions2.dat}; 78 \addplot[fill=white] 79 file {data/avxinstructions3.dat}; 80 81 \legend{Bitstreams,NRGrep,Grep,Annot} 82 \end{axis} 83 \end{tikzpicture} 84 \end{center} 85 \caption{Instruction Reduction}\label{fig:AVXInstrReduction} 86 \end{figure} 87 88 89 Figure \ref{fig:AVXInstrReduction} shows the reduction in instruction 90 count achieved for each of the applications. Working at a block 91 size of 256 bytes at a time rather than 128 bytes at a time, 92 the bitstreams implementation scaled dramatically well with reductions in 93 instruction count over a factor of two in each case. Although a factor 94 of two would seem an outside limit, we attribute the change to 95 greater instruction efficiency. 96 AVX2 instructions use a 97 non destructive threeoperand 98 form instead of the destructive twooperand form of SSE2. 99 In the twooperand form, binary instructions must always used 100 one of the source registers as a destination register. As a 101 result the SSE2 object code generates many data movement operations 102 that are unnecessary with the AVX2 set. 103 104 As expected, there was no observable reduction in instruction 105 count with the recompiled grep and nrgrep applications. 3 106 4 107 … … 31 134 \end{tikzpicture} 32 135 \end{center} 33 \caption{AVX Speedup} 136 \caption{AVX Speedup}\label{fig:AVXSpeedup} 34 137 \end{figure} 35 138 36 \begin{figure} 37 \begin{center} 38 \begin{tikzpicture} 39 \begin{axis}[ 40 xtick=data, 41 ylabel=AVX Instruction Reduction, 42 xticklabels={@,Date,Email,URIorEmail,xquote}, 43 tick label style={font=\tiny}, 44 enlarge x limits=0.15, 45 enlarge y limits={0.15, upper}, 46 ymin=0, 47 legend style={at={(0.5,0.15)}, 48 anchor=north,legend columns=1}, 49 ybar, 50 bar width=7pt, 51 ] 52 \addplot[fill=black] 53 file {data/avxinstructions1.dat}; 54 \addplot[fill=gray] 55 file {data/avxinstructions2.dat}; 56 \addplot[fill=white] 57 file {data/avxinstructions3.dat}; 139 As shown in Figure \ref{fig:AVXSpeedup} the reduction in 140 instruction count was reflected in a considerable speedup 141 in the bitstreams implementation. However, the speedup was 142 considerably less than expected. As shown in \label{fig:AVXIPC} 143 the AVX2 version has lost some of the superscalar efficiency 144 of the SSE2 code. This is a performance debugging issue 145 that we have yet to resolve. 58 146 59 \legend{Bitstreams,NRGrep,Grep,Annot}60 \end{axis}61 \end{tikzpicture}62 \end{center}63 \caption{Instruction Reduction}64 \end{figure}65 147 66 148 \begin{figure} … … 93 175 \end{tikzpicture} 94 176 \end{center} 95 \caption{Change in Instructions Per Cycle With AVX} 177 \caption{Change in Instructions Per Cycle With AVX}\label{fig:AVXIPC} 96 178 \end{figure} 97 179 180 Overall, the results on our AVX2 machine were quite good, 181 demonstrating very good scalability of the bitwise dataparallel approach. 98 182 99 183 100 101 \subsection{AVX Stream Addition}102 \begin{figure*}[tbh]103 \begin{center}104 \begin{verbatim}105 void add_ci_co(bitblock256_t x, bitblock256_t y, carry_t carry_in, carry_t & carry_out, bitblock256_t & sum) {106 bitblock256_t all_ones = simd256<1>::constant<1>();107 bitblock256_t gen = simd_and(x, y);108 bitblock256_t prop = simd_xor(x, y);109 bitblock256_t partial_sum = simd256<64>::add(x, y);110 bitblock256_t carry = simd_or(gen, simd_andc(prop, partial_sum));111 bitblock256_t bubble = simd256<64>::eq(partial_sum, all_ones);112 uint64_t carry_mask = hsimd256<64>::signmask(carry) * 2 + convert(carry_in);113 uint64_t bubble_mask = hsimd256<64>::signmask(bubble);114 uint64_t carry_scan_thru_bubbles = (carry_mask + bubble_mask) &~ bubble_mask;115 uint64_t increments = carry_scan_thru_bubbles  (carry_scan_thru_bubbles  carry_mask);116 carry_out = convert(increments >> 4);117 uint64_t spread = 0x0000200040008001 * increments & 0x0001000100010001;118 sum = simd256<64>::add(partial_sum, _mm256_cvtepu16_epi64(avx_select_lo128(convert(spread))));119 }120 121 \end{verbatim}122 123 \end{center}124 \caption{AVX2 256bit Addition}125 \label{fig:AVX2add}126 \end{figure*}127
Note: See TracChangeset
for help on using the changeset viewer.