Changeset 1402

Ignore:
Timestamp:
Aug 31, 2011, 8:59:07 AM (8 years ago)
Message:

Tighten slide timing.

Location:
docs/EuroPar2011/Pres
Files:
2 edited

Legend:

Unmodified
 r1401 \end{block} \begin{block}<+->{Finite State Transitions at 500 GHz?} \begin{block}<+->{Finite State Transitions at 200 GHz?} \begin{itemize} \item Use addition for parallel scanning. \item Carry propagation between bit positions = FSM transition. \item Commodity processors @ 3 GHz, 3 additions/cycle. \item Commodity processors @ 3.1 GHz. \item 64 carry propagations per addition. \item Effective speed for FSM transitions: $3 \times 3 \times 64 > 500$ GHz. \item Effective speed for FSM transitions: $3.1 \times 64 > 200$ GHz. \end{itemize} \end{block} \frametitle{Parallel Bit Streams: A Transform Representation of Text} \begin{itemize}[<+->] \begin{itemize} \item Given a byte-oriented character stream $T$, e.g., {\tt Ab17;}''. \item Transpose to 8 parallel bit streams $b_0$, $b_1$, ..., $b_7$. \item Each stream $b_k$ comprises bit $k$ of each byte of $T$. \end{itemize} \onslide<4->{ \begin{tabular}{|c|c|c|c|c|c|} \hline $T$ & A & b & 1 & 7 & ; \pause \\ \hline ASCII & \alert<6>{0}\alert<7>{1}\alert<8>{0}00001 & \alert<6>{0}\alert<7>{1}\alert<8>{1}00010 & \alert<6>{0}\alert<7>{0}\alert<8>{1}10001 & \alert<6>{0}\alert<7>{0}\alert<8>{1}10111 & \alert<6>{0}\alert<7>{0}\alert<8>{1}11011 \pause \\ \hline $b_0$ & \alert<6>{0} & \alert<6>{0} & \alert<6>{0} & \alert<6>{0} & \alert<6>{0} \pause \\ \hline $b_1$ & \alert<7>{1} & \alert<7>{1} & \alert<7>{0} & \alert<7>{0} & \alert<7>{0} \pause \\ \hline $b_2$ & \alert<8>{0} & \alert<8>{1} & \alert<8>{1} & \alert<8>{1} & \alert<8>{1}  \\ \hline $T$ & A & b & 1 & 7 & ; \\ \hline ASCII & \alert<2>{0}\alert<3>{1}\alert<4>{0}00001 & \alert<2>{0}\alert<3>{1}\alert<4>{1}00010 & \alert<2>{0}\alert<3>{0}\alert<4>{1}10001 & \alert<2>{0}\alert<3>{0}\alert<4>{1}10111 & \alert<2>{0}\alert<3>{0}\alert<4>{1}11011 \pause \\ \hline $b_0$ & \alert<2>{0} & \alert<2>{0} & \alert<2>{0} & \alert<2>{0} & \alert<2>{0} \pause \\ \hline $b_1$ & \alert<3>{1} & \alert<3>{1} & \alert<3>{0} & \alert<3>{0} & \alert<3>{0} \pause \\ \hline $b_2$ & \alert<4>{0} & \alert<4>{1} & \alert<4>{1} & \alert<4>{1} & \alert<4>{1}  \\ \hline $b_3$ & 0 & 0 & 1 & 1 & 1 \\ \hline $b_4$ & 0 & 0 & 0 & 0 & 1 \\ \hline $b_7$ & 1 & 0 & 1 & 1 & 1 \\ \hline \end{tabular} } \end{frame} Why form parallel bit streams? \begin{itemize}[<+->] \begin{itemize} \item Byte-at-a-time text processing is too slow. \begin{itemize}[<+->] \begin{itemize} \item Example: XML scan for {\tt <}''. \item Byte-at-time loop computes only 1 info bit per iteration! \end{itemize} \item So let's compute those bits in parallel! \begin{itemize}[<+->] \begin{itemize} \item Bitwise logic on basis streams $b_i \rightarrow${\tt [<]} stream. \item Process 128 positions at a time using SSE registers. \end{itemize} \item Find next {\tt <}'' with {\em bit scan} instruction (e.g., Intel bsf). \begin{itemize}[<+->] \begin{itemize} \item Advance up to 63 positions at once. \end{itemize} \begin{frame}[fragile] \frametitle{Character Class Formation} \begin{itemize}[<+->] \begin{itemize} \item Combining 8 bits of a code unit gives a character class stream. \item compile({\tt [CharDef(LAngle, "<")]}) \begin{frame}[fragile] \frametitle{Character Class Common Subexpressions} \begin{itemize}[<+->] \begin{itemize} \item Multiple definitions have common subexpressions. \item compile({\tt [CharDef(LAngle, "<"), \\ CharDef(RAngle, "<")]}) \item compile({\tt [CharDef(LAngle, "<"), \\ \alert{CharDef(RAngle, "<")}]}) \item \begin{semiverbatim} temp6 = simd_andc(temp4, temp5); LAngle = simd_and(temp3, temp6); \onslide<4->temp7 = simd_andc(bit[6], bit[7]); \onslide<4->temp8 = simd_and(temp4, temp7); \onslide<4->RAngle = simd_and(temp3, temp8); \alert{temp7 = simd_andc(bit[6], bit[7]); temp8 = simd_and(temp4, temp7); RAngle = simd_and(temp3, temp8);} \end{semiverbatim} \end{itemize} \begin{frame}[fragile] \frametitle{Character Class Ranges} \begin{itemize}[<+->] \begin{itemize} \item Ranges of characters are often very simple to compute. \item compile({\tt [CharSet('Control', ['$\backslash$x00-$\backslash$x1F']), \onslide<4->{CharSet('Digit', ['0-9']})]]}) {CharSet('Digit', ['0-9']})]]}) \item \begin{semiverbatim} temp2 = simd_or(temp1, bit[2]); Control = simd_andc(simd_const_1(1), temp2) \onslide<5->temp3 = simd_and(bit[2], bit[3]); \onslide<5->temp4 = simd_andc(temp3, temp1); \onslide<5->temp5 = simd_or(bit[5], bit[6]); \onslide<5->temp6 = simd_and(bit[4], temp5); \onslide<5->Digit = simd_andc(temp4, temp6); temp3 = simd_and(bit[2], bit[3]); temp4 = simd_andc(temp3, temp1); temp5 = simd_or(bit[5], bit[6]); temp6 = simd_and(bit[4], temp5); Digit = simd_andc(temp4, temp6); \end{semiverbatim} \end{itemize} \begin{frame}[fragile] \frametitle{From Sequential to Parallel Scanning} \begin{itemize}[<+->] \item Parabix 1: use bit scan instructions throughout. \begin{block}<+->{Parabix 1: Use Bit Scan Instructions} \begin{itemize} \item For example, to find markup, bit scan to next 1 in [{\tt <}]. \item Accelerates scanning, but still sequential. \end{itemize} \item Parabix 2: use new parallel scanning primitive. \end{block} \begin{block}<+->{Parabix 2 : Parallel Scanning Primitive} \begin{itemize} \item $s(M, C) = (M + C) \wedge \neg C$ \item Relies on little-endian representation of streams. \end{itemize} \end{itemize} \end{block} \end{frame} \onslide<2->$T$     &  {\tt --935---7---29456---7--23--6--} \\ \onslide<3->$M_0$   &  {\tt ....1...........1.......1...1.} \\ \onslide<3->$D$     &  {\tt ..111...1...11111...1..11..1..} \\ \onslide<4->$D$     &  {\tt ..111...1...11111...1..11..1..} \\ \onslide<5->$M_0 +D$ & {\tt .1......1..1........1.1....11.} \\ \onslide<6->\$M_1 = (M_0 + D)