source: docs/EuroPar2011/Pres/europar-slides.tex @ 1423

Last change on this file since 1423 was 1423, checked in by cameron, 8 years ago

Further improvements

File size: 17.5 KB
Line 
1\documentclass{beamer}
2\usepackage{default}
3\usepackage{comment}
4\usepackage{color}
5\usepackage{subfigure}
6\usepackage[latin1]{inputenc}
7\usepackage{verbatim}
8\usepackage{colortbl}
9\usetheme{Warsaw}
10
11
12\title[Euro-Par 2011]{Parallel Scanning with Bitstream Addition: An XML Case Study}
13\author[Cameron, Amiri, Herdy, Lin, Shermer, Popowich]{Rob Cameron
14\and Ehsan Amiri \and Ken Herdy \and Dan Lin  \and Tom Shermer \and Fred Popowich}
15\institute{School of Computing Science\\
16Simon Fraser University
17\and International Characters, Inc.}
18\date{September 1, 2011}
19\def\CI{Core-i3}
20\def\SB{SandyBridge}
21\def\CO{Core2}
22\newenvironment{smalltabbing}{\endgraf\tiny\tiny\tabbing}{\endtabbing} 
23\newenvironment{tinytinyverbatim}{\endgraf\tiny\tiny\verbatim}{\endverbatim} 
24
25\begin{document}
26
27\begin{frame}
28\titlepage
29\end{frame}
30
31\begin{frame}[fragile]
32\frametitle{The 13th Dwarf: Finite State Machines}
33
34\begin{block}<+->{Berkeley Landscape of Parallel Computing Research}
35\begin{itemize}
36\item 13 dwarves for abstract classes of computing problems.
37\item 13th dwarf: finite state machines, parsing.
38\item Considered hardest dwarf to parallelize:
39\begin{itemize}
40\item ``embarassingly sequential'',
41%\item
42``Nothing helps!''
43\end{itemize}
44\end{itemize}
45\end{block}
46
47\begin{block}<+->{64 Finite State Transitions per CPU Cycle}
48\begin{itemize}
49\item Main contribution: addition for parallel scanning.
50\item Carry propagation between bit positions = FSM transition.
51\item 64 carry propagations per addition on commodity processors.
52\item Previously unexploited parallelism.
53\end{itemize}
54\end{block}
55
56\end{frame}
57
58
59
60\begin{frame}
61\frametitle{Outline}
62\tableofcontents%[pausesections]
63\end{frame}
64
65\section{Parallel Bit Stream Technology}
66
67
68
69\begin{frame}
70\frametitle{Parallel Bit Streams: A Transform Representation of Text}
71
72\begin{itemize}
73\item Given a byte-oriented character stream $T$, e.g., ``{\tt Ab17;}''.
74\item Transpose to 8 parallel bit streams $b_0$, $b_1$, ..., $b_7$.
75\item Each stream $b_k$ comprises bit $k$ of each byte of $T$.
76\end{itemize}
77\begin{tabular}{|c|c|c|c|c|c|} \hline
78$T$ & A & b & 1 & 7 & ; \\ \hline
79ASCII & \alert<2>{0}\alert<3>{1}\alert<4>{0}00001 & \alert<2>{0}\alert<3>{1}\alert<4>{1}00010 & \alert<2>{0}\alert<3>{0}\alert<4>{1}10001 & \alert<2>{0}\alert<3>{0}\alert<4>{1}10111 & \alert<2>{0}\alert<3>{0}\alert<4>{1}11011 \pause \\ \hline
80$b_0$ & \alert<2>{0} & \alert<2>{0} & \alert<2>{0} & \alert<2>{0} & \alert<2>{0} \pause \\ \hline
81$b_1$ & \alert<3>{1} & \alert<3>{1} & \alert<3>{0} & \alert<3>{0} & \alert<3>{0} \pause \\ \hline
82$b_2$ & \alert<4>{0} & \alert<4>{1} & \alert<4>{1} & \alert<4>{1} & \alert<4>{1}  \\ \hline
83$b_3$ & 0 & 0 & 1 & 1 & 1 \\ \hline
84$b_4$ & 0 & 0 & 0 & 0 & 1 \\ \hline
85$b_5$ & 0 & 0 & 0 & 1 & 0 \\ \hline
86$b_6$ & 0 & 1 & 0 & 1 & 1 \\ \hline
87$b_7$ & 1 & 0 & 1 & 1 & 1 \\ \hline
88\end{tabular}
89\end{frame}
90
91
92
93\begin{frame}[fragile]
94\frametitle{Goal: High Performance Text Processing}
95
96
97\begin{block}<+->{Byte-at-a-time text processing is too slow.}
98\begin{itemize}
99\item Example: XML scan for ``{\tt <}''.
100\item Byte-at-time loop computes only 1 info bit per iteration!
101\end{itemize}
102\end{block}
103\begin{block}<+->{So let's compute those bits in parallel}
104\begin{itemize}
105\item Bitwise logic on basis streams $b_i \rightarrow${\tt [<]} stream.
106\item Process 128 positions at a time using SSE registers.
107\end{itemize}
108\end{block}
109\begin{block}<+->{Parabix 1 XML Parser}
110\begin{itemize}
111\item Find next ``{\tt <}'' with {\em bit scan} instruction (e.g., Intel bsf).
112\item Advance up to 63 positions at once.
113\end{itemize}
114\end{block}
115\end{frame}
116
117
118\begin{frame}[fragile]
119\frametitle{Character Class Formation}
120\begin{itemize}
121\item Combining 8 bits of a code unit gives a character class stream.
122\item compile({\tt [CharDef(LAngle, "<")]})
123\item 
124\begin{semiverbatim}
125temp1 = simd_or(bit[0], bit[1]);
126temp2 = simd_and(bit[2], bit[3]);
127temp3 = simd_andc(temp2, temp1);
128temp4 = simd_and(bit[4], bit[5]);
129temp5 = simd_or(bit[6], bit[7]);
130temp6 = simd_andc(temp4, temp5);
131LAngle = simd_and(temp3, temp6);
132\end{semiverbatim}
133\end{itemize}
134
135\end{frame}
136
137
138\begin{frame}[fragile]
139\frametitle{Character Class Common Subexpressions}
140\begin{itemize}
141\item Multiple definitions have common subexpressions.
142\item compile({\tt [CharDef(LAngle, "<"), \\ \alert{CharDef(RAngle, ">")}]})
143\item 
144\begin{semiverbatim}
145temp1 = simd_or(bit[0], bit[1]);
146temp2 = simd_and(bit[2], bit[3]);
147temp3 = simd_andc(temp2, temp1);
148temp4 = simd_and(bit[4], bit[5]);
149temp5 = simd_or(bit[6], bit[7]);
150temp6 = simd_andc(temp4, temp5);
151LAngle = simd_and(temp3, temp6);
152\alert{temp7 = simd_andc(bit[6], bit[7]);
153temp8 = simd_and(temp4, temp7);
154RAngle = simd_and(temp3, temp8);}
155\end{semiverbatim}
156\end{itemize}
157\end{frame}
158
159\begin{frame}[fragile]
160\frametitle{Character Class Ranges}
161\begin{itemize}
162\item Ranges of characters are often very simple to compute.
163\item compile({\tt [CharSet('Control', ['$\backslash$x00-$\backslash$x1F']),
164           {CharSet('Digit', ['0-9']})]]})
165\item 
166\begin{semiverbatim}
167temp1 = simd_or(bit[0], bit[1]);
168temp2 = simd_or(temp1, bit[2]);
169Control = simd_andc(simd_const_1(1), temp2)
170temp3 = simd_and(bit[2], bit[3]);
171temp4 = simd_andc(temp3, temp1);
172temp5 = simd_or(bit[5], bit[6]);
173temp6 = simd_and(bit[4], temp5);
174Digit = simd_andc(temp4, temp6);
175\end{semiverbatim}
176\end{itemize}
177\end{frame}
178
179
180
181\section{Parallel Scanning with Bitstream Addition}
182
183\begin{frame}[fragile]
184\frametitle{From Sequential to Parallel Scanning}
185\begin{block}<+->{Parabix 1: Use Bit Scan Instructions}
186\begin{itemize}
187\item For example, to find markup, bit scan to next 1 in [{\tt <}].
188\item Accelerates scanning, but still sequential.
189\end{itemize}
190\end{block}
191\begin{block}<+->{Parabix 2 : Parallel Scanning Primitive}
192\begin{itemize}
193\item $s(M, C) = (M + C) \wedge \neg C$
194\item $M$ is a stream of marker bits, marking positions to start scans.
195\item $C$ is a character class stream, marking positions to scan through.
196\item Addition and carry propagation moves markers to the left!
197\item Relies on little-endian representation of streams.
198\end{itemize}
199\end{block}
200\end{frame}
201
202
203\begin{frame}[fragile]
204\frametitle{Parallel Scanning Illustrated}
205\begin{itemize}[<+->]
206\item<2-> Given some runs of digits in a text, $T$.
207\item<3-> Given a set of 4 markers, $M_0$.
208\item<4-> Form the digit character class stream, $D$.
209\item<5-> Simply add to advance the markers independently.
210\item<6-> Mask off the garbage.
211\end{itemize}
212\begin{tabular}{rl}
213\onslide<2->$T$     &  {\tt --935---7---29456---7--23--6--} \\
214\onslide<3->$M_0$   &  {\tt ....1...........1.......1...1.} \\
215\onslide<4->$D$     &  {\tt ..111...1...11111...1..11..1..} \\
216\onslide<5->$M_0 +D$ & {\tt .1......1..1........1.1....11.} \\
217\onslide<6->$M_1 = (M_0 + D)
218  \wedge \neg D$ & {\tt .1.........1..........1.....1.}
219\end{tabular}
220\end{frame}
221
222
223\section{Parabix2: XML Parsing Using Parallel Scanning}
224
225\begin{frame}[fragile]
226\frametitle{Example 1:  Parsing Decimal References}
227\begin{block}<+->{Simplified Grammar of Decimal References}
228\begin{center}
229\begin{tabular}{rcl}
230DecRef & ::=   &        '\verb:&#:' Digit${}^{+}$ '\verb:;:'  \\
231Digit  & ::=   &         \verb:[0-9]:
232\end{tabular}
233\end{center}
234\end{block}
235
236\begin{itemize}
237\item Simple example to show parallel scanning in action.
238\item Use [{\tt \&}] stream for marker initialization.
239\item Parse all references in parallel.
240\item Mark all errors using error streams.
241\end{itemize}
242
243
244\end{frame}
245
246\begin{frame}[fragile]
247\frametitle{Decimal Reference Parsing in Action}
248\begin{tabular}{l@{}lr}\\
249\multicolumn{2}{l}{source data $\vartriangleright$}     
250                                         & \verb`-&#978;-&9;--&#;--&#13!-`\\
251$M_0$ &                                  & \verb`.1......1....1....1.....` \pause \\
252$M_1$ & $ = n(M_0)$                      & \verb`..1......1....1....1....` \pause\\
253$E_0$ & $ = M_1 \wedge \neg $\verb:[#]:  & \verb`.........1..............` \pause\\
254$M_2$ & $ = n(M_1 \wedge \neg  E_0)$     & \verb`...1...........1....1...` \\
255$E_1$ & $ = M_2 \wedge \neg  D$          & \verb`...............1........`\pause \\
256$M_3$ & $ = s(M_2 \wedge \neg  E_1, D)$  & \verb`......1...............1.` \pause\\
257$E_2$ & $ = M_3 \wedge \neg  $\verb:[;]: & \verb`......................1.`\\
258$M_4$ & $ = M_3 \wedge \neg  E_2$        & \verb`......1.................`\\
259$E $  & $= E_0 \, | \, E_1 \, | \, E_2$  & \verb`.........1.....1......1.`
260\end{tabular}
261
262
263\end{frame}
264
265
266
267\begin{frame}[fragile]
268\frametitle{Example 2:  Parsing Start Tags}
269\begin{block}<+->{Simplified Grammar of XML Start Tags}
270\begin{center}
271\begin{tabular}{rcl}
272STag         &  ::=   &        '\verb:<:' Name (W  Attribute)* W${}^{?}$ '\verb:>:'  \\
273Attribute & ::=   &        Name W${}^{?}$ '=' W${}^{?}$ AttValue \\
274AttValue  &           ::=   &      (  `\verb:":' \verb:[^<"]*: `\verb:":') $|$ (``\verb:':'' \verb:[^<']*: ``\verb:':'') \\
275        W       &    ::=   &    (\verb:\x20: $|$ \verb:\x9: $|$ \verb:\xD: $|$ \verb:\xA:)${}^{+}$ \\
276%DQuoted & ::= & \verb:[^<"]*:  \\
277%SQuoted & ::= & \verb:[^<']*:
278\end{tabular}
279\end{center}
280\end{block}
281
282\begin{itemize}
283\item More complex, iterative syntax.
284\item Iterate through all tags in parallel.
285\item Number of iterations = max attribute count +1.
286\end{itemize}
287
288
289\end{frame}
290
291\begin{frame}[fragile]
292\frametitle{Start Tag Parsing in Action}
293\begin{center}\scriptsize
294
295\begin{tabular}{lr}\\
296source data $\vartriangleright$ & \verb`--<e a= "137">---<el2 a="17" a2="3379">---<x>--`\\
297$N = $ name chars & \verb`11.1.1...111..111.111.1..11..11..1111..111.1.11`\\
298$W = $ white space & \verb`....1..1.............1......1..................`\\
299$Q = \neg$\verb:["<]: & \verb`11.11111.111.1111.111111.11.1111.1111.1111.1111`\\
300\\
301$M_0$ & \verb`..1..............1........................1....` \pause \\
302$M_1 = n(M_0)$ & \verb`...1..............1........................1...`\\
303$M_{0,7} = s(M_1, N)$ & \verb`....1................1......................1..`\\
304$M_{0,8} = s(M_{0,7}, W) \wedge \neg$\verb:[>]: & \verb`.....1................1........................`\\
305\end{tabular}
306\end{center}
307
308\begin{itemize}
309\item Parse element name.
310\item Find closing {\tt >}, if present.
311\item Otherwise mark positions for attribute-value iteration.
312\end{itemize}
313\end{frame}
314
315
316\begin{frame}[fragile]
317\frametitle{Start Tag Parsing: First Iteration}
318\begin{center}\scriptsize
319
320\begin{tabular}{lr}\\
321source data $\vartriangleright$ & \verb`--<e a= "137">---<el2 a="17" a2="3379">---<x>--`\\
322$N = $ name chars & \verb`11.1.1...111..111.111.1..11..11..1111..111.1.11`\\
323$W = $ white space & \verb`....1..1.............1......1..................`\\
324$Q = \neg$\verb:["<]: & \verb`11.11111.111.1111.111111.11.1111.1111.1111.1111`\\
325\\
326$M_{0,8} $ & \verb`.....1................1........................` \pause \\
327\\
328$M_{1,1} = s(M_{0,8}, N)$ & \verb`......1................1.......................` \pause \\
329$M_{1,2} = s(M_{1,1}, W) \wedge$\verb:[=]: & \verb`......1................1.......................` \pause \\
330$M_{1,3} = n(M_{1,2})$ & \verb`.......1................1......................`\\
331$M_{1,4} = s(M_{1,3}, W) \wedge$\verb:["]: & \verb`........1...............1......................`\pause \\
332$M_{1,5} = n(M_{1,4})$ & \verb`.........1...............1.....................`\\
333$M_{1,6} = s(M_{1,5}, Q) \wedge$\verb:["]: & \verb`............1..............1...................`\pause \\
334$M_{1,7} = n(M_{1,6})$ & \verb`.............1..............1..................`\\
335$M_{1,8} = s(M_{1,7}, W) \wedge \neg$\verb:[>]: & \verb`.............................1.................`\\
336\end{tabular}
337\end{center}
338
339\end{frame}
340
341
342\begin{frame}[fragile]
343\frametitle{Start Tag Parsing: Final Iteration}
344\begin{center}\scriptsize
345
346\begin{tabular}{lr}\\
347source data $\vartriangleright$ & \verb`--<e a= "137">---<el2 a="17" a2="3379">---<x>--`\\
348$N = $ name chars & \verb`11.1.1...111..111.111.1..11..11..1111..111.1.11`\\
349$W = $ white space & \verb`....1..1.............1......1..................`\\
350$Q = \neg$\verb:["<]: & \verb`11.11111.111.1111.111111.11.1111.1111.1111.1111`\\
351\\
352$M_{1,8} $ & \verb`.............................1.................` \pause\\
353\\
354$M_{2,1} = s(M_{1,8}, N)$ & \verb`...............................1...............`\\
355$M_{2,2} = s(M_{2,1}, W) \wedge$\verb:[=]: & \verb`...............................1...............`\\
356$M_{2,3} = n(M_{2,2})$ & \verb`................................1..............`\\
357$M_{2,4} = s(M_{2,3}, W) \wedge$\verb:["]: & \verb`................................1..............`\\
358$M_{2,5} = n(M_{2,4})$ & \verb`.................................1.............`\\
359$M_{2,6} = s(M_{2,5}, Q) \wedge$\verb:["]: & \verb`.....................................1.........`\\
360$M_{2,7} = n(M_{2,6})$ & \verb`......................................1........`\\
361$M_{2,8} = s(M_{2,7}, W) \wedge \neg$\verb:[>]: & \verb`...............................................`
362\end{tabular}
363\end{center}
364
365\end{frame}
366
367
368
369\begin{frame}
370\frametitle{XML Parsing and Well-Formedness Checking}
371
372\begin{itemize}
373\item All XML constructs can be fully parsed using these techniques.
374\item Parse comments, CDATA, processing instructions first.
375\item Form mask bitstream marking the interiors of these constructs.
376\item Remaining {\tt <} and {\tt \&} must be tag and reference starts.
377\item Parsing produces three types of results streams.
378\begin{itemize}
379\item Error streams marking definite errors.
380\item Error check streams marking positions for postprocess checking.
381\item Callout streams marking constructs.
382\end{itemize}
383\end{itemize}
384\end{frame}
385
386
387\section{Compiler Technology}
388
389\begin{frame}[fragile]
390\frametitle{Pablo Compiler Input: Unbounded Stream Equations}
391\begin{center}
392
393\begin{tabular}{r l}
394& \verb`def parse_tags(classes, errors):` \\
395& \verb`  classes.C0 = Alpha` \\
396& \verb`  classes.C1 = Rangle` \\
397& \verb`  classes.C2 = Langle` \\
398& \verb`  L0 = bitutil.Advance(C2)` \\
399& \verb`  errors.E0 = L0 &~ C0` \\
400& \verb`  L1 = bitutil.ScanThru(L0, C0)` \\
401& \verb`  errors.E1 = L1 &~ C1`
402\end{tabular}
403
404\end{center}
405\end{frame}
406
407\begin{frame}[fragile]
408\frametitle{Pablo Compiler Output: Block-Oriented C++}
409
410
411\begin{center}
412\small
413\begin{tabular}{r l}
414& \verb`struct Parse_tags {` \\
415& \verb`  Parse_tags() { CarryInit(carryQ, 2); }` \\
416& \verb`  void do_block(Classes & classes, Errors & errors) {` \\
417& \verb`    BitBlock L0, L1;` \\
418& \verb`    classes.C0 = Alpha;` \\
419& \verb`    classes.C1 = Rangle;` \\
420& \verb`    classes.C2 = Langle;` \\
421& \verb`    L0 = BitBlock_advance_ci_co(C2, carryQ, 0);` \\
422& \verb`    errors.E0 = simd_andc(L0, C0);` \\
423& \verb`    L1 = BitBlock_scanthru_ci_co(L0, C0, carryQ, 1);` \\
424& \verb`    errors.E1 = simd_andc(L1, C1);` \\
425& \verb`    CarryQ_Adjust(carryQ, 2);` \\
426& \verb}` \\
427& \verb`  CarryDeclare(carryQ, 2);` \\
428& \verb`};`
429\end{tabular}
430
431\end{center}
432\end{frame}
433
434\section {Performance Evaluation}
435
436\begin{frame}[fragile]
437\frametitle{XML Document Characteristics}
438{\tiny\begin{table}
439\begin{center}
440\begin{tabular}{|c||r|r|r|r|r|}
441\hline
442File Name               & dewiki.xml            & jawiki.xml            & roads.gml     & po.xml        & soap.xml \\ \hline   
443File Type               & document              & document              & data          & data          & data   \\ \hline     
444File Size (kB)          & 66240                 & 7343                  & 11584         & 76450         & 2717 \\ \hline
445Markup Item Count       & 406792                & 74882                 & 280724        & 4634110       & 18004 \\ \hline
446Markup Density          & 0.07                  & 0.13                  & 0.57          & 0.76          & 0.87  \\ \hline
447\end{tabular}
448\end{center}
449\end{table}}
450\end{frame}
451
452
453\begin{frame}[fragile]
454\frametitle{Execution Time: CPU Cycles per kB}
455\begin{figure}
456\begin{center}
457\includegraphics[width=0.75\textwidth]{plots/corei3_TOT.pdf}
458\end{center}
459\label{corei3_TOT}
460\end{figure}
461\end{frame}
462
463%some of the numbers are roughly calculated, needs to be recalculated for final version
464% \subsubsection{Cache behavior}
465
466\begin{frame}[fragile]
467\frametitle{Cache Misses}
468\begin{figure}
469{
470\centering
471\subfigure[L1 DCache]{
472\includegraphics[width=0.31\textwidth]{plots/corei3_L1DM.pdf}
473}
474\subfigure[L2 DCache]{
475\includegraphics[width=0.31\textwidth]{plots/corei3_L2DM.pdf}
476}
477\subfigure[L3]{
478\includegraphics[width=0.31\textwidth]{plots/corei3_L3CM.pdf}
479}
480}
481\caption{Cache Misses per kB}
482\end{figure}
483\end{frame}
484
485% \subsubsection{Branches \& Branch Mispredictions (per KByte)}
486
487\begin{frame}[fragile]
488\frametitle{Branching Behaviour}
489\begin{figure}
490{
491\centering
492\subfigure[Total Branches]{
493\includegraphics[width=0.45\textwidth]{plots/corei3_BR.pdf}
494}
495\subfigure[Branch Mispredictions]{
496\includegraphics[width=0.45\textwidth]{plots/corei3_BM.pdf}
497}
498}
499\end{figure}
500\end{frame}
501
502\begin{frame}[fragile]
503\frametitle{Percent SIMD Instructions - Parabix 1}
504\begin{figure}
505\begin{center}
506\includegraphics[width=0.75\textwidth]{plots/corei3_INS_p1.pdf}
507\end{center}
508\label{corei3_INS_p1}
509\end{figure}
510\end{frame}
511
512\begin{frame}[fragile]
513\frametitle{Percent SIMD Instructions - Parabix 2}
514\begin{figure}
515\begin{center}
516\includegraphics[width=0.75\textwidth]{plots/corei3_INS_p2.pdf}
517\end{center}
518\label{corei3_INS_p2}
519\end{figure}
520\end{frame}
521
522
523
524\begin{frame}[fragile]
525\frametitle{Multithreaded Parabix}
526\begin{figure}
527\begin{center}
528\includegraphics[width=0.75\textwidth]{plots/pipeline_performance.pdf}
529\end{center}
530\label{multithreaded}
531\end{figure}
532Our latest results show 2X further acceleration using pipeline parallelism on a 4 core machine.
533\end{frame}
534
535\section{Conclusions}
536\begin{frame}[fragile]
537\frametitle{Concluding Remarks}
538\begin{itemize}[<+->]
539\item A new parallel scanning primitive was introduced to
540further accelerate text processing with parallel bit
541stream technology.
542\item In application to XML parsing, the new primitive is
543efficient and effective for all types of XML markup.
544\item Scanning operations on unbounded bitstreams can be
545compiled to efficient block processing code using carry
546variables.
547\item Multithreading of parallel bit stream code with
548parallel scanning is possible using a pipeline parallelism model.
549\end{itemize}
550\end{frame}
551
552
553
554\end{document}
Note: See TracBrowser for help on using the repository browser.