source: docs/EuroPar2011/Pres/europar-slides.tex @ 1401

Last change on this file since 1401 was 1401, checked in by cameron, 8 years ago

Presentation

File size: 18.2 KB
Line 
1\documentclass{beamer}
2\usepackage{default}
3\usepackage{comment}
4\usepackage{color}
5\usepackage{subfigure}
6\usepackage[latin1]{inputenc}
7\usepackage{verbatim}
8\usepackage{colortbl}
9\usetheme{Warsaw}
10
11
12\title[Euro-Par 2011]{Parallel Scanning with Bitstream Addition: An XML Case Study}
13\author[Cameron, Amiri, Herdy, Lin, Shermer, Popowich]{Rob Cameron
14\and Ehsan Amiri \and Ken Herdy \and Dan Lin  \and Tom Shermer \and Fred Popowich}
15\institute{School of Computing Science\\
16Simon Fraser University
17\and International Characters, Inc.}
18\date{September 1, 2011}
19\def\CI{Core-i3}
20\def\SB{SandyBridge}
21\def\CO{Core2}
22\newenvironment{smalltabbing}{\endgraf\tiny\tiny\tabbing}{\endtabbing} 
23\newenvironment{tinytinyverbatim}{\endgraf\tiny\tiny\verbatim}{\endverbatim} 
24
25\begin{document}
26
27\begin{frame}
28\titlepage
29\end{frame}
30
31
32
33\begin{frame}
34\frametitle{Outline}
35\tableofcontents%[pausesections]
36\end{frame}
37
38\section{Parallel Bit Stream Technology}
39
40\begin{frame}[fragile]
41\frametitle{The 13th Dwarf: Finite State Machines}
42
43\begin{block}<+->{Berkeley Landscape of Parallel Computing Research}
44\begin{itemize}
45\item 13 dwarves for abstract classes of computing problems.
46\item 13th dwarf: finite state machines, parsing.
47\item Considered hardest dwarf to parallelize:
48\begin{itemize}
49\item ``embarrassingly sequential'',
50%\item
51``Nothing helps!''
52\end{itemize}
53\end{itemize}
54\end{block}
55
56\begin{block}<+->{Finite State Transitions at 500 GHz?}
57\begin{itemize}
58\item Use addition for parallel scanning.
59\item Carry propagation between bit positions = FSM transition.
60\item Commodity processors @ 3 GHz, 3 additions/cycle.
61\item 64 carry propagations per addition.
62\item Effective speed for FSM transitions: $3 \times 3 \times 64 > 500$ GHz.
63\end{itemize}
64\end{block}
65
66\end{frame}
67
68
69\begin{frame}
70\frametitle{Parallel Bit Streams: A Transform Representation of Text}
71
72\begin{itemize}[<+->]
73\item Given a byte-oriented character stream $T$, e.g., ``{\tt Ab17;}''.
74\item Transpose to 8 parallel bit streams $b_0$, $b_1$, \ldots, $b_7$.
75\item Each stream $b_k$ comprises bit $k$ of each byte of $T$.
76\end{itemize}
77\onslide<4->{
78\begin{tabular}{|c|c|c|c|c|c|} \hline
79$T$ & A & b & 1 & 7 & ; \pause \\ \hline
80ASCII & \alert<6>{0}\alert<7>{1}\alert<8>{0}00001 & \alert<6>{0}\alert<7>{1}\alert<8>{1}00010 & \alert<6>{0}\alert<7>{0}\alert<8>{1}10001 & \alert<6>{0}\alert<7>{0}\alert<8>{1}10111 & \alert<6>{0}\alert<7>{0}\alert<8>{1}11011 \pause \\ \hline
81$b_0$ & \alert<6>{0} & \alert<6>{0} & \alert<6>{0} & \alert<6>{0} & \alert<6>{0} \pause \\ \hline
82$b_1$ & \alert<7>{1} & \alert<7>{1} & \alert<7>{0} & \alert<7>{0} & \alert<7>{0} \pause \\ \hline
83$b_2$ & \alert<8>{0} & \alert<8>{1} & \alert<8>{1} & \alert<8>{1} & \alert<8>{1}  \\ \hline
84$b_3$ & 0 & 0 & 1 & 1 & 1 \\ \hline
85$b_4$ & 0 & 0 & 0 & 0 & 1 \\ \hline
86$b_5$ & 0 & 0 & 0 & 1 & 0 \\ \hline
87$b_6$ & 0 & 1 & 0 & 1 & 1 \\ \hline
88$b_7$ & 1 & 0 & 1 & 1 & 1 \\ \hline
89\end{tabular}
90}
91\end{frame}
92
93
94
95\begin{frame}
96\frametitle{Goal: High Performance Text Processing}
97
98Why form parallel bit streams?
99
100\begin{itemize}[<+->]
101\item Byte-at-a-time text processing is too slow.
102\begin{itemize}[<+->]
103\item Example: XML scan for ``{\tt <}''.
104\item Byte-at-a-time loop computes only 1 info bit per iteration!
105\end{itemize}
106\item So let's compute those bits in parallel!
107\begin{itemize}[<+->]
108\item Bitwise logic on basis streams $b_i \rightarrow${\tt [<]} stream.
109\item Process 128 positions at a time using SSE registers.
110\end{itemize}
111\item Find next ``{\tt <}'' with {\em bit scan} instruction (e.g., Intel bsf).
112\begin{itemize}[<+->]
113\item Advance up to 63 positions at once.
114\end{itemize}
115%\item Or use parallel scanning with bitstream addition!
116
117\end{itemize}
118\end{frame}
119
120
121\begin{frame}[fragile]
122\frametitle{Character Class Formation}
123\begin{itemize}[<+->]
124\item Combining 8 bits of a code unit gives a character class stream.
125\item compile({\tt [CharDef(LAngle, "<")]})
126\item 
127\begin{semiverbatim}
128temp1 = simd_or(bit[0], bit[1]);
129temp2 = simd_and(bit[2], bit[3]);
130temp3 = simd_andc(temp2, temp1);
131temp4 = simd_and(bit[4], bit[5]);
132temp5 = simd_or(bit[6], bit[7]);
133temp6 = simd_andc(temp4, temp5);
134LAngle = simd_and(temp3, temp6);
135\end{semiverbatim}
136\end{itemize}
137
138\end{frame}
139
140
141\begin{frame}[fragile]
142\frametitle{Character Class Common Subexpressions}
143\begin{itemize}[<+->]
144\item Multiple definitions have common subexpressions.
145\item compile({\tt [CharDef(LAngle, "<"), \\ CharDef(RAngle, ">")]})
146\item 
147\begin{semiverbatim}
148temp1 = simd_or(bit[0], bit[1]);
149temp2 = simd_and(bit[2], bit[3]);
150temp3 = simd_andc(temp2, temp1);
151temp4 = simd_and(bit[4], bit[5]);
152temp5 = simd_or(bit[6], bit[7]);
153temp6 = simd_andc(temp4, temp5);
154LAngle = simd_and(temp3, temp6);
155\onslide<4->temp7 = simd_andc(bit[6], bit[7]);
156\onslide<4->temp8 = simd_and(temp4, temp7);
157\onslide<4->RAngle = simd_and(temp3, temp8);
158\end{semiverbatim}
159\end{itemize}
160\end{frame}
161
162\begin{frame}[fragile]
163\frametitle{Character Class Ranges}
164\begin{itemize}[<+->]
165\item Ranges of characters are often very simple to compute.
166\item compile({\tt [CharSet('Control', ['$\backslash$x00-$\backslash$x1F']),
167           \onslide<4->{CharSet('Digit', ['0-9'])}]})
168\item 
169\begin{semiverbatim}
170temp1 = simd_or(bit[0], bit[1]);
171temp2 = simd_or(temp1, bit[2]);
172Control = simd_andc(simd_const_1(1), temp2);
173\onslide<5->temp3 = simd_and(bit[2], bit[3]);
174\onslide<5->temp4 = simd_andc(temp3, temp1);
175\onslide<5->temp5 = simd_or(bit[5], bit[6]);
176\onslide<5->temp6 = simd_and(bit[4], temp5);
177\onslide<5->Digit = simd_andc(temp4, temp6);
178\end{semiverbatim}
179\end{itemize}
180\end{frame}
181
182
183
184\section{Parallel Scanning with Bitstream Addition}
185
186\begin{frame}[fragile]
187\frametitle{From Sequential to Parallel Scanning}
188\begin{itemize}[<+->]
189\item Parabix 1: use bit scan instructions throughout.
190\begin{itemize}
191\item For example, to find markup, bit scan to next 1 in [{\tt <}].
192\item Accelerates scanning, but still sequential.
193\end{itemize}
194
195\item Parabix 2: use new parallel scanning primitive.
196\begin{itemize}
197\item $s(M, C) = (M + C) \wedge \neg C$
198\item $M$ is a stream of marker bits, marking positions to start scans.
199\item $C$ is a character class stream, marking positions to scan through.
200\item Addition and carry propagation move markers to the left!
201\item Relies on little-endian representation of streams.
202\end{itemize}
203\end{itemize}
204\end{frame}
205
206
207\begin{frame}[fragile]
208\frametitle{Parallel Scanning Illustrated}
209\begin{itemize}[<+->]
210\item<2-> Given some runs of digits in a text, $T$.
211\item<3-> Given a set of 4 markers, $M_0$.
212\item<4-> Form the digit character class stream, $D$.
213\item<5-> Simply add to advance the markers independently.
214\item<6-> Mask off the garbage.
215\end{itemize}
216\begin{tabular}{rl}
217\onslide<2->$T$     &  {\tt --935---7---29456---7--23--6--} \\
218\onslide<3->$M_0$   &  {\tt ....1...........1.......1...1.} \\
219\onslide<4->$D$     &  {\tt ..111...1...11111...1..11..1..} \\
220\onslide<5->$M_0 +D$ & {\tt .1......1..1........1.1....11.} \\
221\onslide<6->$M_1 = (M_0 + D)
222  \wedge \neg D$ & {\tt .1.........1..........1.....1.}
223\end{tabular}
224\end{frame}
225
226
227\section{Parabix2: XML Parsing Using Parallel Scanning}
228
229\begin{frame}[fragile]
230\frametitle{Example 1:  Parsing Decimal References}
231\begin{block}<+->{Simplified Grammar of Decimal References}
232\begin{center}
233\begin{tabular}{rcl}
234DecRef & ::=   &        '\verb:&#:' Digit${}^{+}$ '\verb:;:'  \\
235Digit  & ::=   &         \verb:[0-9]:
236\end{tabular}
237\end{center}
238\end{block}
239
240\begin{itemize}
241\item Simple example to show parallel scanning in action.
242\item Use [{\tt \&}] stream for marker initialization.
243\item Parse all references in parallel.
244\item Mark all errors using error streams.
245\end{itemize}
246
247
248\end{frame}
249
250\begin{frame}[fragile]
251\frametitle{Decimal Reference Parsing in Action}
252\begin{tabular}{l@{}lr}\\
253\multicolumn{2}{l}{source data $\vartriangleright$}     
254                                         & \verb`-&#978;-&9;--&#;--&#13!-`\\
255$M_0$ &                                  & \verb`.1......1....1....1.....` \pause \\
256$M_1$ & $ = n(M_0)$                      & \verb`..1......1....1....1....` \pause\\
257$E_0$ & $ = M_1 \wedge \neg $\verb:[#]:  & \verb`.........1..............` \pause\\
258$M_2$ & $ = n(M_1 \wedge \neg  E_0)$     & \verb`...1...........1....1...` \\
259$E_1$ & $ = M_2 \wedge \neg  D$          & \verb`...............1........`\pause \\
260$M_3$ & $ = s(M_2 \wedge \neg  E_1, D)$  & \verb`......1...............1.` \pause\\
261$E_2$ & $ = M_3 \wedge \neg  $\verb:[;]: & \verb`......................1.`\\
262$M_4$ & $ = M_3 \wedge \neg  E_2$        & \verb`......1.................`\\
263$E $  & $= E_0 \, | \, E_1 \, | \, E_2$  & \verb`.........1.....1......1.`
264\end{tabular}
265
266
267\end{frame}
268
269
270
271\begin{frame}[fragile]
272\frametitle{Example 2:  Parsing Start Tags}
273\begin{block}<+->{Simplified Grammar of XML Start Tags}
274\begin{center}
275\begin{tabular}{rcl}
276STag         &  ::=   &        '\verb:<:' Name (W  Attribute)* W${}^{?}$ '\verb:>:'  \\
277Attribute & ::=   &        Name W${}^{?}$ '=' W${}^{?}$ AttValue \\
278AttValue  &           ::=   &      (  `\verb:":' \verb:[^<"]*: `\verb:":') $|$ (``\verb:':'' \verb:[^<']*: ``\verb:':'') \\
279        W       &    ::=   &    (\verb:\x20: $|$ \verb:\x9: $|$ \verb:\xD: $|$ \verb:\xA:)${}^{+}$ \\
280%DQuoted & ::= & \verb:[^<"]*:  \\
281%SQuoted & ::= & \verb:[^<']*:
282\end{tabular}
283\end{center}
284\end{block}
285
286\begin{itemize}
287\item More complex, iterative syntax.
288\item Iterate through all tags in parallel.
289\item Number of iterations = max attribute count +1.
290\end{itemize}
291
292
293\end{frame}
294
295\begin{frame}[fragile]
296\frametitle{Start Tag Parsing in Action}
297\begin{center}\scriptsize
298
299\begin{tabular}{lr}\\
300source data $\vartriangleright$ & \verb`--<e a= "137">---<el2 a="17" a2="3379">---<x>--`\\
301$N = $ name chars & \verb`11.1.1...111..111.111.1..11..11..1111..111.1.11`\\
302$W = $ white space & \verb`....1..1.............1......1..................`\\
303$Q = \neg$\verb:["<]: & \verb`11.11111.111.1111.111111.11.1111.1111.1111.1111`\\
304\\
305$M_0$ & \verb`..1..............1........................1....` \pause \\
306$M_1 = n(M_0)$ & \verb`...1..............1........................1...`\\
307$M_{0,7} = s(M_1, N)$ & \verb`....1................1......................1..`\\
308$M_{0,8} = s(M_{0,7}, W) \wedge \neg$\verb:[>]: & \verb`.....1................1........................`\\
309\end{tabular}
310\end{center}
311
312\begin{itemize}
313\item Parse element name.
314\item Find closing {\tt >}, if present.
315\item Otherwise mark positions for attribute-value iteration.
316\end{itemize}
317\end{frame}
318
319
320\begin{frame}[fragile]
321\frametitle{Start Tag Parsing: First Iteration}
322\begin{center}\scriptsize
323
324\begin{tabular}{lr}\\
325source data $\vartriangleright$ & \verb`--<e a= "137">---<el2 a="17" a2="3379">---<x>--`\\
326$N = $ name chars & \verb`11.1.1...111..111.111.1..11..11..1111..111.1.11`\\
327$W = $ white space & \verb`....1..1.............1......1..................`\\
328$Q = \neg$\verb:["<]: & \verb`11.11111.111.1111.111111.11.1111.1111.1111.1111`\\
329\\
330$M_{0,8} $ & \verb`.....1................1........................` \pause \\
331\\
332$M_{1,1} = s(M_{0,8}, N)$ & \verb`......1................1.......................` \pause \\
333$M_{1,2} = s(M_{1,1}, W) \wedge$\verb:[=]: & \verb`......1................1.......................` \pause \\
334$M_{1,3} = n(M_{1,2})$ & \verb`.......1................1......................`\\
335$M_{1,4} = s(M_{1,3}, W) \wedge$\verb:["]: & \verb`........1...............1......................`\pause \\
336$M_{1,5} = n(M_{1,4})$ & \verb`.........1...............1.....................`\\
337$M_{1,6} = s(M_{1,5}, Q) \wedge$\verb:["]: & \verb`............1..............1...................`\pause \\
338$M_{1,7} = n(M_{1,6})$ & \verb`.............1..............1..................`\\
339$M_{1,8} = s(M_{1,7}, W) \wedge \neg$\verb:[>]: & \verb`.............................1.................`\\
340\end{tabular}
341\end{center}
342
343\end{frame}
344
345
346\begin{frame}[fragile]
347\frametitle{Start Tag Parsing: Final Iteration}
348\begin{center}\scriptsize
349
350\begin{tabular}{lr}\\
351source data $\vartriangleright$ & \verb`--<e a= "137">---<el2 a="17" a2="3379">---<x>--`\\
352$N = $ name chars & \verb`11.1.1...111..111.111.1..11..11..1111..111.1.11`\\
353$W = $ white space & \verb`....1..1.............1......1..................`\\
354$Q = \neg$\verb:["<]: & \verb`11.11111.111.1111.111111.11.1111.1111.1111.1111`\\
355\\
356$M_{1,8} $ & \verb`.............................1.................` \pause\\
357\\
358$M_{2,1} = s(M_{1,8}, N)$ & \verb`...............................1...............`\\
359$M_{2,2} = s(M_{2,1}, W) \wedge$\verb:[=]: & \verb`...............................1...............`\\
360$M_{2,3} = n(M_{2,2})$ & \verb`................................1..............`\\
361$M_{2,4} = s(M_{2,3}, W) \wedge$\verb:["]: & \verb`................................1..............`\\
362$M_{2,5} = n(M_{2,4})$ & \verb`.................................1.............`\\
363$M_{2,6} = s(M_{2,5}, Q) \wedge$\verb:["]: & \verb`.....................................1.........`\\
364$M_{2,7} = n(M_{2,6})$ & \verb`......................................1........`\\
365$M_{2,8} = s(M_{2,7}, W) \wedge \neg$\verb:[>]: & \verb`...............................................`
366\end{tabular}
367\end{center}
368
369\end{frame}
370
371
372
373\begin{frame}
374\frametitle{XML Parsing Plus Well-Formedness}
375
376\begin{itemize}[<+->]
377\item All XML constructs can be fully parsed using these techniques.
378\item Comments, CDATA, processing instructions must be done first.
379\item Form mask bitstream marking the interiors of these constructs.
380\item Remaining {\tt <} and {\tt \&} must be tag and reference starts.
381\item Parsing produces three types of result streams.
382\begin{itemize}[<+->]
383\item Error streams marking definite errors.
384\item Error check streams marking positions for postprocess checking.
385\item Callout streams marking constructs.
386\end{itemize}
387\end{itemize}
388\end{frame}
389
390
391\section{Compiler Technology}
392
393\begin{frame}[fragile]
394\frametitle{Pablo Compiler Input: Unbounded Stream Equations}
395\begin{center}
396
397\begin{tabular}{r l}
398& \verb`def parse_tags(classes, errors):` \\
399& \verb`  classes.C0 = Alpha` \\
400& \verb`  classes.C1 = Rangle` \\
401& \verb`  classes.C2 = Langle` \\
402& \verb`  L0 = bitutil.Advance(C2)` \\
403& \verb`  errors.E0 = L0 &~ C0` \\
404& \verb`  L1 = bitutil.ScanThru(L0, C0)` \\
405& \verb`  errors.E1 = L1 &~ C1`
406\end{tabular}
407
408\end{center}
409\end{frame}
410
411\begin{frame}[fragile]
412\frametitle{Pablo Compiler Output: Block-Oriented C++}
413
414
415\begin{center}
416\small
417\begin{tabular}{r l}
418& \verb`struct Parse_tags {` \\
419& \verb`  Parse_tags() { CarryInit(carryQ, 2); }` \\
420& \verb`  void do_block(Classes & classes, Errors & errors) {` \\
421& \verb`    BitBlock L0, L1;` \\
422& \verb`    classes.C0 = Alpha;` \\
423& \verb`    classes.C1 = Rangle;` \\
424& \verb`    classes.C2 = Langle;` \\
425& \verb`    L0 = BitBlock_advance_ci_co(C2, carryQ, 0);` \\
426& \verb`    errors.E0 = simd_andc(L0, C0);` \\
427& \verb`    L1 = BitBlock_scanthru_ci_co(L0, C0, carryQ, 1);` \\
428& \verb`    errors.E1 = simd_andc(L1, C1);` \\
429& \verb`    CarryQ_Adjust(carryQ, 2);` \\
430& \verb`  }` \\
431& \verb`  CarryDeclare(carryQ, 2);` \\
432& \verb`};`
433\end{tabular}
434
435\end{center}
436\end{frame}
437
438\section {Performance Evaluation}
439
440\begin{frame}[fragile]
441\frametitle{XML Document Characteristics}
442{\tiny\begin{table}
443\begin{center}
444\begin{tabular}{|c||r|r|r|r|r|}
445\hline
446File Name               & dewiki.xml            & jawiki.xml            & roads.gml     & po.xml        & soap.xml \\ \hline   
447File Type               & document              & document              & data          & data          & data   \\ \hline     
448File Size (kB)          & 66240                 & 7343                  & 11584         & 76450         & 2717 \\ \hline
449Markup Item Count       & 406792                & 74882                 & 280724        & 4634110       & 18004 \\ \hline
450Markup Density          & 0.07                  & 0.13                  & 0.57          & 0.76          & 0.87  \\ \hline
451\end{tabular}
452\end{center}
453\end{table}}
454\end{frame}
455
456
457\begin{frame}[fragile]
458\frametitle{Performance: CPU Cycles per kB}
459\begin{figure}
460\begin{center}
461\includegraphics[width=0.75\textwidth]{plots/corei3_TOT.pdf}
462\end{center}
463\label{corei3_TOT}
464\end{figure}
465\end{frame}
466
467%some of the numbers are roughly calculated, needs to be recalculated for final version
468% \subsubsection{Cache behavior}
469
470\begin{frame}[fragile]
471\frametitle{Cache Misses}
472\begin{figure}
473{
474\centering
475\subfigure[L1 DCache]{
476\includegraphics[width=0.31\textwidth]{plots/corei3_L1DM.pdf}
477}
478\subfigure[L2 DCache]{
479\includegraphics[width=0.31\textwidth]{plots/corei3_L2DM.pdf}
480}
481\subfigure[L3]{
482\includegraphics[width=0.31\textwidth]{plots/corei3_L3CM.pdf}
483}
484}
485\caption{Cache Misses per kB}
486\end{figure}
487\end{frame}
488
489% \subsubsection{Branches \& Branch Mispredictions (per KByte)}
490
491\begin{frame}[fragile]
492\frametitle{Branching Behaviour}
493\begin{figure}
494{
495\centering
496\subfigure[Total Branches]{
497\includegraphics[width=0.45\textwidth]{plots/corei3_BR.pdf}
498}
499\subfigure[Branch Mispredictions]{
500\includegraphics[width=0.45\textwidth]{plots/corei3_BM.pdf}
501}
502}
503\end{figure}
504\end{frame}
505
506\begin{frame}[fragile]
507\frametitle{Parabix1 SIMD Instruction Ratio (percent)}
508\begin{figure}
509\begin{center}
510\includegraphics[width=0.75\textwidth]{plots/corei3_INS_p1.pdf}
511\end{center}
512\label{corei3_INS_p1}
513\end{figure}
514\end{frame}
515
516\begin{frame}[fragile]
517\frametitle{Parabix2 SIMD Instruction Ratio (percent)}
518\begin{figure}
519\begin{center}
520\includegraphics[width=0.75\textwidth]{plots/corei3_INS_p2.pdf}
521\end{center}
522\label{corei3_INS_p2}
523\end{figure}
524\end{frame}
525
526
527\begin{frame}[fragile]
528\frametitle{Performance Scaling (CPU Cycles per kB)}
529\begin{figure}
530\centering
531\subfigure[Parabix2]{
532\includegraphics[width=0.40\textwidth]{plots/P2_scalability.pdf}
533}
534\subfigure[Expat]{
535\includegraphics[width=0.40\textwidth]{plots/Expat_scalability.pdf}
536}
537\label{Scalability}
538\end{figure}
539\end{frame}
540
541
542
543\begin{frame}[fragile]
544\frametitle{Parabix2 with 256-bit AVX}
545\begin{figure}
546\begin{center}
547\includegraphics[width=0.75\textwidth]{plots/avx.pdf}
548\end{center}
549\label{parabix-avx}
550\end{figure}
551\end{frame}
552
553
554\begin{frame}[fragile]
555\frametitle{Multithreaded Parabix}
556\begin{figure}
557\begin{center}
558\includegraphics[width=0.75\textwidth]{plots/pipeline_performance.pdf}
559\end{center}
560\label{multithreaded}
561\end{figure}
562\end{frame}
563
564\section{Conclusions}
565\begin{frame}[fragile]
566\frametitle{Concluding Remarks}
567\begin{itemize}[<+->]
568\item A new parallel scanning primitive was introduced to
569further accelerate text processing with parallel bit
570stream technology.
571\item In application to XML parsing, the new primitive is
572efficient and effective for all types of XML markup.
573\item Scanning operations on unbounded bitstreams can be
574compiled to efficient block processing code using carry
575variables.
576\item Parallel scanning techniques may benefit from
577increasing SIMD register widths such as 256-bit AVX.
578\item Multithreading of parallel bit stream code with
579parallel scanning is possible using a pipeline parallelism model.
580\end{itemize}
581\end{frame}
582
583
584
585\end{document}
Note: See TracBrowser for help on using the repository browser.