# source:docs/Working/re/Slides/pactslides.tex@4084

Last change on this file since 4084 was 4084, checked in by cameron, 5 years ago

Final version

File size: 18.1 KB
Line
1\documentclass{beamer}
2\usepackage{default}
3\usepackage{color}
4\usepackage[latin1]{inputenc}
5\usepackage{verbatim}
6\usepackage{colortbl}
7\usepackage{graphicx}
8\usepackage{tikz}
9\usepackage{multirow}
10\usepackage{pgfplots}
11
12\usetheme{Warsaw}
14
15\title[PACT 2014: Parabix grep]{Bitwise Data Parallelism in Regular Expression Matching}
16\author[Cameron, Shermer, Shriraman, Herdy, Lin, Hull, and Lin]{Rob Cameron, Tom Shermer, Arrvindh Shriraman, Ken Herdy, Dan Lin, Ben Hull, Meng Lin}
17\institute[CS@SFU]{School of Computing Science\\
18Simon Fraser University}
19\date{August 25, 2014}
20
21\begin{document}
22
23\begin{frame}
24\titlepage
25\end{frame}
26
27
28\begin{frame}
29\frametitle{Outline}
30\tableofcontents%[pausesections]
31\end{frame}
32
33\section{Introduction}
34
35\begin{frame}[fragile]
36\frametitle{Acceleration of Regular Expression Matching}
37
38\begin{block}<+->{Sequential Matching Problems}
39\begin{itemize}%[<+->]
40\item Example: quickly find instances of \verb:(^|[ ])\p{Lu}\p{Ll}+[.!?]($|[ ]): in text. 41\item Three families of sequential algorithms: NFAs, DFAs or backtracking. 42\end{itemize} 43\end{block} 44\begin{block}<+->{Parallelization} 45\begin{itemize} 46\item Berkeley ParLab: 47\begin{itemize} 48\item FSMs are 13th dwarf: hardest to parallelize. 49\item Embarassingly sequential? 50\end{itemize} 51\item Parallel application of FSMs to multiple input streams. 52\item Recent advances: coalesced FSMs, principled speculation. 53\end{itemize} 54\end{block} 55\end{frame} 56 57 58\begin{frame}[fragile] 59\frametitle{Our Approach} 60 61\begin{itemize}%[<+->] 62\item A fundamentally parallel approach in contrast to approaches 63that parallelize existing DFA or NFA algorithms. 64 65\item A new algorithm {\em family} for regular expression matching based 66on bitwise data parallelism. 67 68\item Builds on the Parabix methods that have been used for XML 69parsing and Unicode transcoding. 70 71\item Introduces the MatchStar primitive 72for simultaneous nondeterministic matching 73of character class repetitions using bitstream addition. 74 75\item Adds compilation technologies for regular expressions to Parabix 76 technology. 77 78\item Recent work: all compilers integrated together with LLVM for 79fully dynamic regular expression matching. 80\end{itemize} 81\end{frame} 82 83\begin{frame}[fragile] 84\frametitle{Bitwise Data Parallelism} 85 86\begin{itemize}%[<+->] 87\item Parabix methods use a transform representation of text. 88\item Bitstreams are formed using one bit per input byte. 89\item Eight basis bit streams are defined for bit 0, bit 1, ... bit 7 of each byte. 90\item Perform bitwise processing with wide SIMD registers. 91\begin{itemize} 92\item Process 128 bytes at a time with SSE2, Neon, Altivec. 93\item Process 256 bytes at a time with AVX2. 94\end{itemize} 95\item Parabix transform is fast using SIMD pack operations. 96\item {\em But}: programmer view is of unbounded, arbitrary length bitstreams. 97\end{itemize} 98 99\end{frame} 100 101\begin{frame}[fragile] 102\frametitle{Impressive Results in Full Unicode Matching} 103 104\begin{itemize}%[<+->] 105\item Example: \verb:(^|[ ])\p{Lu}\p{Ll}+[.!?]($|[ ]):
106\begin{itemize}
107\item Find capitalized words at ends of sentences.
108\item Use Unicode upper/lower case categories.
109\item Match against 110 MB Arabic wikimedia document.
110\end{itemize}
111\item \verb:egrep: 45,951,194,784 CPU cycles.
112\item \verb:pcregrep: 14,772,797,548 CPU cycles.
113\item \verb:icgrep: (Parabix) 653,530,064 CPU cycles.
114\item 20X acceleration over \verb:pcgregrep:, 70X over GNU \verb:egrep:.
115\end{itemize}
116\end{frame}
117
118
119\section{Parabix Technology}
120
121\begin{frame}
122\frametitle{Parallel Bit Streams: A Transform Representation of Text}
123\begin{itemize}
124\item Given a byte-oriented character stream $T$, e.g., {\tt Ab17;}''.
125\item Transpose to 8 parallel bit streams $b_0$, $b_1$, ..., $b_7$.
126\item Each stream $b_k$ comprises bit $k$ of each byte of $T$.
127\end{itemize}
128\begin{tabular}{|c|c|c|c|c|c|} \hline
129$T$ & A & b & 1 & 7 & ; \\ \hline
131$b_0$ & \alert<2>{0} & \alert<2>{0} & \alert<2>{0} & \alert<2>{0} & \alert<2>{0} \pause \\ \hline
132$b_1$ & \alert<3>{1} & \alert<3>{1} & \alert<3>{0} & \alert<3>{0} & \alert<3>{0} \pause \\ \hline
133$b_2$ & \alert<4>{0} & \alert<4>{1} & \alert<4>{1} & \alert<4>{1} & \alert<4>{1}  \\ \hline
134$b_3$ & 0 & 0 & 1 & 1 & 1 \\ \hline
135$b_4$ & 0 & 0 & 0 & 0 & 1 \\ \hline
136$b_5$ & 0 & 0 & 0 & 1 & 0 \\ \hline
137$b_6$ & 0 & 1 & 0 & 1 & 1 \\ \hline
138$b_7$ & 1 & 0 & 1 & 1 & 1 \\ \hline
139\end{tabular}
140\end{frame}
141
142\begin{frame}
143\frametitle{Parabix Programming}
144
145\begin{block}<+->{Programmer View}
146
147\begin{itemize}%[<+->]
148\item Parabix programs written as unbounded bit stream operations.
149\item Unbounded bit streams considered as arbitrarily large integers.
150\item Fundamental operations: bitwise logic, bit-stream advance (shift) and
152\end{itemize}
153\end{block}
154\begin{block}<+->{Parabix Tool Chain}
155\begin{itemize}
156\item Character Class Compiler (CCC) produces stream equations from character classes.
157\item Parallel Block Compiler (Pablo) converts unbounded stream programs to C++/SIMD.
158\item Portable SIMD library for multiple architectures.
159\end{itemize}
160\end{block}
161\end{frame}
162
163\begin{frame}[fragile]
164\frametitle{Character Class Formation}
165\begin{itemize}%[<+->]
166\item Combining 8 bits of a code unit gives a character class stream.
167\item CCC({\tt cc\_a = [a]})
168\item
169\begin{semiverbatim}
170temp1 = (bit[1] &~ bit[0])
171temp2 = (bit[2] &~ bit[3])
172temp3 = (temp1 & temp2)
173temp4 = (bit[4] | bit[5])
174temp5 = (bit[7] &~ bit[6])
175temp6 = (temp5 &~ temp4)
176cc\_a = (temp3 & temp6)
177\end{semiverbatim}
178\end{itemize}
179
180\end{frame}
181
182\begin{frame}[fragile]
183\frametitle{Character Class Ranges}
184\begin{itemize}%[<+->]
185\item Ranges of characters are often very simple to compute.
186\item CCC({\tt cc\_0\_9 = [0-9]})
187\item
188\begin{semiverbatim}
189temp7 = (bit[0] | bit[1])
190temp8 = (bit[2] & bit[3])
191temp9 = (temp8 &~ temp7)
192temp10 = (bit[5] | bit[6])
193temp11 = (bit[4] & temp10)
194cc\_0\_9 = (temp9 &~ temp11)
195\end{semiverbatim}
196\end{itemize}
197\end{frame}
198
199
200\begin{frame}[fragile]
201\frametitle{Character Class Common Subexpressions}
202\begin{itemize}
203\item Multiple definitions use common subexpressions.
204\item CCC({\tt cc\_z9 = [z9]})
205\item
206\begin{semiverbatim}
207temp12 = (bit[4] &~ bit[5])
208temp13 = (temp12 & temp5)
209temp14 = (temp9 & temp13)
210temp15 = (temp1 & temp8)
211temp16 = (bit[6] &~ bit[7])
212temp17 = (temp12 & temp16)
213temp18 = (temp15 & temp17)
214cc\_z9 = (temp14 | temp18)
215\end{semiverbatim}
216\end{itemize}
217\end{frame}
218
219\section{Regular Expression Matching with Parabix}
220
221\begin{frame}[fragile]
222\frametitle{Marker Streams}
223\begin{itemize}%[<+->]
224\item Marker stream $M_i$ indicates the positions that are reachable after item $i$ in the regular expression.
225\item Each marker stream $M_i$ has one bit for every input byte in the input file.
226\item $M_i[j] = 1$ if and only if a match to the regular expression up to and
227including item $i$ in the expression occurs at position $j-1$ in the input stream.
228\item Conceptually, marker streams are computed in parallel for all positions in the file
229at once (bitwise data parallelism).
230\item In practice, marker streams are computed block-by-block, where the
231block size is the size of a SIMD register in bits.
232\end{itemize}
233\end{frame}
234
235\begin{frame}[fragile]
236\frametitle{Marker Stream Example}
237\begin{itemize}[<+->]
238\item Consider matching regular expression \verba[0-9]*[z9] against the input text below.
239\item<2->$M_1$ marks positions after occurrences of \verb:a:.
240\item<3->$M_2$ marks positions after occurrences of \verb:a[0-9]*:.
241\item<4->$M_3$ marks positions after occurrences of \verb:a[0-9]*[z9]:.
242\end{itemize}
243
244\begin{tabular}{cr}\\
245input data  & \verba453z--b3z--az--a12949z--ca22z7--\\
246\onslide<2->$M_1$ & \verb.1...........1...1.........1.....\\
247\onslide<3->$M_2$ & \verb.1111........1...111111....111...\\
248\onslide<4->$M_3$ & \verb.....1........1.....1.11......1..
249\end{tabular}
250\end{frame}
251
252\begin{frame}[fragile]
253\frametitle{Matching Character Class Repetitions with MatchStar}
254\begin{itemize}[<+->]
255\item<2->$\text{MatchStar}(M, C) = (((M \wedge C) + C) \oplus C) \vee M$
256\item<3->Consider $M_2 = \text{MatchStar}(M_1, C)$
257\item<4->Use addition to scan each marker through the class.
258\item<5->Bits that change represent matches.
259\item<6->We also have matches at start positions in $M_1$.
260\end{itemize}
261
262\begin{tabular}{cr}\\
263\onslide<3->input data  & \verba453z--b3z--az--a12949z--ca22z7--\\
264\onslide<3->$M_1$ & \verb.1...........1...1.........1.....\\
265\onslide<3->$C = \text{\tt [0-9]}$ & \verb.111....1........11111.....11.1..\\
266\onslide<4->$T_0 = M_1 \wedge C$ & \verb.1...............1.........1.....\\
267\onslide<4->$T_1 = T_0 + C$ & \verb....1...1.............1......11..\\
268\onslide<5->$T_2 = T_1 \oplus C$ & \verb.1111............111111....111...\\
269\onslide<6->$M_2 = T_2 \vee M_1$ & \verb.1111........1...111111....111...
270\end{tabular}
271\end{frame}
272
273
274\begin{frame}[fragile]
275\frametitle{Regular Expression Compilation}
276\begin{itemize}
277\item Our regular expression compiler produces unbounded Pablo code.
278\item RE\_compile(\verba[0-9]*[z9])
279\item
280\begin{semiverbatim}
281m0 = ~0
283m2 = pablo.MatchStar(m1, cc\_0\_9)
285\end{semiverbatim}
286\end{itemize}
287\end{frame}
288
289\begin{frame}[fragile]
290\frametitle{Alternations and Optional Terms}
291\begin{itemize}
292\item Most RE features are handled naturally.
293\item RE\_compile(\verba(b?|cd))
294\item
295\begin{semiverbatim}
296m0 = ~0
298m2 = pablo.MatchStar(m1, cc\_b)
299m3 = m1 | m2      # b is optional
302m6 = m3 | m5      # two alternatives
303\end{semiverbatim}
304\end{itemize}
305\end{frame}
306
307
308\begin{frame}[fragile]
309\frametitle{Nested Repetitions Use While Loops}
310\begin{itemize}
311\item While loops are used for complex or nested repetitions.
312\item RE\_compile(\verb(a[0-9]*[z9])*)
313\item
314\begin{semiverbatim}
315m0 = ~0
316t = m0   # while test variable
317a = m0   # while result accumulator
318while t:
319   m1 = pablo.Advance(t & cc\_a)
320   m2 = pablo.MatchStar(m1, cc\_0\_9)
322   t = m3 &~ a   # iterate only for new matches
323   a = a | m3
324\end{semiverbatim}
325\end{itemize}
326\end{frame}
327
328\section{Block-by-Block Processing}
329
330\begin{frame}[fragile]
331\frametitle{Pablo Compiler Compiles to Block-by-Block Form}
332\begin{block}<+->{Pablo Input}
333\begin{semiverbatim}
334m0 = ~0
336m2 = pablo.MatchStar(m1, cc\_0\_9)
338\end{semiverbatim}
339
340\end{block}
342\begin{semiverbatim}
343m0 = simd_not(simd<1>::constant<0>());
344cryQ.set(0, blkShl(blkAnd(m0, c\c_a), cryQ.get(0), m1);
345cryQ.set(1, blkMatchStar(m1, cc\_0\_9, cryQ.get(1), m2);
346cryQ.set(2, blkShl(blkAnd(m2, cc\_z9), cryQ.get(2), m3);
348\end{semiverbatim}
349\end{block}
350\end{frame}
351
352
353\begin{frame}[fragile]
355\begin{itemize}%[<+->]
356\item Processors limit addition to 64 bits.
357\item SIMD blocks may be 128, 256 or (soon) 512 bits.
359  carries between 64-bit fields.
360\item However, a parallel method is possible.
361\begin{itemize}
362\item Extract one carry and one bubble'' bit each from 64-bit
363  fields.
364\item Apply MatchStar to produce a mask of increment'' bits.
366\end{itemize}
368\end{itemize}
369\end{frame}
370
371\begin{frame}[fragile]
373Compute the long-stream sum {\tt Z} = {\tt X} + {\tt Y}.
374\begin{tabular}{c||r|r|r|r|r|r|r|r||}\cline{2-9}
375{\tt X} & {\tt 19} & {\tt 31} & {\tt BA} & {\tt 4C} & {\tt 3D} & {\tt 45} & {\tt 21} & {\tt F1} \\ \cline{2-9}
376{\tt Y} & {\tt 22} & {\tt 12} & {\tt 45} & {\tt B3} & {\tt E2} & {\tt 16} & {\tt 17} & {\tt 36} \\ \cline{2-9}
377\alert<3>{\tt R} (add digits)& {\tt 3B} & {\tt 43} & \alert<3>{\tt FF} & \alert<3>{\tt FF} & {\tt 1F} & {\tt 5B} & {\tt 38} & {\tt 27} \\ \cline{2-9}
378{\tt x} = high\_bit({\tt X})& {\tt 0} & {\tt 0} & {\tt 1} & {\tt 0} & {\tt 0} & {\tt 0} & {\tt 0} & {\tt 1} \\ \cline{2-9}
379{\tt y} = high\_bit({\tt Y}) & {\tt 0} & {\tt 0} & {\tt 0} & {\tt 1} & {\tt 1} & {\tt 0} & {\tt 0} & {\tt 0} \\ \cline{2-9}
380{\tt r} = high\_bit({\tt Z}) & {\tt 0} & {\tt 0} & {\tt 1} & {\tt 1} & {\tt 0} & {\tt 0} & {\tt 0} & {\tt 0} \\ \cline{2-9}
381\alert<4>{\tt c} (carries)& {\tt 0} & {\tt 0} & {\tt 0} & {\tt 0} & \alert<4>{\tt 1} & {\tt 0} & {\tt 0} & \alert<4>{\tt 1} \\ \cline{2-9}
382{\tt c*2} (shift left 1)& {\tt 0} & {\tt 0} & {\tt 0} & {\tt 1} & {\tt 0} & {\tt 0} & {\tt 1} & {\tt 0} \\ \cline{2-9}
383\alert<5>{\tt b} (bubble ({\tt Z} = {\tt FF}) & {\tt 0} & {\tt 0} & \alert<5>{\tt 1} & \alert<5>{\tt 1} & {\tt 0} & {\tt 0} & {\tt 0} & {\tt 0} \\ \cline{2-9}
384\alert<6>{\tt i} = Matchstar({\tt b},{\tt c*2})& {\tt 0} & \alert<6>{\tt 1} &\alert<6> {\tt 1} & \alert<6>{\tt 1} & {\tt 0} & {\tt 0} & \alert<6>{\tt 1} & {\tt 0} \\ \cline{2-9}
385\alert<7>{\tt Z} = {\tt R} + {\tt i} & {\tt 3B} &  \alert<7>{\tt 44} & \alert<7>{\tt 0} & \alert<7>{\tt 0} & {\tt 1F} & {\tt 5B} & \alert<7>{\tt 39} & {\tt 27} \\ \cline{2-9}
386\end{tabular}
387\end{frame}
388
389
390\section{Performance}
391
392
393
394
395
396\begin{frame}[fragile]
397\frametitle{Experimental Evaluation}
398\begin{block}<+->{Compare Parabix grep vs. Alternatives}
399\begin{itemize}
400\item nrgrep (NFA-based)
401\item re2grep (DFA-based)
402\item GNU egrep, pcregrep
403\end{itemize}
404\end{block}
405\begin{block}<+->{Test Expressions}
406
407
408{\tiny
409\begin{tabular}{c|c}
410\hline
411Name            & Expression    \\ \hline
412@               & \verb@              \\ \hline
413Date            & \verb([0-9][0-9]?)/([0-9][0-9]?)/([0-9][0-9]([0-9][0-9])?)          \\ \hline
414Email           & \verb([^ @]+)@([^ @]+)              \\ \hline
415URI     & \verb(([a-zA-Z][a-zA-Z0-9]*)://|mailto:)([^ /]+)(/[^ ]*)?|([^ @]+)@([^ @]+)         \\ \hline
416Hex             & \verb[ ](0x)?([a-fA-F0-9][a-fA-F0-9])+[.:,?! ]              \\ \hline
417StarHt          & \verb[A-Z]((([a-zA-Z]*a[a-zA-Z]*[ ])*[a-zA-Z]*e[a-zA-Z]*[ ])*[a-zA-Z]*s[a-zA-Z]*[ ])*[.?!]          \\ \hline
418\end{tabular}
419}
420\end{block}
421
422\end{frame}
423
424
425\begin{frame}[fragile]
426\frametitle{SSE2 Performance}
427\begin{tikzpicture}
428\begin{axis}[
429xtick=data,
430ylabel=Cycles per Byte,
431xticklabels={@,Date,Email,URI,Hex,StarHeight},
432tick label style={font=\tiny},
433enlarge x limits=0.15,
434%enlarge y limits={0.15, upper},
435ymin=0,
436legend style={at={(0.5,0.9)},
437anchor=north,legend columns=-1},
438ymax=44,
439ybar,
440bar width=7pt,
441%visualization depends on=y \as \rawy,
442]
444file {../data/cycles-bitstreams.dat};
446file {../data/cycles-nrgrep112.dat};
448file {../data/cycles-gre2p.dat};
449
450\legend{bitstreams,nrgrep,gre2p,Annot}
451\end{axis}
452\end{tikzpicture}
453
454\end{frame}
455
456\begin{frame}[fragile]
457\frametitle{IPC}
458
459\begin{tikzpicture}
460\begin{axis}[
461xtick=data,
462ylabel=Instructions per Cycle,
463xticklabels={@,Date,Email,URI,Hex,StarHeight},
464tick label style={font=\tiny},
465enlarge x limits=0.15,
466%enlarge y limits={0.15, upper},
467ymin=0,
468legend style={at={(0.5,0.9)},
469anchor=north,legend columns=-1},
470ybar,
471bar width=7pt,
472]
474file {../data/ipc-bitstreams.dat};
476file {../data/ipc-nrgrep112.dat};
478file {../data/ipc-gre2p.dat};
479
480\legend{bitstreams,nrgrep,gre2p,Annot}
481\end{axis}
482\end{tikzpicture}
483\end{frame}
484
485
486\begin{frame}[fragile]
487\frametitle{SIMD Scalability}
488\begin{tikzpicture}
489\begin{axis}[
490xtick=data,
491ylabel=AVX2 Instruction Reduction,
492xticklabels={@,Date,Email,URI,Hex,StarHeight},
493tick label style={font=\tiny},
494enlarge x limits=0.15,
495%enlarge y limits={0.15, upper},
496ymin=0,
497legend style={at={(0.5,0.15)},
498anchor=north,legend columns=-1},
499ybar,
500bar width=7pt,
501]
503file {../data/sse2-avx2-instr-red-bitstreams.dat};
505file {../data/sse2-avx2-instr-red-nrgrep112.dat};
507file {../data/sse2-avx2-instr-red-gre2p.dat};
508
509\legend{bitstreams,nrgrep,gre2p,Annot}
510\end{axis}
511\end{tikzpicture}
512\end{frame}
513
514
515\begin{frame}[fragile]
516\frametitle{Speedups Achieved}
517
518\begin{center}
519\begin{tabular}{|c|c|c|c|} \hline
520\multirow{2}{*}{Expression} & \multicolumn{3}{c|}{Bitstream/AVX2 grep Speedup} \\ \cline{2-4}
521& vs. nrgrep & vs. gre2p & vs. GNU egrep \\ \hline \hline
522At & 3.5X & 34X & 1.6X\\ \hline
523Date & 0.76X & 13X & 48X\\ \hline
524Email & 9.5X & 28X & 12X\\ \hline
525URI & 6.6X & 27X & 518X\\ \hline
526Hex & 8.1X & 105X & 267X\\ \hline
527StarHeight & 1.9X & 7.6X & 97X\\ \hline
528\end{tabular}
529\end{center}
530\end{frame}
531
532
533\begin{frame}[fragile]
534\frametitle{GPU Performance}
535\begin{tikzpicture}
536\begin{axis}[
537xtick=data,
538ylabel=Running Time (ms per megabyte),
539xticklabels={@,Date,Email,URI,Hex,StarHeight},
540tick label style={font=\tiny},
541enlarge x limits=0.15,
542%enlarge y limits={0.15, upper},
543ymin=0,
544legend style={at={(0.5,0.8)},
545anchor=north,legend columns=-1},
546ybar,
547bar width=7pt,
548]
550file {../data/ssetime.dat};
552file {../data/avxtime.dat};
554file {../data/gputime.dat};
555
556\legend{SSE2,AVX2,GPU,Annot}
557\end{axis}
558\end{tikzpicture}
559
560
561\end{frame}
562
563\section{Conclusion}
564\begin{frame}[fragile]
565\frametitle{Results}
566\begin{itemize}%[<+->]
567\item A new class of parallel regular expression algorithms has been introduced
568based on the concept of bitwise data parallelism and MatchStar.
569\item Single core acceleration over sequential implementations can be dramatic.
570\item A long-stream addition technique has been developed to allow MatchStar to
571scale directly with SIMD instruction width.
572\item Perfect scaling in instruction count was observed with 256-bit AVX2
573technology versus 128-bit SIMD technology except for nested repetition.
574\item GPU implementations show promise, but need additional work.
575\end{itemize}
576\end{frame}
577
578\begin{frame}[fragile]
579\frametitle{Ongoing/Future Work}
580
581\begin{itemize}%[<+->]
582\item The prototype technologies have now been re-implemented in
583a single C++ executable combining 4 compilers.
584\begin{itemize}
585\item CCC: Character class compiler
586\item RE\_compile: regular expression compiler
587\item Pablo: Block-at-a-time compiler
588\item LLVM: Fully dynamic code generation.
589\end{itemize}
590\item Compilation overhead is high, but tolerable for large files.
591