# source:docs/Working/re/Slides/pactslides.tex@4081

Last change on this file since 4081 was 4081, checked in by cameron, 5 years ago

Presentation slides

File size: 14.7 KB
Line
1\documentclass{beamer}
2\usepackage{default}
3\usepackage{color}
4\usepackage[latin1]{inputenc}
5\usepackage{verbatim}
6\usepackage{colortbl}
7\usepackage{graphicx}
8\usepackage{tikz}
9\usepackage{multirow}
10\usepackage{pgfplots}
11
12\usetheme{Warsaw}
14
15\title[PACT 2014: Parabix grep]{Bitwise Data Parallelism in Regular Expression Matching}
16\author[Cameron, Shermer, Shriraman, Herdy, Lin, Hull, and Lin]{Rob Cameron, Tom Shermer, Arrvindh Shriraman, Ken Herdy, Dan Lin, Ben Hull, Meng Lin}
17\institute[CS@SFU]{School of Computing Science\\
18Simon Fraser University}
19\date{August 25, 2014}
20
21\begin{document}
22
23\begin{frame}
24\titlepage
25\end{frame}
26
27
28\begin{frame}
29\frametitle{Outline}
30\tableofcontents%[pausesections]
31\end{frame}
32
33\section{Introduction}
34
35\begin{frame}[fragile]
36\frametitle{Acceleration of Regular Expression Matching}
37
38\begin{itemize}%[<+->]
39\item Example: quickly find instances of \verb:(^|[ ])\p{Lu}\p{Ll}+[.!?]($|[ ]): in text. 40\item Sequential algorithms use finite automata or backtracking. 41\item Parallelizing these approaches is difficult. 42\begin{itemize} 43\item Finite state machines are the 13th (and hardest) dwarf'' in the Berkeley Landscape of Parallel Computing Research. 44\item Embarassingly sequential? 45\item Some success in parallel application of FSMs to multiple input streams. 46\item Recent work shows some promise using techniques such as coalesced FSMs and principled speculation. 47\end{itemize} 48\end{itemize} 49\end{frame} 50 51 52\begin{frame}[fragile] 53\frametitle{Our Approach} 54 55\begin{itemize}[<+->] 56\item A new algorithm family for regular expression matching based 57on bitwise data parallelism. 58 59\item A fundamentally parallel approach in contrast to approaches 60that parallelize existing DFA or NFA algorithms. 61 62\item Builds on the Parabix methods that have been used for XML 63parsing and Unicode transcoding. 64 65\item Uses bitstream addition for simultaneous nondeterministic matching 66of character class repetitions (MatchStar primitive). 67 68\item Compilation technologies for regular expressions (new), character classes (existing), 69unbounded bitstreams (existing). 70 71\item Recent work: all compilers integrated together with LLVM for 72fully dynamic regular expression matching. 73\end{itemize} 74\end{frame} 75 76\begin{frame}[fragile] 77\frametitle{Bitwise Data Parallelism} 78 79\begin{itemize}%[<+->] 80\item Parabix methods use a transform representation of text. 81\item Bitstreams are formed using one bit per input byte. 82\item Eight basis bit streams are defined for bit 0, bit 1, ... bit 7 of each byte. 83\item Perform bitwise processing with wide SIMD registers. 84\begin{itemize} 85\item Process 128 bytes at a time with SSE2, Neon, Altivec. 86\item Process 256 bytes at a time with AVX2. 87\end{itemize} 88\item Transposition supported efficiently with SIMD pack operations. 89\end{itemize} 90 91\end{frame} 92 93\begin{frame}[fragile] 94\frametitle{Impressive Results in Full Unicode Matching} 95 96\begin{itemize}%[<+->] 97\item Find capitalized words at ends of sentences. 98\item Use Unicode upper/lower case categories. 99\item Match \verb:(^|[ ])\p{Lu}\p{Ll}+[.!?]($|[ ]): against 110 MB Arabic file.
100\item \verb:pcregrep: 14,772,797,548 CPU cycles.
101\item \verb:egrep: 45,951,194,784 CPU cycles.
102\item \verb:icgrep: (Parabix) 653,530,064 CPU cycles.
103\item 20X acceleration over \verb:pcgregrep:, 70X over GNU \verb:egrep:.
104\end{itemize}
105\end{frame}
106
107
108\section{Parabix Technology}
109
110\begin{frame}
111\frametitle{Parallel Bit Streams: A Transform Representation of Text}
112\begin{itemize}
113\item Given a byte-oriented character stream $T$, e.g., {\tt Ab17;}''.
114\item Transpose to 8 parallel bit streams $b_0$, $b_1$, ..., $b_7$.
115\item Each stream $b_k$ comprises bit $k$ of each byte of $T$.
116\end{itemize}
117\begin{tabular}{|c|c|c|c|c|c|} \hline
118$T$ & A & b & 1 & 7 & ; \\ \hline
120$b_0$ & \alert<2>{0} & \alert<2>{0} & \alert<2>{0} & \alert<2>{0} & \alert<2>{0} \pause \\ \hline
121$b_1$ & \alert<3>{1} & \alert<3>{1} & \alert<3>{0} & \alert<3>{0} & \alert<3>{0} \pause \\ \hline
122$b_2$ & \alert<4>{0} & \alert<4>{1} & \alert<4>{1} & \alert<4>{1} & \alert<4>{1}  \\ \hline
123$b_3$ & 0 & 0 & 1 & 1 & 1 \\ \hline
124$b_4$ & 0 & 0 & 0 & 0 & 1 \\ \hline
125$b_5$ & 0 & 0 & 0 & 1 & 0 \\ \hline
126$b_6$ & 0 & 1 & 0 & 1 & 1 \\ \hline
127$b_7$ & 1 & 0 & 1 & 1 & 1 \\ \hline
128\end{tabular}
129\end{frame}
130
131\begin{frame}
132\frametitle{Parabix Programming}
133
134\begin{itemize}[<+->]
135\item Parabix programs written as unbounded bit stream operations.
136\item Unbounded bit streams considered as arbitrarily large integers.
137\item Fundamental operations: bitwise logic, bit-stream shifting and long-stream addition.
138\item Parabix tool chain has three components:
139\begin{itemize}
140\item Character Class Compiler (CCC) produces stream equations from character classes.
141\item Parallel Block Compiler (Pablo) converts unbounded stream programs to C++/SIMD.
142\item Portable SIMD library for multiple architectures.
143\end{itemize}
144\end{itemize}
145\end{frame}
146
147\begin{frame}[fragile]
148\frametitle{Character Class Formation}
149\begin{itemize}[<+->]
150\item Combining 8 bits of a code unit gives a character class stream.
151\item CCC({\tt cc\_a = [a]})
152\item
153\begin{semiverbatim}
154temp1 = (bit[1] &~ bit[0])
155temp2 = (bit[2] &~ bit[3])
156temp3 = (temp1 & temp2)
157temp4 = (bit[4] | bit[5])
158temp5 = (bit[7] &~ bit[6])
159temp6 = (temp5 &~ temp4)
160cc\_a = (temp3 & temp6)
161\end{semiverbatim}
162\end{itemize}
163
164\end{frame}
165
166\begin{frame}[fragile]
167\frametitle{Character Class Ranges}
168\begin{itemize}[<+->]
169\item Ranges of characters are often very simple to compute.
170\item CCC({\tt cc\_0\_9 = [0-9]})
171\item
172\begin{semiverbatim}
173temp7 = (bit[0] | bit[1])
174temp8 = (bit[2] & bit[3])
175temp9 = (temp8 &~ temp7)
176temp10 = (bit[5] | bit[6])
177temp11 = (bit[4] & temp10)
178cc\_0\_9 = (temp9 &~ temp11)
179\end{semiverbatim}
180\end{itemize}
181\end{frame}
182
183
184\begin{frame}[fragile]
185\frametitle{Character Class Common Subexpressions}
186\begin{itemize}
187\item Multiple definitions use common subexpressions.
188\item CCC({\tt cc\_z9 = [z9]})
189\item
190\begin{semiverbatim}
191temp12 = (bit[4] &~ bit[5])
192temp13 = (temp12 & temp5)
193temp14 = (temp9 & temp13)
194temp15 = (temp1 & temp8)
195temp16 = (bit[6] &~ bit[7])
196temp17 = (temp12 & temp16)
197temp18 = (temp15 & temp17)
198cc\_z9 = (temp14 | temp18)
199\end{semiverbatim}
200\end{itemize}
201\end{frame}
202
203\section{Regular Expression Matching with Parabix}
204
205\begin{frame}[fragile]
206\frametitle{Marker Streams}
207\begin{itemize}[<+->]
208\item Marker stream $M_i$ indicates the positions that are reachable after item $i$ in the regular expression.
209\item Each marker stream $M_i$ has one bit for every input byte in the input file.
210\item $M_i[j] = 1$ if and only if a match to the regular expression up to and
211including item $i$ in the expression occurs at position $j-1$ in the input stream.
212\item Conceptually, marker streams are computed in parallel for all positions in the file
213at once (bitwise data parallelism).
214\item In practice, marker streams are computed block-by-block, where the
215block size is the size of a SIMD register in bits.
216\end{itemize}
217\end{frame}
218
219\begin{frame}[fragile]
220\frametitle{Marker Stream Example}
221\begin{itemize}[<+->]
222\item Consider matching regular expression \verba[0-9]*[z9] against the input text below.
223\item<2->$M_1$ marks positions after occurrences of \verb:a:.
224\item<3->$M_2$ marks positions after occurrences of \verb:a[0-9]*:.
225\item<4->$M_3$ marks positions after occurrences of \verb:a[0-9]*[z9]:.
226\end{itemize}
227
228\begin{tabular}{cr}\\
229input data  & \verba453z--b3z--az--a12949z--ca22z7--\\
230\onslide<2->$M_1$ & \verb.1...........1...1.........1.....\\
231\onslide<3->$M_2$ & \verb.1111........1...111111....111...\\
232\onslide<4->$M_3$ & \verb.....1........1.....1.11......1..
233\end{tabular}
234\end{frame}
235
236\begin{frame}[fragile]
237\frametitle{Matching Character Class Repetitions with MatchStar}
238\begin{itemize}[<+->]
239\item<2->$\text{MatchStar}(M, C) = (((M \wedge C) + C) \oplus C) \vee M$
240\item<3->Consider $M_2 = \text{MatchStar}(M_1, C)$
241\item<4->Use addition to scan each marker through the class.
242\item<5->Bits that change represent matches.
243\item<6->We also have matches at start positions in $M_1$.
244\end{itemize}
245
246\begin{tabular}{cr}\\
247\onslide<3->input data  & \verba453z--b3z--az--a12949z--ca22z7--\\
248\onslide<3->$M_1$ & \verb.1...........1...1.........1.....\\
249\onslide<3->$C = \text{\tt [0-9]}$ & \verb.111....1........11111.....11.1..\\
250\onslide<4->$T_0 = M_1 \wedge C$ & \verb.1...............1.........1.....\\
251\onslide<4->$T_1 = T_0 + C$ & \verb....1...1.............1......11..\\
252\onslide<5->$T_2 = T_1 \oplus C$ & \verb.1111............111111....111...\\
253\onslide<6->$M_2 = T_2 \vee M_1$ & \verb.1111........1...111111....111...
254\end{tabular}
255\end{frame}
256
257
258\begin{frame}[fragile]
259\frametitle{Regular Expression Compilation}
260\begin{itemize}
261\item Our regular expression compiler produces unbounded Pablo code.
262\item RE\_compile(\verba[0-9]*[z9])
263\item
264\begin{semiverbatim}
265m0 = ~0
267m2 = pablo.MatchStar(m1, cc\_0\_9)
269\end{semiverbatim}
270\end{itemize}
271\end{frame}
272
273\begin{frame}[fragile]
274\frametitle{Alternations and Optional Terms}
275\begin{itemize}
276\item Most RE features are handled naturally.
277\item RE\_compile(\verba(b?|cd))
278\item
279\begin{semiverbatim}
280m0 = ~0
282m2 = pablo.MatchStar(m1, cc\_b)
283m3 = m1 | m2      # b is optional
286m6 = m3 | m5      # two alternatives
287\end{semiverbatim}
288\end{itemize}
289\end{frame}
290
291
292\begin{frame}[fragile]
293\frametitle{Nested Repetitions Use While Loops}
294\begin{itemize}
295\item While loops are used for complex or nested repetitions.
296\item RE\_compile(\verb(a[0-9]*[z9])*)
297\item
298\begin{semiverbatim}
299m0 = ~0
300t = m0   # while test variable
301a = m0   # while result accumulator
302while t:
303   m1 = pablo.Advance(t & cc\_a)
304   m2 = pablo.MatchStar(m1, cc\_0\_9)
306   t = m3 &~ a   # iterate only for new matches
307   a = a | m3
308\end{semiverbatim}
309\end{itemize}
310\end{frame}
311
312
313
314\section{Performance}
315
316
317\begin{frame}[fragile]
318\frametitle{Test Expressions}
319{\tiny
320\begin{tabular}{|l|l|}
321\hline
322Name            & Expression    \\ \hline
323@               & \verb@              \\ \hline
324Date            & \verb([0-9][0-9]?)/([0-9][0-9]?)/([0-9][0-9]([0-9][0-9])?)          \\ \hline
325Email           & \verb([^ @]+)@([^ @]+)              \\ \hline
326URI     & \verb(([a-zA-Z][a-zA-Z0-9]*)://|mailto:)([^ /]+)(/[^ ]*)?|([^ @]+)@([^ @]+)         \\ \hline
327Hex             & \verb[ ](0x)?([a-fA-F0-9][a-fA-F0-9])+[.:,?! ]              \\ \hline
328StarHeight              & \verb[A-Z]((([a-zA-Z]*a[a-zA-Z]*[ ])*[a-zA-Z]*e[a-zA-Z]*[ ])*[a-zA-Z]*s[a-zA-Z]*[ ])*[.?!]          \\ \hline
329\end{tabular}
330}
331\end{frame}
332
333
334\begin{frame}[fragile]
335\frametitle{SSE2 Performance}
336\begin{tikzpicture}
337\begin{axis}[
338xtick=data,
339ylabel=Cycles per Byte,
340xticklabels={@,Date,Email,URI,Hex,StarHeight},
341tick label style={font=\tiny},
342enlarge x limits=0.15,
343%enlarge y limits={0.15, upper},
344ymin=0,
345legend style={at={(0.5,0.9)},
346anchor=north,legend columns=-1},
347ymax=44,
348ybar,
349bar width=7pt,
350%visualization depends on=y \as \rawy,
351]
353file {../data/cycles-bitstreams.dat};
355file {../data/cycles-nrgrep112.dat};
357file {../data/cycles-gre2p.dat};
358
359\legend{bitstreams,nrgrep,gre2p,Annot}
360\end{axis}
361\end{tikzpicture}
362
363\end{frame}
364
365\begin{frame}[fragile]
366\frametitle{IPC}
367
368\begin{tikzpicture}
369\begin{axis}[
370xtick=data,
371ylabel=Instructions per Cycle,
372xticklabels={@,Date,Email,URI,Hex,StarHeight},
373tick label style={font=\tiny},
374enlarge x limits=0.15,
375%enlarge y limits={0.15, upper},
376ymin=0,
377legend style={at={(0.5,0.9)},
378anchor=north,legend columns=-1},
379ybar,
380bar width=7pt,
381]
383file {../data/ipc-bitstreams.dat};
385file {../data/ipc-nrgrep112.dat};
387file {../data/ipc-gre2p.dat};
388
389\legend{bitstreams,nrgrep,gre2p,Annot}
390\end{axis}
391\end{tikzpicture}
392\end{frame}
393
394
395\begin{frame}[fragile]
396\frametitle{SIMD Scalability}
397\begin{tikzpicture}
398\begin{axis}[
399xtick=data,
400ylabel=AVX2 Instruction Reduction,
401xticklabels={@,Date,Email,URI,Hex,StarHeight},
402tick label style={font=\tiny},
403enlarge x limits=0.15,
404%enlarge y limits={0.15, upper},
405ymin=0,
406legend style={at={(0.5,0.15)},
407anchor=north,legend columns=-1},
408ybar,
409bar width=7pt,
410]
412file {../data/sse2-avx2-instr-red-bitstreams.dat};
414file {../data/sse2-avx2-instr-red-nrgrep112.dat};
416file {../data/sse2-avx2-instr-red-gre2p.dat};
417
418\legend{bitstreams,nrgrep,gre2p,Annot}
419\end{axis}
420\end{tikzpicture}
421\end{frame}
422
423
424\begin{frame}[fragile]
425\frametitle{Speedups Achieved}
426
427\begin{center}
428\begin{tabular}{|c|c|c|c|} \hline
429\multirow{2}{*}{Expression} & \multicolumn{3}{c|}{Bitstream/AVX2 grep Speedup} \\ \cline{2-4}
430& vs. nrgrep & vs. gre2p & vs. GNU grep -e\\ \hline \hline
431At & 3.5X & 34X & 1.6X\\ \hline
432Date & 0.76X & 13X & 48X\\ \hline
433Email & 9.5X & 28X & 12X\\ \hline
434URI & 6.6X & 27X & 518X\\ \hline
435Hex & 8.1X & 105X & 267X\\ \hline
436StarHeight & 1.9X & 7.6X & 97X\\ \hline
437\end{tabular}
438\end{center}
439\end{frame}
440
441
442\begin{frame}[fragile]
443\frametitle{GPU Performance}
444\begin{tikzpicture}
445\begin{axis}[
446xtick=data,
447ylabel=Running Time (ms per megabyte),
448xticklabels={@,Date,Email,URI,Hex,StarHeight},
449tick label style={font=\tiny},
450enlarge x limits=0.15,
451%enlarge y limits={0.15, upper},
452ymin=0,
453legend style={at={(0.5,0.8)},
454anchor=north,legend columns=-1},
455ybar,
456bar width=7pt,
457]
459file {../data/ssetime.dat};
461file {../data/avxtime.dat};
463file {../data/gputime.dat};
464
465\legend{SSE2,AVX2,GPU,Annot}
466\end{axis}
467\end{tikzpicture}
468
469
470\end{frame}
471
472\section{Conclusion}
473\begin{frame}[fragile]
474\frametitle{Results}
475\begin{itemize}%[<+->]
476\item A new class of parallel regular expression algorithms has been introduced
477based on the concept of bitwise data parallelism and MatchStar.
478\item Single core acceleration over sequential implementations can be dramatic.
479\item A long-stream addition technique has been developed to allow MatchStar to
480scale directly with SIMD instruction width.
481\item Perfect scaling in instruction count was observed with 256-bit AVX2
482technology versus 128-bit SIMD technology except for nested repetition.
483\item GPU implementations show promise, but need additional work.
484\end{itemize}
485\end{frame}
486
487\begin{frame}[fragile]
488\frametitle{Ongoing/Future Work}
489
490\begin{itemize}%[<+->]
491\item The prototype technologies have now been re-implemented in
492a single C++ executable combining 4 compilers.
493\begin{itemize}
494\item CCC: Character class compiler
495\item RE\_compile: regular expression compiler
496\item Pablo: Block-at-a-time compiler
497\item LLVM: Fully dynamic code generation.
498\end{itemize}
499\item Compilation overhead is high, but tolerable for large files.
500