source: docs/Working/re/Slides/pactslides.tex @ 4082

Last change on this file since 4082 was 4082, checked in by cameron, 5 years ago

Updates

File size: 17.1 KB
Line 
1\documentclass{beamer}
2\usepackage{default}
3\usepackage{color}
4\usepackage[latin1]{inputenc}
5\usepackage{verbatim}
6\usepackage{colortbl}
7\usepackage{graphicx}
8\usepackage{tikz}
9\usepackage{multirow}
10\usepackage{pgfplots}
11
12\usetheme{Warsaw}
13\setbeamertemplate{navigation symbols}{}
14
15\title[PACT 2014: Parabix grep]{Bitwise Data Parallelism in Regular Expression Matching}
16\author[Cameron, Shermer, Shriraman, Herdy, Lin, Hull, and Lin]{Rob Cameron, Tom Shermer, Arrvindh Shriraman, Ken Herdy, Dan Lin, Ben Hull, Meng Lin}
17\institute[CS@SFU]{School of Computing Science\\
18Simon Fraser University}
19\date{August 25, 2014}
20
21\begin{document}
22
23\begin{frame}
24\titlepage
25\end{frame}
26
27
28\begin{frame}
29\frametitle{Outline}
30\tableofcontents%[pausesections]
31\end{frame}
32
33\section{Introduction}
34
35\begin{frame}[fragile]
36\frametitle{Acceleration of Regular Expression Matching}
37
38\begin{itemize}[<+->]
39\item Example: quickly find instances of \verb:(^|[ ])\p{Lu}\p{Ll}+[.!?]($|[ ]): in text.
40\item Sequential algorithms use finite automata or backtracking.
41\item Parallelizing these approaches is difficult.
42\begin{itemize}
43\item Finite state machines are the 13th (and hardest) ``dwarf'' in the Berkeley Landscape of Parallel Computing Research.
44\item Embarassingly sequential?
45\item Some success in parallel application of FSMs to multiple input streams.
46\item Recent work shows some promise using techniques such as coalesced FSMs and principled speculation.
47\end{itemize}
48\end{itemize}
49\end{frame}
50
51
52\begin{frame}[fragile]
53\frametitle{Our Approach}
54
55\begin{itemize}[<+->]
56\item A fundamentally parallel approach in contrast to approaches
57that parallelize existing DFA or NFA algorithms.
58
59\item A new algorithm {\em family} for regular expression matching based
60on bitwise data parallelism.
61
62\item Builds on the Parabix methods that have been used for XML
63parsing and Unicode transcoding.
64
65\item Introduces the MatchStar primitive
66for simultaneous nondeterministic matching
67of character class repetitions using bitstream addition.
68
69\item Compilation technologies for regular expressions (new), character classes (existing),
70unbounded bitstreams (existing).
71
72\item Recent work: all compilers integrated together with LLVM for
73fully dynamic regular expression matching.
74\end{itemize}
75\end{frame}
76
77\begin{frame}[fragile]
78\frametitle{Bitwise Data Parallelism}
79
80\begin{itemize}[<+->]
81\item Parabix methods use a transform representation of text.
82\item Bitstreams are formed using one bit per input byte.
83\item Eight basis bit streams are defined for bit 0, bit 1, ... bit 7 of each byte.
84\item Perform bitwise processing with wide SIMD registers.
85\begin{itemize}
86\item Process 128 bytes at a time with SSE2, Neon, Altivec.
87\item Process 256 bytes at a time with AVX2.
88\end{itemize}
89\item Parabix transform is fast using SIMD pack operations.
90\item {\em But}: programmer view is of unbounded, arbitrary length bitstreams.
91\end{itemize}
92
93\end{frame}
94
95\begin{frame}[fragile]
96\frametitle{Impressive Results in Full Unicode Matching}
97
98\begin{itemize}[<+->]
99\item \verb:(^|[ ])\p{Lu}\p{Ll}+[.!?]($|[ ]):
100\begin{itemize}
101\item Match against 110 MB Arabic wikimedia document.
102\item Find capitalized words at ends of sentences.
103\item Use Unicode upper/lower case categories.
104\end{itemize}
105\item \verb:egrep: 45,951,194,784 CPU cycles.
106\item \verb:pcregrep: 14,772,797,548 CPU cycles.
107\item \verb:icgrep: (Parabix) 653,530,064 CPU cycles.
108\item 20X acceleration over \verb:pcgregrep:, 70X over GNU \verb:egrep:.
109\end{itemize}
110\end{frame}
111
112
113\section{Parabix Technology}
114
115\begin{frame}
116\frametitle{Parallel Bit Streams: A Transform Representation of Text}
117\begin{itemize}
118\item Given a byte-oriented character stream $T$, e.g., ``{\tt Ab17;}''.
119\item Transpose to 8 parallel bit streams $b_0$, $b_1$, ..., $b_7$.
120\item Each stream $b_k$ comprises bit $k$ of each byte of $T$.
121\end{itemize}
122\begin{tabular}{|c|c|c|c|c|c|} \hline
123$T$ & A & b & 1 & 7 & ; \\ \hline
124ASCII & \alert<2>{0}\alert<3>{1}\alert<4>{0}00001 & \alert<2>{0}\alert<3>{1}\alert<4>{1}00010 & \alert<2>{0}\alert<3>{0}\alert<4>{1}10001 & \alert<2>{0}\alert<3>{0}\alert<4>{1}10111 & \alert<2>{0}\alert<3>{0}\alert<4>{1}11011 \pause \\ \hline
125$b_0$ & \alert<2>{0} & \alert<2>{0} & \alert<2>{0} & \alert<2>{0} & \alert<2>{0} \pause \\ \hline
126$b_1$ & \alert<3>{1} & \alert<3>{1} & \alert<3>{0} & \alert<3>{0} & \alert<3>{0} \pause \\ \hline
127$b_2$ & \alert<4>{0} & \alert<4>{1} & \alert<4>{1} & \alert<4>{1} & \alert<4>{1}  \\ \hline
128$b_3$ & 0 & 0 & 1 & 1 & 1 \\ \hline
129$b_4$ & 0 & 0 & 0 & 0 & 1 \\ \hline
130$b_5$ & 0 & 0 & 0 & 1 & 0 \\ \hline
131$b_6$ & 0 & 1 & 0 & 1 & 1 \\ \hline
132$b_7$ & 1 & 0 & 1 & 1 & 1 \\ \hline
133\end{tabular}
134\end{frame}
135
136\begin{frame}
137\frametitle{Parabix Programming}
138
139\begin{itemize}[<+->]
140\item Parabix programs written as unbounded bit stream operations.
141\item Unbounded bit streams considered as arbitrarily large integers.
142\item Fundamental operations: bitwise logic, bit-stream shifting and long-stream addition.
143\item Parabix tool chain has three components:
144\begin{itemize}
145\item Character Class Compiler (CCC) produces stream equations from character classes.
146\item Parallel Block Compiler (Pablo) converts unbounded stream programs to C++/SIMD.
147\item Portable SIMD library for multiple architectures.
148\end{itemize}
149\end{itemize}
150\end{frame}
151
152\begin{frame}[fragile]
153\frametitle{Character Class Formation}
154\begin{itemize}[<+->]
155\item Combining 8 bits of a code unit gives a character class stream.
156\item CCC({\tt cc\_a = [a]})
157\item 
158\begin{semiverbatim}
159temp1 = (bit[1] &~ bit[0])
160temp2 = (bit[2] &~ bit[3])
161temp3 = (temp1 & temp2)
162temp4 = (bit[4] | bit[5])
163temp5 = (bit[7] &~ bit[6])
164temp6 = (temp5 &~ temp4)
165cc\_a = (temp3 & temp6)
166\end{semiverbatim}
167\end{itemize}
168
169\end{frame}
170
171\begin{frame}[fragile]
172\frametitle{Character Class Ranges}
173\begin{itemize}[<+->]
174\item Ranges of characters are often very simple to compute.
175\item CCC({\tt cc\_0\_9 = [0-9]})
176\item 
177\begin{semiverbatim}
178temp7 = (bit[0] | bit[1])
179temp8 = (bit[2] & bit[3])
180temp9 = (temp8 &~ temp7)
181temp10 = (bit[5] | bit[6])
182temp11 = (bit[4] & temp10)
183cc\_0\_9 = (temp9 &~ temp11)
184\end{semiverbatim}
185\end{itemize}
186\end{frame}
187
188
189\begin{frame}[fragile]
190\frametitle{Character Class Common Subexpressions}
191\begin{itemize}
192\item Multiple definitions use common subexpressions.
193\item CCC({\tt cc\_z9 = [z9]})
194\item 
195\begin{semiverbatim}
196temp12 = (bit[4] &~ bit[5])
197temp13 = (temp12 & temp5)
198temp14 = (temp9 & temp13)
199temp15 = (temp1 & temp8)
200temp16 = (bit[6] &~ bit[7])
201temp17 = (temp12 & temp16)
202temp18 = (temp15 & temp17)
203cc\_z9 = (temp14 | temp18)
204\end{semiverbatim}
205\end{itemize}
206\end{frame}
207
208\section{Regular Expression Matching with Parabix}
209
210\begin{frame}[fragile]
211\frametitle{Marker Streams}
212\begin{itemize}[<+->]
213\item Marker stream $M_i$ indicates the positions that are reachable after item $i$ in the regular expression.
214\item Each marker stream $M_i$ has one bit for every input byte in the input file.
215\item $M_i[j] = 1$ if and only if a match to the regular expression up to and
216including item $i$ in the expression occurs at position $j-1$ in the input stream.
217\item Conceptually, marker streams are computed in parallel for all positions in the file
218at once (bitwise data parallelism).
219\item In practice, marker streams are computed block-by-block, where the
220block size is the size of a SIMD register in bits.
221\end{itemize}
222\end{frame}
223
224\begin{frame}[fragile]
225\frametitle{Marker Stream Example}
226\begin{itemize}[<+->]
227\item Consider matching regular expression \verb`a[0-9]*[z9]` against the input text below.
228\item<2->$M_1$ marks positions after occurrences of \verb:a:.
229\item<3->$M_2$ marks positions after occurrences of \verb:a[0-9]*:.
230\item<4->$M_3$ marks positions after occurrences of \verb:a[0-9]*[z9]:.
231\end{itemize}
232
233\begin{tabular}{cr}\\
234input data  & \verb`a453z--b3z--az--a12949z--ca22z7--`\\
235\onslide<2->$M_1$ & \verb`.1...........1...1.........1.....`\\
236\onslide<3->$M_2$ & \verb`.1111........1...111111....111...`\\
237\onslide<4->$M_3$ & \verb`.....1........1.....1.11......1..`
238\end{tabular}
239\end{frame}
240
241\begin{frame}[fragile]
242\frametitle{Matching Character Class Repetitions with MatchStar}
243\begin{itemize}[<+->]
244\item<2->$\text{MatchStar}(M, C) = (((M \wedge C) + C)  \oplus C) \vee M$
245\item<3->Consider $M_2 = \text{MatchStar}(M_1, C)$
246\item<4->Use addition to scan each marker through the class.
247\item<5->Bits that change represent matches.
248\item<6->We also have matches at start positions in $M_1$.
249\end{itemize}
250
251\begin{tabular}{cr}\\
252\onslide<3->input data  & \verb`a453z--b3z--az--a12949z--ca22z7--`\\
253\onslide<3->$M_1$ & \verb`.1...........1...1.........1.....`\\
254\onslide<3->$C = \text{\tt [0-9]}$ & \verb`.111....1........11111.....11.1..`\\
255\onslide<4->$T_0 = M_1 \wedge C$ & \verb`.1...............1.........1.....`\\
256\onslide<4->$T_1 = T_0 + C$ & \verb`....1...1.............1......11..`\\
257\onslide<5->$T_2 = T_1 \oplus C$ & \verb`.1111............111111....111...`\\
258\onslide<6->$M_2 = T_2 \vee M_1$ & \verb`.1111........1...111111....111...`
259\end{tabular}
260\end{frame}
261
262
263\begin{frame}[fragile]
264\frametitle{Regular Expression Compilation}
265\begin{itemize}
266\item Our regular expression compiler produces unbounded Pablo code.
267\item RE\_compile(\verb`a[0-9]*[z9]`)
268\item
269\begin{semiverbatim}
270m0 = ~0
271m1 = pablo.Advance(m0 & cc\_a)
272m2 = pablo.MatchStar(m1, cc\_0\_9)
273m3 = pablo.Advance(m2, cc\_z9)
274\end{semiverbatim}
275\end{itemize}
276\end{frame}
277
278\begin{frame}[fragile]
279\frametitle{Alternations and Optional Terms}
280\begin{itemize}
281\item Most RE features are handled naturally.
282\item RE\_compile(\verb`a(b?|cd)`)
283\item
284\begin{semiverbatim}
285m0 = ~0
286m1 = pablo.Advance(m0 & cc\_a)
287m2 = pablo.MatchStar(m1, cc\_b)
288m3 = m1 | m2      # b is optional
289m4 = pablo.Advance(m1, cc\_c) 
290m5 = pablo.Advance(m2, cc\_d)
291m6 = m3 | m5      # two alternatives
292\end{semiverbatim}
293\end{itemize}
294\end{frame}
295
296
297\begin{frame}[fragile]
298\frametitle{Nested Repetitions Use While Loops}
299\begin{itemize}
300\item While loops are used for complex or nested repetitions.
301\item RE\_compile(\verb`(a[0-9]*[z9])*`)
302\item
303\begin{semiverbatim}
304m0 = ~0
305t = m0   # while test variable
306a = m0   # while result accumulator
307while t:
308   m1 = pablo.Advance(t & cc\_a)
309   m2 = pablo.MatchStar(m1, cc\_0\_9)
310   m3 = pablo.Advance(m2, cc\_z9)
311   t = m3 &~ a   # iterate only for new matches
312   a = a | m3
313\end{semiverbatim}
314\end{itemize}
315\end{frame}
316
317\section{Block-by-Block Processing}
318
319\begin{frame}[fragile]
320\frametitle{Pablo Compiler Compiles to Block-by-Block Form}
321\begin{block}<+->{Pablo Input}
322\begin{semiverbatim}
323m0 = ~0
324m1 = pablo.Advance(m0 & cc\_a)
325m2 = pablo.MatchStar(m1, cc\_0\_9)
326m3 = pablo.Advance(m2, cc\_z9)
327\end{semiverbatim}
328
329\end{block}
330\begin{block}<+->{Pablo Output Adds Carry Processing}
331\begin{semiverbatim}
332m0 = simd_not(simd<1>::constant<0>());
333cryQ.set(0, blkShl(blkAnd(m0, c\c_a), cryQ.get(0), m1);
334cryQ.set(1, blkMatchStar(m1, cc\_0\_9, cryQ.get(1), m2);
335cryQ.set(2, blkShl(blkAnd(m2, cc\_z9), cryQ.get(2), m3);
336cryQ.AdjustCarries(3);
337\end{semiverbatim}
338\end{block}
339\end{frame}
340
341
342\begin{frame}[fragile]
343\frametitle{Long-Stream Addition}
344Compute the long-stream sum {\tt Z} = {\tt X} + {\tt Y}.
345\begin{tabular}{c||r|r|r|r|r|r|r|r||}\cline{2-9}
346{\tt X} & {\tt 19} & {\tt 31} & {\tt BA} & {\tt 4C} & {\tt 3D} & {\tt 45} & {\tt 21} & {\tt F1} \\ \cline{2-9}
347{\tt Y} & {\tt 22} & {\tt 12} & {\tt 45} & {\tt B3} & {\tt E2} & {\tt 16} & {\tt 17} & {\tt 36} \\ \cline{2-9}
348\onslide<2->{\tt R} (add digits)& {\tt 3B} & {\tt 43} & {\tt FF} & {\tt FF} & {\tt 1F} & {\tt 5B} & {\tt 38} & {\tt 27} \\ \cline{2-9}
349\onslide<2->{\tt x} = high\_bit({\tt X})& {\tt 0} & {\tt 0} & {\tt 1} & {\tt 0} & {\tt 0} & {\tt 0} & {\tt 0} & {\tt 1} \\ \cline{2-9}
350\onslide<2->{\tt y} = high\_bit({\tt Y}) & {\tt 0} & {\tt 0} & {\tt 0} & {\tt 1} & {\tt 1} & {\tt 0} & {\tt 0} & {\tt 0} \\ \cline{2-9}
351\onslide<2->{\tt r} = high\_bit({\tt Z}) & {\tt 0} & {\tt 0} & {\tt 1} & {\tt 1} & {\tt 0} & {\tt 0} & {\tt 0} & {\tt 0} \\ \cline{2-9}
352\onslide<3->{\tt c} (carries)& {\tt 0} & {\tt 0} & {\tt 0} & {\tt 0} & {\tt 1} & {\tt 0} & {\tt 0} & {\tt 1} \\ \cline{2-9}
353\onslide<4->{\tt c*2} (shift left 1)& {\tt 0} & {\tt 0} & {\tt 0} & {\tt 1} & {\tt 0} & {\tt 0} & {\tt 1} & {\tt 0} \\ \cline{2-9}
354\onslide<5->{\tt b} (bubble ({\tt Z} = {\tt FF}) & {\tt 0} & {\tt 0} & {\tt 1} & {\tt 1} & {\tt 0} & {\tt 0} & {\tt 0} & {\tt 0} \\ \cline{2-9}
355\onslide<6->{\tt i} = Matchstar({\tt b},{\tt c*2})& {\tt 0} & {\tt 1} & {\tt 1} & {\tt 1} & {\tt 0} & {\tt 0} & {\tt 1} & {\tt 0} \\ \cline{2-9}
356\onslide<7->{\tt Z} = {\tt R} + {\tt i} & {\tt 3B} & {\tt 44} & {\tt 0} & {\tt 0} & {\tt 1F} & {\tt 5B} & {\tt 39} & {\tt 27} \\ \cline{2-9}
357\end{tabular}
358\end{frame}
359
360
361\section{Performance}
362
363
364\begin{frame}[fragile]
365\frametitle{Test Expressions}
366
367
368{\tiny
369\begin{tabular}{|l|l|}
370\hline
371Name            & Expression    \\ \hline   
372@               & \verb`@`              \\ \hline     
373Date            & \verb`([0-9][0-9]?)/([0-9][0-9]?)/([0-9][0-9]([0-9][0-9])?)`          \\ \hline     
374Email           & \verb`([^ @]+)@([^ @]+)`              \\ \hline
375URI     & \verb`(([a-zA-Z][a-zA-Z0-9]*)://|mailto:)([^ /]+)(/[^ ]*)?|([^ @]+)@([^ @]+)`         \\ \hline     
376Hex             & \verb`[ ](0x)?([a-fA-F0-9][a-fA-F0-9])+[.:,?! ]`              \\ \hline
377StarHeight              & \verb`[A-Z]((([a-zA-Z]*a[a-zA-Z]*[ ])*[a-zA-Z]*e[a-zA-Z]*[ ])*[a-zA-Z]*s[a-zA-Z]*[ ])*[.?!]`          \\ \hline
378\end{tabular}
379}
380\end{frame}
381
382
383\begin{frame}[fragile]
384\frametitle{SSE2 Performance}
385\begin{tikzpicture}
386\begin{axis}[
387xtick=data,
388ylabel=Cycles per Byte,
389xticklabels={@,Date,Email,URI,Hex,StarHeight},
390tick label style={font=\tiny},
391enlarge x limits=0.15,
392%enlarge y limits={0.15, upper},
393ymin=0,
394legend style={at={(0.5,0.9)},
395anchor=north,legend columns=-1},
396ymax=44,
397ybar,
398bar width=7pt,
399%visualization depends on=y \as \rawy,
400]
401\addplot
402file {../data/cycles-bitstreams.dat};
403\addplot
404file {../data/cycles-nrgrep112.dat};
405\addplot
406file {../data/cycles-gre2p.dat};
407 
408\legend{bitstreams,nrgrep,gre2p,Annot}
409\end{axis}
410\end{tikzpicture}
411
412\end{frame}
413
414\begin{frame}[fragile]
415\frametitle{IPC}
416
417\begin{tikzpicture}
418\begin{axis}[
419xtick=data,
420ylabel=Instructions per Cycle,
421xticklabels={@,Date,Email,URI,Hex,StarHeight},
422tick label style={font=\tiny},
423enlarge x limits=0.15,
424%enlarge y limits={0.15, upper},
425ymin=0,
426legend style={at={(0.5,0.9)},
427anchor=north,legend columns=-1},
428ybar,
429bar width=7pt,
430]
431\addplot
432file {../data/ipc-bitstreams.dat};
433\addplot
434file {../data/ipc-nrgrep112.dat};
435\addplot
436file {../data/ipc-gre2p.dat};
437
438\legend{bitstreams,nrgrep,gre2p,Annot}
439\end{axis}
440\end{tikzpicture}
441\end{frame}
442
443
444\begin{frame}[fragile]
445\frametitle{SIMD Scalability}
446\begin{tikzpicture}
447\begin{axis}[
448xtick=data,
449ylabel=AVX2 Instruction Reduction,
450xticklabels={@,Date,Email,URI,Hex,StarHeight},
451tick label style={font=\tiny},
452enlarge x limits=0.15,
453%enlarge y limits={0.15, upper},
454ymin=0,
455legend style={at={(0.5,0.15)},
456anchor=north,legend columns=-1},
457ybar,
458bar width=7pt,
459]
460\addplot
461file {../data/sse2-avx2-instr-red-bitstreams.dat};
462\addplot
463file {../data/sse2-avx2-instr-red-nrgrep112.dat};
464\addplot
465file {../data/sse2-avx2-instr-red-gre2p.dat};
466
467\legend{bitstreams,nrgrep,gre2p,Annot}
468\end{axis}
469\end{tikzpicture}
470\end{frame}
471
472
473\begin{frame}[fragile]
474\frametitle{Speedups Achieved}
475
476\begin{center}
477\begin{tabular}{|c|c|c|c|} \hline
478\multirow{2}{*}{Expression} & \multicolumn{3}{c|}{Bitstream/AVX2 grep Speedup} \\ \cline{2-4}
479& vs. nrgrep & vs. gre2p & vs. GNU egrep \\ \hline \hline
480At & 3.5X & 34X & 1.6X\\ \hline
481Date & 0.76X & 13X & 48X\\ \hline
482Email & 9.5X & 28X & 12X\\ \hline
483URI & 6.6X & 27X & 518X\\ \hline
484Hex & 8.1X & 105X & 267X\\ \hline
485StarHeight & 1.9X & 7.6X & 97X\\ \hline
486\end{tabular}
487\end{center}
488\end{frame}
489
490
491\begin{frame}[fragile]
492\frametitle{GPU Performance}
493\begin{tikzpicture}
494\begin{axis}[
495xtick=data,
496ylabel=Running Time (ms per megabyte),
497xticklabels={@,Date,Email,URI,Hex,StarHeight},
498tick label style={font=\tiny},
499enlarge x limits=0.15,
500%enlarge y limits={0.15, upper},
501ymin=0,
502legend style={at={(0.5,0.8)},
503anchor=north,legend columns=-1},
504ybar,
505bar width=7pt,
506]
507\addplot
508file {../data/ssetime.dat};
509\addplot
510file {../data/avxtime.dat};
511\addplot
512file {../data/gputime.dat};
513
514\legend{SSE2,AVX2,GPU,Annot}
515\end{axis}
516\end{tikzpicture}
517
518
519\end{frame}
520
521\section{Conclusion}
522\begin{frame}[fragile]
523\frametitle{Results}
524\begin{itemize}%[<+->]
525\item A new class of parallel regular expression algorithms has been introduced
526based on the concept of bitwise data parallelism and MatchStar.
527\item Single core acceleration over sequential implementations can be dramatic.
528\item A long-stream addition technique has been developed to allow MatchStar to
529scale directly with SIMD instruction width.
530\item Perfect scaling in instruction count was observed with 256-bit AVX2
531technology versus 128-bit SIMD technology except for nested repetition.
532\item GPU implementations show promise, but need additional work.
533\end{itemize}
534\end{frame}
535
536\begin{frame}[fragile]
537\frametitle{Ongoing/Future Work}
538
539\begin{itemize}%[<+->]
540\item The prototype technologies have now been re-implemented in
541a single C++ executable combining 4 compilers.
542\begin{itemize}
543\item CCC: Character class compiler
544\item RE\_compile: regular expression compiler
545\item Pablo: Block-at-a-time compiler
546\item LLVM: Fully dynamic code generation.
547\end{itemize}
548\item Compilation overhead is high, but tolerable for large files.
549
550\item Unicode support has been added, including additional MatchStar
551algorithms for variable-length Unicode character classes.
552
553\item Open source implementation available: {\tt http://parabix.costar.sfu.ca/svn/icGREP/}
554\end{itemize}
555
556\end{frame}
557
558
559
560\end{document}
Note: See TracBrowser for help on using the repository browser.