source: docs/Working/icGrep/pres.tex @ 4858

Last change on this file since 4858 was 4858, checked in by lindanl, 4 years ago

For presentaion.

File size: 20.9 KB
Line 
1\documentclass{beamer}
2\usepackage{default}
3\usepackage{comment}
4\usepackage{color}
5\usepackage{subfigure}
6\usepackage[T1]{fontenc} 
7\usepackage[utf8]{inputenc}
8\usepackage{CJK}
9\usepackage{verbatim}
10\usepackage{colortbl}
11\usepackage{tikz}
12\usetikzlibrary{shapes,positioning,arrows,calc,fadings}
13\usepackage{pgfplots}
14\usepackage{pgfplotstable}
15%\pgfplotsset{compat=1.10}
16
17\usepackage{bbding}
18
19\newcommand{\icGrep}[1]{icGrep}
20
21\def\RegularExpression{RegEx}
22\def\Pablo{Parabix}
23\def\CodeUnit{Code Unit}
24\def\REParser{\RegularExpression{} Parser}
25\def\CodeUnitCompiler{\CodeUnit{} Compiler}
26\def\RegularExpressionCompiler{\RegularExpression{} Compiler}
27\def\PabloCompiler{\Pablo{} Compiler}
28
29\usetheme{Madrid}
30
31
32\title[ICA3PP]{Bitwise Data Parallelism with LLVM: The ICgrep Case Study}
33\author[Rob Cameron]{Robert D. Cameron}
34\institute[SFU]{School of Computing Science\\
35Simon Fraser University}
36\date{October 28, 2015}
37
38\begin{document}
39
40\begin{frame}
41\titlepage
42\end{frame}
43
44
45%\begin{frame}
46%\frametitle{Outline}
47%\tableofcontents%[pausesections]
48%\end{frame}
49
50\section{Motivation}
51\begin{frame}[fragile]
52\frametitle{Overview}
53\begin{block}<+->{The Parabix Regular Expression Project}
54
55\begin{itemize}
56\item Goal: building a modern, ultra-fast, Unicode-compliant, regular expression engine and
57  application framework.
58\item New algorithmic approach: bitwise data parallel matching.
59\item Performance target: Gigabyte/second regexp processing.
60\item Status: icgrep 1.0 is a full Unicode Level 1 grep replacement.
61\item Current work: Unicode Level 2 Regular Expression Support
62\item Longer-term: high-level application framework.
63\end{itemize}
64\end{block}
65\end{frame}
66
67
68\begin{frame}
69\frametitle{Performance Matters - Big Text}
70\begin{block}<+->{Text Analytics}
71\begin{itemize}
72\item 
73Text Analytics applications: very large textual data
74sets.
75\item Gigabyte to Terabyte scale.
76\item Regular expressions are often used in these applications.
77\item If performance were better, regular expressions could be applied
78  more widely.
79\end{itemize}
80\end{block}
81\begin{block}<+->{IBM System T Study}
82\begin{itemize}
83\item 
84Five customer workloads studied
85\item
86Four of five workloads dominated by regexp processing.
87\item 
8860\%-75\% of total execution time.
89\item
90Polig {\it et al.} 
91Giving text analytics a boost. {\em IEEE Micro}, {\bf 34}(4):6--14,
922014.
93\end{itemize}
94\end{block}
95\end{frame}
96\begin{frame}[fragile]
97\frametitle{Performance Matters - Catastrophic Backtracking}
98\begin{block}<+->{Nondeterministic Regular Expression Features}
99\begin{itemize}
100\item Ambiguity: regexes may match strings in multiple ways.
101\item Ex: match(\verb:^(\p{L})+t(\p{L})+$:, ``\verb:statistics:'')
102\item (\verb:s:, \verb:atistics:)(\verb:sta:, \verb:istics:), or
103  (\verb:statis:, \verb:ics:)
104\item Backtracking engines: potential runaway behaviour.
105\item Procedural workarounds:  \verb:^(\p{L})++t(\p{L})+$:  or \verb:^(?>(\p{L})+?)t(\p{L})+$:
106\item Declarative nature of regexps diminished.
107\end{itemize}
108\end{block}
109
110\begin{block}<+->{Web Site Security}
111\begin{itemize}
112\item Regular expressions may be used in web services.
113\item Malicious agents may inject ill-formed regular expressions.
114\item Web sites may be tied up (denial-of-service) or compromised.
115\end{itemize}
116\end{block}
117\end{frame}
118
119\begin{frame}[fragile]
120
121\frametitle{Much Better Performance is Possible}
122Unicode regular expression matching can be ultra-fast and robust.
123
124\begin{block}<+->{Example: email regex with Unicode properties}
125\begin{itemize}
126\item 
127\verb:([^\p{Z}<]+@[\p{L}\p{M}\p{N}.-]+\.:
128\verb:      (\p{L}\p{M}*){2,6})(>|\p{Z}|$):
129\item Compare time in CPU cycles per byte for 4 grep programs.
130\item Data source: 620 MB Wikibooks document set (15 languages)
131\item Processor: Intel Core i7-2600 CPU @ 3.40GHz
132\end{itemize}
133\end{block}
134
135%\item ugrep541: 549.5 cycles/byte
136%\item pcre2grep: 281.7 cycles/byte
137%\item icgrep4750: 4.2 cycles/byte
138%\item Speedup: 67X-130X reaches 810 MB/s
139\begin{block}<+->{Performance Results}
140\begin{center}
141\begin{tabular}{|c|r|r|r|r|} \hline
142Program & Elapsed sec & Cycles/Byte & Megabytes/sec \\ \hline
143ugrep561 &      180.7   & 1024.0 &      3.60 \\
144pcre2grep & 144.0       & 815.5 & 4.51  \\
145grep210  -P & 87.3      & 498.2 & 7.45 \\
146icgrep4750 &1.06 &      6.05 & 613.0 \\ \hline
147\end{tabular}
148\end{center}
149\end{block}
150\end{frame}
151
152
153\section{Bitwise Data Parallel Regular Expression Matching}
154\begin{frame}
155\frametitle{How It Works}
156\begin{center}
157\Huge Bitwise Data Parallel Regular Expression Matching
158\end{center}
159\end{frame}
160
161\begin{frame}[fragile]
162\frametitle{Beyond Byte-At-A-Time}
163\begin{itemize}[<+->]
164\item Traditional regular expression technology processes one code
165  unit at a time using DFA, NFA or backtracking implementations.
166\item Instead consider a bitwise data parallel approach.
167\item Byte-oriented data is first transformed to 8 parallel bit
168  streams. Bit stream $j$ consists of bit $j$ of each byte.
169\end{itemize}
170
171\begin{tabular}{cl}\\
172\onslide<3->input data  & \verb`a45a... (01100001 00110100 00110101 01100001...)`\\
173\onslide<4->$bit0$ & \verb`0000...`\\
174\onslide<5->$bit1$ & \verb`1001...`\\
175\onslide<6->$bit2$ & \verb`1111...`\\
176\onslide<6->$bit3$ & \verb`0110...`\\
177\onslide<6->$bit4$ & \verb`0000...`\\
178\onslide<6->$bit5$ & \verb`0110...`\\
179\onslide<6->$bit6$ & \verb`0000...`\\
180\onslide<6->$bit7$ & \verb`1011...`
181\end{tabular}
182\end{frame}
183
184
185\begin{frame}
186\frametitle{Beyond Byte-At-A-Time}
187\begin{itemize}
188\item Load 128-bit SIMD registers to process 128 positions at a time
189in bitwise data parallel fashion (SSE2, ARM Neon, ...).
190\item Or use 256-bit AVX2 registers of newer Intel processors.
191\item Process using bitwise logic, shifting and addition.
192\item Parabix methods have previously been used to accelerate
193Unicode transcoding and XML parsing.
194\end{itemize}
195
196\end{frame}
197
198
199\begin{frame}[fragile]
200\frametitle{Character Class Formation}
201\begin{itemize}
202\item Combining 8 bits of a code unit gives a character class stream.
203\item CCC({\tt cc\_a = [a]})
204\item 
205\begin{semiverbatim}
206temp1 = (bit1 &~ bit0)
207temp2 = (bit2 &~ bit3)
208temp3 = (temp1 & temp2)
209temp4 = (bit4 | bit5)
210temp5 = (bit7 &~ bit6)
211temp6 = (temp5 &~ temp4)
212cc\_a = (temp3 & temp6)
213\end{semiverbatim}
214\end{itemize}
215
216\begin{tabular}{cl}\\
217input data  & \verb`a45a...`\\
218\onslide$cc\_a$ & \verb`1001...`\\
219\end{tabular}
220
221\end{frame}
222
223\begin{frame}[fragile]
224\frametitle{Character Class Ranges}
225\begin{itemize}
226\item Ranges of characters are often very simple to compute.
227\item CCC({\tt cc\_0\_9 = [0-9]})
228\item 
229\begin{semiverbatim}
230temp7 = (bit0 | bit1)
231temp8 = (bit2 & bit3)
232temp9 = (temp8 &~ temp7)
233temp10 = (bit5 | bit6)
234temp11 = (bit4 & temp10)
235cc\_0\_9 = (temp9 &~ temp11)
236\end{semiverbatim}
237\end{itemize}
238\end{frame}
239
240
241\begin{frame}[fragile]
242\frametitle{Character Class Common Subexpressions}
243\begin{itemize}
244\item Multiple definitions use common subexpressions.
245\item CCC({\tt cc\_z9 = [z9]})
246\item 
247\begin{semiverbatim}
248temp12 = (bit4 &~ bit5)
249temp13 = (temp12 & temp5)
250temp14 = (temp9 & temp13)
251temp15 = (temp1 & temp8)
252temp16 = (bit6 &~ bit7)
253temp17 = (temp12 & temp16)
254temp18 = (temp15 & temp17)
255cc\_z9 = (temp14 | temp18)
256\end{semiverbatim}
257\end{itemize}
258\end{frame}
259
260
261\begin{frame}[fragile]
262\frametitle{Marker Streams}
263\begin{itemize}
264\item Marker stream $M_i$ indicates the positions that are reachable after item $i$ in the regular expression.
265\item Each marker stream $M_i$ has one bit for every input byte in the input file.
266\item $M_i[j] = 1$ if and only if a match to the regular expression up to and
267including item $i$ in the expression occurs at position $j-1$ in the input stream.
268\item Conceptually, marker streams are computed in parallel for all positions in the file
269at once (bitwise data parallelism).
270\item In practice, marker streams are computed block-by-block, where the
271block size is the size of a SIMD register in bits.
272\end{itemize}
273\end{frame}
274
275\begin{frame}[fragile]
276\frametitle{Marker Stream Example}
277\begin{itemize}[<+->]
278\item Consider matching regular expression \verb`a[0-9]*[z9]` against the input text below.
279\item<2->$M_1$ marks positions after occurrences of \verb:a:.
280\item<3->$M_2$ marks positions after occurrences of \verb:a[0-9]*:.
281\item<4->$M_3$ marks positions after occurrences of \verb:a[0-9]*[z9]:.
282\end{itemize}
283
284\begin{tabular}{cr}\\
285input data  & \verb`a453z--b3z--az--a12949z--ca22z7--`\\
286\onslide<2->$M_1$ & \verb`.1...........1...1.........1.....`\\
287\onslide<3->$M_2$ & \verb`.1111........1...111111....111...`\\
288\onslide<4->$M_3$ & \verb`.....1........1.....1.11......1..`
289\end{tabular}
290\end{frame}
291
292
293
294\begin{frame}[fragile]
295\frametitle{Matching Character Class Repetitions with MatchStar}
296\begin{itemize}[<+->]
297\item<2->$\text{MatchStar}(M, C) = (((M \wedge C) + C)  \oplus C) \vee M$
298\item<3->Consider $M_2 = \text{MatchStar}(M_1, C)$
299\item<4->Use addition to scan each marker through the class.
300\item<5->Bits that change represent matches.
301\item<6->We also have matches at start positions in $M_1$.
302\end{itemize}
303
304\begin{tabular}{cr}\\
305\onslide<3->input data  & \verb`a453z--b3z--az--a12949z--ca22z7--`\\
306\onslide<3->$M_1$ & \verb`.1...........1...1.........1.....`\\
307\onslide<3->$C = \text{\tt [0-9]}$ & \verb`.111....1........11111.....11.1..`\\
308\onslide<4->$T_0 = M_1 \wedge C$ & \verb`.1...............1.........1.....`\\
309\onslide<4->$T_1 = T_0 + C$ & \verb`....1...1.............1......11..`\\
310\onslide<5->$T_2 = T_1 \oplus C$ & \verb`.1111............111111....111...`\\
311\onslide<6->$M_2 = T_2 \vee M_1$ & \verb`.1111........1...111111....111...`
312\end{tabular}
313\end{frame}
314
315
316\begin{frame}[fragile]
317\frametitle{Matching Equations}
318The rules for bitwise data parallel regular expression matching can
319be summarized by these equations.
320\begin{eqnarray*}
321\mbox{Match}(m, C) & = &  \mbox{Advance}(\mbox{CharClass}(C) \wedge m) \\
322\mbox{Match}(m, RS) & = &  \mbox{Match}(\mbox{Match}(m, R), S) \\
323\mbox{Match}(m, R|S) & = &  \mbox{Match}(m, R) \vee \mbox{Match}(m, S)) \\
324\mbox{Match}(m, C*) & = &  \mbox{MatchStar}(m, \mbox{CharClass}(C)) \\
325\mbox{Match}(m, R*) & = &  m \vee \mbox{Match}(\mbox{Match}(m, R), R*) \\
326\mbox{Advance}(m) & = & m+m \\
327\mbox{MatchStar}(m, C) & = & (((m \wedge C) + C)  \oplus C) \vee m
328\end{eqnarray*}
329The recursive equation is implemented with a while loop.
330\end{frame}
331
332\begin{frame}[fragile]
333\frametitle{Simple UTF-8 Matching Operations}
334
335
336\begin{CJK}{UTF8}{gbsn}
337\begin{itemize}
338\item Search for 䜠奜 in UTF-8 text.
339\item Let ni3 represent the three byte sequence for 䜠.
340\item Let hao represent the three byte sequence for 奜。
341\item Let men represent the three byte sequence for 们。
342\end{itemize}
343\end{CJK}
344\begin{center}
345%\begin{tabular}{cr}\\
346\begin{tabular}{c@{\hspace{1em}}r}\\
347input data                                                         & \verb`ni3hao(Hello),ni3men(You),`\\
348$\text{CC}_{\text{ni3}}$                                           & \verb`..1.............1.........`\\
349$\text{CC}_{\text{hao}}$                                           & \verb`.....1....................`\\
350$m_0 = \mbox{\rm Initial}$                                         & \verb`1..1..111111111..1..111111`\\
351NonFinal                                                           & \verb`11.11.........11.11.......`\\
352$t_1 = \text{ScanThru}(m_0, \text{NonFinal})$                      & \verb`..1..111111111..1..1111111`\\
353$m_1 = \text{Advance}(t_1 \land \text{CC}_{\text{ni3}})$           & \verb`...1.............1........`\\
354$t_2 = \text{ScanThru}(m_1, \text{NonFinal})$                      & \verb`.....1.............1......`\\
355$m_2 = \text{Advance}(t_2 \land CC_{\text{hao}})$                  & \verb`......1...................`
356\end{tabular}
357\end{center}
358
359\end{frame}
360
361\begin{frame}[fragile]
362\frametitle{If Hierarchies}
363
364\begin{itemize}
365\item Unicode property classes may contain many thousands of codepoints.
366\item Evaluating all required equations may be very expensive.
367\item However, any 128-byte segment of input will involve only a few codepoint ranges.
368\item Evaluation of property classes is embedded in if-hierarchies of successively
369finer ranges.
370\item This techique greatly reduces the number of equations evaluated for each property.
371\end{itemize}
372
373\end{frame}
374
375\section{Architecture}
376\begin{frame}
377\frametitle{Putting It All Together}
378\begin{center}
379\Huge Architecture
380\end{center}
381\end{frame}
382
383\begin{frame}[fragile]
384\frametitle{Compilation Architecture}
385\begin{center}
386
387\pgfdeclarelayer{phases}
388\pgfsetlayers{phases,main}
389\tikzfading[name=fade down,
390  top color=transparent!0, bottom color=transparent!100]
391
392% Define block styles
393%\tikzstyle{decision} = [diamond, shape aspect=2, rotate=30, draw, text width=4.5em, text badly centered, inner sep=0pt, thick]
394\tikzstyle{block} = [rectangle, draw, fill=white, text width=12em, text centered, minimum height=1.5em, thick, font=\ttfamily\bfseries, node distance=2.5em]
395\tikzstyle{line} = [draw, ->, line width=1.4pt]
396\tikzstyle{separator} = [draw, line width=0.125em, dashed]
397\tikzset{block/.append style={execute at begin node=\footnotesize}}   
398\begin{tikzpicture}[node distance=3cm, auto, >=stealth]
399
400    % Place nodes
401    \node [draw=none] (RE) {\RegularExpression{}};
402    \node [block, below of=RE] (REParser) {\REParser{}};
403    \node [block, below of=REParser] (RETransform) {\RegularExpression{} Transformations};   
404%    \coordinate[below of=RETransform, node distance=3em] (Point);   
405%    \node [block, left of=Point, node distance=10em] (CUCompiler) {\CodeUnitCompiler{}};
406%    \node [block, right of=Point, node distance=10em] (RECompiler) {\RegularExpressionCompiler{}};
407    \node [block, below of=RETransform] (RECompiler) {\RegularExpressionCompiler{}};
408    \node [block, below of=RECompiler] (PabloTransform) {\Pablo{} Transformations};   
409    \node [block, below of=PabloTransform] (PabloCompiler) {\PabloCompiler{}};
410    \node [block, below of=PabloCompiler] (LLVMCompiler) {LLVM Compiler};
411    \node [draw=none, below of=LLVMCompiler, node distance=3.5em] (Matcher) {Dynamically-Generated Match Function};
412   
413    % Draw edges
414    \path [line] (RE) -- (REParser);
415    %\path [line] (RE) -- (PropertyExtraction);
416    \path [line] (REParser) -- (RETransform);
417%    \path [line] (RETransform) -| (CUCompiler);
418%    \path [line] (RETransform) -| (RECompiler);
419%    \path [line] (CUCompiler) |- (PabloTransform);
420 %   \path [line] (RECompiler) |- (PabloTransform);
421    \path [line] (RETransform) -- (RECompiler);
422    \path [line] (RECompiler) -- (PabloTransform);
423    \path [line] (PabloTransform) -- (PabloCompiler);
424    \path [line] (PabloCompiler) -- (LLVMCompiler);
425    \path [line] (LLVMCompiler) -- (Matcher);
426   
427    % Draw layers
428    \coordinate[right of=REParser, node distance=15em] (SR);
429    \coordinate[left of=REParser, node distance=15em] (SL);
430    \path [separator] (SL) -- (REParser);
431    \path [separator] (REParser) -- (SR);
432   
433    \coordinate[left of=RECompiler, node distance=15em] (PL);
434    \coordinate[right of=RECompiler, node distance=15em] (PR);
435    \path [separator] (PL) -- (RECompiler);
436    \path [separator] (RECompiler) -- (PR);
437    %\path [separator] (PL) -- (CUCompiler);
438%    \path [separator] (CUCompiler) -- (RECompiler);
439    %\path [separator] (RECompiler) -- (PR);
440
441    \coordinate[right of=PabloCompiler, node distance=15em] (LR);
442    \coordinate[left of=PabloCompiler, node distance=15em] (LL);
443    \path [separator] (LL) -- (PabloCompiler);
444    \path [separator] (PabloCompiler) -- (LR);   
445   
446    \coordinate[right of=LLVMCompiler, node distance=15em] (OR);
447    \coordinate[left of=LLVMCompiler, node distance=15em] (OL);
448    \path [separator] (OL) -- (LLVMCompiler);
449    \path [separator] (LLVMCompiler) -- (OR);       
450   
451    % Seperator text
452    \node [draw=none,anchor=west] at ($(SL)!0.5!(PL)$) {1)~\RegularExpression{} AST};
453    \node [draw=none,anchor=west] at ($(PL)!0.5!(LL)$) {2)~\Pablo{}};
454    \node [draw=none,anchor=west] at ($(LL)!0.5!(OL)$) {3)~LLVM};
455
456    \begin{pgfonlayer}{phases}
457        \path[fill=green!20,path fading=fade down, draw=none] (SL) rectangle (PR);
458        \path[fill=blue!20,path fading=fade down, draw=none] (PL) rectangle (LR);
459        \path[fill=red!20,path fading=fade down, draw=none] (LL) rectangle (OR);
460    \end{pgfonlayer}
461
462\end{tikzpicture}
463
464\end{center}
465\end{frame}
466
467\begin{frame}[fragile]
468\frametitle{Execution Architecture}
469
470\begin{center}
471
472\pgfdeclarelayer{threads}
473\pgfdeclarelayer{components}
474\pgfsetlayers{threads,main}
475
476\tikzstyle{block} = [rectangle, draw, fill=white, text width=12em, text centered, minimum height=1.75em, thick, font=\ttfamily\bfseries, node distance=3.5em]
477\tikzstyle{line} = [draw, ->, line width=1.4pt]
478\tikzstyle{separator} = [draw, line width=0.125em, dashed]
479\tikzstyle{thread} = [rectangle, corners=rounded, draw, text width=15em, text centered, minimum height=1.75em, thick, font=\ttfamily\bfseries, node distance=3.5em]
480\tikzset{block/.append style={execute at begin node=\footnotesize}}   
481\begin{tikzpicture}[node distance=3cm, auto, >=stealth]
482
483    % Place nodes
484    \node [draw=none] (InputData) {Streaming Input Data};
485    \node [block, below of=InputData] (S2P) {Transposition};
486    \node [block, below of=S2P] (RequiredStreamsGenerator) {Required Streams Generator};
487    \node [block, below of=RequiredStreamsGenerator] (JITFunction) {Dynamic Matcher};
488    \node [block, right of=JITFunction, node distance=16em] (NamedPropertyLibrary) {Named Property Library};
489    \node [block, below of=JITFunction] (MatchScanner) {Match Scanner};
490    \node [draw=none, below of=MatchScanner, node distance=3.5em] (OutputResult) {Streaming Output Result};
491   
492    % Draw edges
493    \path [line] (InputData) -- (S2P);
494    \path [line] (S2P) -- (RequiredStreamsGenerator);
495    \path [line] (RequiredStreamsGenerator) -- (JITFunction);
496    \path [line] (NamedPropertyLibrary) -- (JITFunction);
497    \path [line] (JITFunction) -- (MatchScanner);
498    \path [line] (MatchScanner) -- (OutputResult);
499   
500    \begin{pgfonlayer}{threads}
501        \path (S2P.north west)+(-.1,.5) node (a) {};
502        \path (S2P.south east)+(+1.7,-0.1) node (b) {};
503        \path[fill=green!20,rounded corners, draw=green, solid] (a) rectangle (b);
504        \node [draw=none,above=-0.04cm of S2P.north east] (t1) {\small Transposition Thread};
505
506        \path (RequiredStreamsGenerator.north west)+(-.1,.38) node (a) {};
507        \path (RequiredStreamsGenerator.south east)+(+2,-0.1) node (b) {};
508        \path[fill=blue!20,rounded corners, draw=blue, solid] (a) rectangle (b);
509        \node [draw=none,above=-0.04cm of RequiredStreamsGenerator.north east] (t1) {\small Stream Generator Thread};
510
511        \path (JITFunction.north west)+(-.1,.38) node (a) {};
512        \path (NamedPropertyLibrary.east |- MatchScanner.south)+(+.1,-0.1) node (b) {};
513        \path[fill=red!20,rounded corners, draw=red, solid] (a) rectangle (b);
514        \node [draw=none,above=-0.04cm of JITFunction.north east] (t1) {\small Matcher Thread};
515
516    \end{pgfonlayer}{threads}
517\end{tikzpicture}
518
519\end{center}
520\end{frame}
521
522
523\begin{frame}
524\frametitle{Performance Study Set-up}
525\begin{itemize}
526\item Search for lines containing characters in computed sets.
527\item Set expressions combine general category and script properties.
528\item Set intersection, union, difference, negation.
529\item Cover all general categories and scripts.
530\item 246 test expressions in all.
531\item Evaluate icgrep 1.0 vs. ugrep541 and pcre2grep.
532\item Measure performance against a large collection of Wikimedia
533  documents.
534\end{itemize}
535\end{frame}
536
537\begin{frame}
538\frametitle{Performance Comparison}
539\begin{center}
540\pgfplotstableread[col sep = comma]{data/icgrep-scatter.csv}\icgrep
541\pgfplotstableread[col sep = comma]{data/ugrep541-scatter.csv}\ugrep
542\pgfplotstableread[col sep = comma]{data/pcre2grep-scatter.csv}\pcre
543
544\begin{tikzpicture}
545\begin{axis}[
546grid=both,
547x tick label style={/pgf/number format/1000 sep=},
548ylabel={CPU Cycles Per Byte},
549xlabel={Percentage of Matching Lines},
550minor y tick num={1},
551xmax=100,
552height=0.5\textwidth,
553legend style={at={(1.05,0.5)},
554anchor=west,legend columns=1,
555align=left,draw=none,column sep=2ex}
556]
557\addplot+[no markers,line width=2pt,color=blue!60,solid] table {\icgrep};
558\addplot+[no markers,line width=2pt,color=red!60,solid] table {\ugrep};
559\addplot+[no markers,line width=2pt,color=brown!60,solid] table {\pcre};
560\legend{icgrep1.0,ugrep541,pcre2grep}
561\end{axis}
562
563
564
565
566\end{tikzpicture}
567\end{center}
568%\caption{Matching Performance for Simple Property Expressions}\label{fig:property_test}
569\end{frame}
570
571\begin{frame}
572\frametitle{Performance Comparison (Multithread)}
573
574\begin{itemize}
575\item Pipeline parallelism
576\item Up to 40\% speedup by hiding transposition and required stream generation.
577\item Combining the AVX2 ISA with multithreading gives and average overall 61\%
578\item Future work
579\begin{itemize} 
580\item Pipeline + data parallelism
581\item AVX-512 ISA
582\end{itemize}
583\end{itemize}
584\end{frame}
585
586\section{Conclusion}
587\begin{frame}[fragile]
588\frametitle{Results}
589\begin{itemize}%[<+->]
590\item Regular expression matching using bitwise data parallelism is well-suited
591to Unicode regular expression matching requirements.
592
593\item Performance is ultra-fast and robust against all kinds of nondeterminstic
594regular expressions.
595
596\item Performance improves as SIMD processor width increases.
597
598
599\item Open source implementation available: {\tt http://parabix.costar.sfu.ca/wiki/ICgrep}
600\end{itemize}
601
602\end{frame}
603
604
605\end{document}
Note: See TracBrowser for help on using the repository browser.