source: docs/Working/icGrep/pres.tex @ 4869

Last change on this file since 4869 was 4869, checked in by cameron, 4 years ago

Fix authors, date

File size: 20.9 KB
Line 
1\documentclass{beamer}
2\usepackage{default}
3\usepackage{comment}
4\usepackage{color}
5\usepackage{subfigure}
6\usepackage[T1]{fontenc} 
7\usepackage[utf8]{inputenc}
8\usepackage{CJK}
9\usepackage{verbatim}
10\usepackage{colortbl}
11\usepackage{tikz}
12\usetikzlibrary{shapes,positioning,arrows,calc,fadings}
13\usepackage{pgfplots}
14\usepackage{pgfplotstable}
15%\pgfplotsset{compat=1.10}
16
17\usepackage{bbding}
18
19\newcommand{\icGrep}[1]{icGrep}
20
21\def\RegularExpression{RegEx}
22\def\Pablo{Parabix}
23\def\CodeUnit{Code Unit}
24\def\REParser{\RegularExpression{} Parser}
25\def\CodeUnitCompiler{\CodeUnit{} Compiler}
26\def\RegularExpressionCompiler{\RegularExpression{} Compiler}
27\def\PabloCompiler{\Pablo{} Compiler}
28
29\usetheme{Madrid}
30
31
32\title[ICA3PP]{Bitwise Data Parallelism with LLVM: The ICgrep Case Study}
33\author[Rob Cameron]{Robert D. Cameron\and Nigel Medforth\and Dan Lin
34\and Dale Denis
35\and William N. Sumner}
36\institute[SFU]{School of Computing Science\\
37Simon Fraser University}
38\date{November 18, 2015}
39
40\begin{document}
41
42\begin{frame}
43\titlepage
44\end{frame}
45
46
47%\begin{frame}
48%\frametitle{Outline}
49%\tableofcontents%[pausesections]
50%\end{frame}
51
52\section{Motivation}
53\begin{frame}[fragile]
54\frametitle{Overview}
55\begin{block}<+->{The Parabix Regular Expression Project}
56
57\begin{itemize}
58\item Goal: building a modern, ultra-fast, Unicode-compliant, regular expression engine and
59  application framework.
60\item New algorithmic approach: bitwise data parallel matching.
61\item Performance target: Gigabyte/second regexp processing.
62\item Status: icgrep 1.0 is a full Unicode Level 1 grep replacement.
63\item Current work: Unicode Level 2 Regular Expression Support
64\item Longer-term: high-level application framework.
65\end{itemize}
66\end{block}
67\end{frame}
68
69
70\begin{frame}
71\frametitle{Performance Matters - Big Text}
72\begin{block}<+->{Text Analytics}
73\begin{itemize}
74\item 
75Text Analytics applications: very large textual data
76sets.
77\item Gigabyte to Terabyte scale.
78\item Regular expressions are often used in these applications.
79\item If performance were better, regular expressions could be applied
80  more widely.
81\end{itemize}
82\end{block}
83\begin{block}<+->{IBM System T Study}
84\begin{itemize}
85\item 
86Five customer workloads studied
87\item
88Four of five workloads dominated by regexp processing.
89\item 
9060\%-75\% of total execution time.
91\item
92Polig {\it et al.} 
93Giving text analytics a boost. {\em IEEE Micro}, {\bf 34}(4):6--14,
942014.
95\end{itemize}
96\end{block}
97\end{frame}
98\begin{frame}[fragile]
99\frametitle{Performance Matters - Catastrophic Backtracking}
100\begin{block}<+->{Nondeterministic Regular Expression Features}
101\begin{itemize}
102\item Ambiguity: regexes may match strings in multiple ways.
103\item Ex: match(\verb:^(\p{L})+t(\p{L})+$:, ``\verb:statistics:'')
104\item (\verb:s:, \verb:atistics:)(\verb:sta:, \verb:istics:), or
105  (\verb:statis:, \verb:ics:)
106\item Backtracking engines: potential runaway behaviour.
107\item Procedural workarounds:  \verb:^(\p{L})++t(\p{L})+$:  or \verb:^(?>(\p{L})+?)t(\p{L})+$:
108\item Declarative nature of regexps diminished.
109\end{itemize}
110\end{block}
111
112\begin{block}<+->{Web Site Security}
113\begin{itemize}
114\item Regular expressions may be used in web services.
115\item Malicious agents may inject ill-formed regular expressions.
116\item Web sites may be tied up (denial-of-service) or compromised.
117\end{itemize}
118\end{block}
119\end{frame}
120
121\begin{frame}[fragile]
122
123\frametitle{Much Better Performance is Possible}
124Unicode regular expression matching can be ultra-fast and robust.
125
126\begin{block}<+->{Example: email regex with Unicode properties}
127\begin{itemize}
128\item 
129\verb:([^\p{Z}<]+@[\p{L}\p{M}\p{N}.-]+\.:
130\verb:      (\p{L}\p{M}*){2,6})(>|\p{Z}|$):
131\item Compare time in CPU cycles per byte for 4 grep programs.
132\item Data source: 620 MB Wikibooks document set (15 languages)
133\item Processor: Intel Core i7-2600 CPU @ 3.40GHz
134\end{itemize}
135\end{block}
136
137%\item ugrep541: 549.5 cycles/byte
138%\item pcre2grep: 281.7 cycles/byte
139%\item icgrep4750: 4.2 cycles/byte
140%\item Speedup: 67X-130X reaches 810 MB/s
141\begin{block}<+->{Performance Results}
142\begin{center}
143\begin{tabular}{|c|r|r|r|r|} \hline
144Program & Elapsed sec & Cycles/Byte & Megabytes/sec \\ \hline
145ugrep561 &      180.7   & 1024.0 &      3.60 \\
146pcre2grep & 144.0       & 815.5 & 4.51  \\
147grep210  -P & 87.3      & 498.2 & 7.45 \\
148icgrep4750 &1.06 &      6.05 & 613.0 \\ \hline
149\end{tabular}
150\end{center}
151\end{block}
152\end{frame}
153
154
155\section{Bitwise Data Parallel Regular Expression Matching}
156\begin{frame}
157\frametitle{How It Works}
158\begin{center}
159\Huge Bitwise Data Parallel Regular Expression Matching
160\end{center}
161\end{frame}
162
163\begin{frame}[fragile]
164\frametitle{Beyond Byte-At-A-Time}
165\begin{itemize}[<+->]
166\item Traditional regular expression technology processes one code
167  unit at a time using DFA, NFA or backtracking implementations.
168\item Instead consider a bitwise data parallel approach.
169\item Byte-oriented data is first transformed to 8 parallel bit
170  streams. Bit stream $j$ consists of bit $j$ of each byte.
171\end{itemize}
172
173\begin{tabular}{cl}\\
174\onslide<3->input data  & \verb`a45a... (01100001 00110100 00110101 01100001...)`\\
175\onslide<4->$bit0$ & \verb`0000...`\\
176\onslide<5->$bit1$ & \verb`1001...`\\
177\onslide<6->$bit2$ & \verb`1111...`\\
178\onslide<6->$bit3$ & \verb`0110...`\\
179\onslide<6->$bit4$ & \verb`0000...`\\
180\onslide<6->$bit5$ & \verb`0110...`\\
181\onslide<6->$bit6$ & \verb`0000...`\\
182\onslide<6->$bit7$ & \verb`1011...`
183\end{tabular}
184\end{frame}
185
186
187\begin{frame}
188\frametitle{Beyond Byte-At-A-Time}
189\begin{itemize}
190\item Load 128-bit SIMD registers to process 128 positions at a time
191in bitwise data parallel fashion (SSE2, ARM Neon, ...).
192\item Or use 256-bit AVX2 registers of newer Intel processors.
193\item Process using bitwise logic, shifting and addition.
194\item Parabix methods have previously been used to accelerate
195Unicode transcoding and XML parsing.
196\end{itemize}
197
198\end{frame}
199
200
201\begin{frame}[fragile]
202\frametitle{Character Class Formation}
203\begin{itemize}
204\item Combining 8 bits of a code unit gives a character class stream.
205\item CCC({\tt cc\_a = [a]})
206\item 
207\begin{semiverbatim}
208temp1 = (bit1 &~ bit0)
209temp2 = (bit2 &~ bit3)
210temp3 = (temp1 & temp2)
211temp4 = (bit4 | bit5)
212temp5 = (bit7 &~ bit6)
213temp6 = (temp5 &~ temp4)
214cc\_a = (temp3 & temp6)
215\end{semiverbatim}
216\end{itemize}
217
218\begin{tabular}{cl}\\
219input data  & \verb`a45a...`\\
220\onslide$cc\_a$ & \verb`1001...`\\
221\end{tabular}
222
223\end{frame}
224
225\begin{frame}[fragile]
226\frametitle{Character Class Ranges}
227\begin{itemize}
228\item Ranges of characters are often very simple to compute.
229\item CCC({\tt cc\_0\_9 = [0-9]})
230\item 
231\begin{semiverbatim}
232temp7 = (bit0 | bit1)
233temp8 = (bit2 & bit3)
234temp9 = (temp8 &~ temp7)
235temp10 = (bit5 | bit6)
236temp11 = (bit4 & temp10)
237cc\_0\_9 = (temp9 &~ temp11)
238\end{semiverbatim}
239\end{itemize}
240\end{frame}
241
242
243\begin{frame}[fragile]
244\frametitle{Character Class Common Subexpressions}
245\begin{itemize}
246\item Multiple definitions use common subexpressions.
247\item CCC({\tt cc\_z9 = [z9]})
248\item 
249\begin{semiverbatim}
250temp12 = (bit4 &~ bit5)
251temp13 = (temp12 & temp5)
252temp14 = (temp9 & temp13)
253temp15 = (temp1 & temp8)
254temp16 = (bit6 &~ bit7)
255temp17 = (temp12 & temp16)
256temp18 = (temp15 & temp17)
257cc\_z9 = (temp14 | temp18)
258\end{semiverbatim}
259\end{itemize}
260\end{frame}
261
262
263\begin{frame}[fragile]
264\frametitle{Marker Streams}
265\begin{itemize}
266\item Marker stream $M_i$ indicates the positions that are reachable after item $i$ in the regular expression.
267\item Each marker stream $M_i$ has one bit for every input byte in the input file.
268\item $M_i[j] = 1$ if and only if a match to the regular expression up to and
269including item $i$ in the expression occurs at position $j-1$ in the input stream.
270\item Conceptually, marker streams are computed in parallel for all positions in the file
271at once (bitwise data parallelism).
272\item In practice, marker streams are computed block-by-block, where the
273block size is the size of a SIMD register in bits.
274\end{itemize}
275\end{frame}
276
277\begin{frame}[fragile]
278\frametitle{Marker Stream Example}
279\begin{itemize}[<+->]
280\item Consider matching regular expression \verb`a[0-9]*[z9]` against the input text below.
281\item<2->$M_1$ marks positions after occurrences of \verb:a:.
282\item<3->$M_2$ marks positions after occurrences of \verb:a[0-9]*:.
283\item<4->$M_3$ marks positions after occurrences of \verb:a[0-9]*[z9]:.
284\end{itemize}
285
286\begin{tabular}{cr}\\
287input data  & \verb`a453z--b3z--az--a12949z--ca22z7--`\\
288\onslide<2->$M_1$ & \verb`.1...........1...1.........1.....`\\
289\onslide<3->$M_2$ & \verb`.1111........1...111111....111...`\\
290\onslide<4->$M_3$ & \verb`.....1........1.....1.11......1..`
291\end{tabular}
292\end{frame}
293
294
295
296\begin{frame}[fragile]
297\frametitle{Matching Character Class Repetitions with MatchStar}
298\begin{itemize}[<+->]
299\item<2->$\text{MatchStar}(M, C) = (((M \wedge C) + C)  \oplus C) \vee M$
300\item<3->Consider $M_2 = \text{MatchStar}(M_1, C)$
301\item<4->Use addition to scan each marker through the class.
302\item<5->Bits that change represent matches.
303\item<6->We also have matches at start positions in $M_1$.
304\end{itemize}
305
306\begin{tabular}{cr}\\
307\onslide<3->input data  & \verb`a453z--b3z--az--a12949z--ca22z7--`\\
308\onslide<3->$M_1$ & \verb`.1...........1...1.........1.....`\\
309\onslide<3->$C = \text{\tt [0-9]}$ & \verb`.111....1........11111.....11.1..`\\
310\onslide<4->$T_0 = M_1 \wedge C$ & \verb`.1...............1.........1.....`\\
311\onslide<4->$T_1 = T_0 + C$ & \verb`....1...1.............1......11..`\\
312\onslide<5->$T_2 = T_1 \oplus C$ & \verb`.1111............111111....111...`\\
313\onslide<6->$M_2 = T_2 \vee M_1$ & \verb`.1111........1...111111....111...`
314\end{tabular}
315\end{frame}
316
317
318\begin{frame}[fragile]
319\frametitle{Matching Equations}
320The rules for bitwise data parallel regular expression matching can
321be summarized by these equations.
322\begin{eqnarray*}
323\mbox{Match}(m, C) & = &  \mbox{Advance}(\mbox{CharClass}(C) \wedge m) \\
324\mbox{Match}(m, RS) & = &  \mbox{Match}(\mbox{Match}(m, R), S) \\
325\mbox{Match}(m, R|S) & = &  \mbox{Match}(m, R) \vee \mbox{Match}(m, S)) \\
326\mbox{Match}(m, C*) & = &  \mbox{MatchStar}(m, \mbox{CharClass}(C)) \\
327\mbox{Match}(m, R*) & = &  m \vee \mbox{Match}(\mbox{Match}(m, R), R*) \\
328\mbox{Advance}(m) & = & m+m \\
329\mbox{MatchStar}(m, C) & = & (((m \wedge C) + C)  \oplus C) \vee m
330\end{eqnarray*}
331The recursive equation is implemented with a while loop.
332\end{frame}
333
334\begin{frame}[fragile]
335\frametitle{Simple UTF-8 Matching Operations}
336
337
338\begin{CJK}{UTF8}{gbsn}
339\begin{itemize}
340\item Search for 䜠奜 in UTF-8 text.
341\item Let ni3 represent the three byte sequence for 䜠.
342\item Let hao represent the three byte sequence for 奜。
343\item Let men represent the three byte sequence for 们。
344\end{itemize}
345\end{CJK}
346\begin{center}
347%\begin{tabular}{cr}\\
348\begin{tabular}{c@{\hspace{1em}}r}\\
349input data                                                         & \verb`ni3hao(Hello),ni3men(You),`\\
350$\text{CC}_{\text{ni3}}$                                           & \verb`..1.............1.........`\\
351$\text{CC}_{\text{hao}}$                                           & \verb`.....1....................`\\
352$m_0 = \mbox{\rm Initial}$                                         & \verb`1..1..111111111..1..111111`\\
353NonFinal                                                           & \verb`11.11.........11.11.......`\\
354$t_1 = \text{ScanThru}(m_0, \text{NonFinal})$                      & \verb`..1..111111111..1..1111111`\\
355$m_1 = \text{Advance}(t_1 \land \text{CC}_{\text{ni3}})$           & \verb`...1.............1........`\\
356$t_2 = \text{ScanThru}(m_1, \text{NonFinal})$                      & \verb`.....1.............1......`\\
357$m_2 = \text{Advance}(t_2 \land CC_{\text{hao}})$                  & \verb`......1...................`
358\end{tabular}
359\end{center}
360
361\end{frame}
362
363\begin{frame}[fragile]
364\frametitle{If Hierarchies}
365
366\begin{itemize}
367\item Unicode property classes may contain many thousands of codepoints.
368\item Evaluating all required equations may be very expensive.
369\item However, any 128-byte segment of input will involve only a few codepoint ranges.
370\item Evaluation of property classes is embedded in if-hierarchies of successively
371finer ranges.
372\item This techique greatly reduces the number of equations evaluated for each property.
373\end{itemize}
374
375\end{frame}
376
377\section{Architecture}
378\begin{frame}
379\frametitle{Putting It All Together}
380\begin{center}
381\Huge Architecture
382\end{center}
383\end{frame}
384
385\begin{frame}[fragile]
386\frametitle{Compilation Architecture}
387\begin{center}
388
389\pgfdeclarelayer{phases}
390\pgfsetlayers{phases,main}
391\tikzfading[name=fade down,
392  top color=transparent!0, bottom color=transparent!100]
393
394% Define block styles
395%\tikzstyle{decision} = [diamond, shape aspect=2, rotate=30, draw, text width=4.5em, text badly centered, inner sep=0pt, thick]
396\tikzstyle{block} = [rectangle, draw, fill=white, text width=12em, text centered, minimum height=1.5em, thick, font=\ttfamily\bfseries, node distance=2.5em]
397\tikzstyle{line} = [draw, ->, line width=1.4pt]
398\tikzstyle{separator} = [draw, line width=0.125em, dashed]
399\tikzset{block/.append style={execute at begin node=\footnotesize}}   
400\begin{tikzpicture}[node distance=3cm, auto, >=stealth]
401
402    % Place nodes
403    \node [draw=none] (RE) {\RegularExpression{}};
404    \node [block, below of=RE] (REParser) {\REParser{}};
405    \node [block, below of=REParser] (RETransform) {\RegularExpression{} Transformations};   
406%    \coordinate[below of=RETransform, node distance=3em] (Point);   
407%    \node [block, left of=Point, node distance=10em] (CUCompiler) {\CodeUnitCompiler{}};
408%    \node [block, right of=Point, node distance=10em] (RECompiler) {\RegularExpressionCompiler{}};
409    \node [block, below of=RETransform] (RECompiler) {\RegularExpressionCompiler{}};
410    \node [block, below of=RECompiler] (PabloTransform) {\Pablo{} Transformations};   
411    \node [block, below of=PabloTransform] (PabloCompiler) {\PabloCompiler{}};
412    \node [block, below of=PabloCompiler] (LLVMCompiler) {LLVM Compiler};
413    \node [draw=none, below of=LLVMCompiler, node distance=3.5em] (Matcher) {Dynamically-Generated Match Function};
414   
415    % Draw edges
416    \path [line] (RE) -- (REParser);
417    %\path [line] (RE) -- (PropertyExtraction);
418    \path [line] (REParser) -- (RETransform);
419%    \path [line] (RETransform) -| (CUCompiler);
420%    \path [line] (RETransform) -| (RECompiler);
421%    \path [line] (CUCompiler) |- (PabloTransform);
422 %   \path [line] (RECompiler) |- (PabloTransform);
423    \path [line] (RETransform) -- (RECompiler);
424    \path [line] (RECompiler) -- (PabloTransform);
425    \path [line] (PabloTransform) -- (PabloCompiler);
426    \path [line] (PabloCompiler) -- (LLVMCompiler);
427    \path [line] (LLVMCompiler) -- (Matcher);
428   
429    % Draw layers
430    \coordinate[right of=REParser, node distance=15em] (SR);
431    \coordinate[left of=REParser, node distance=15em] (SL);
432    \path [separator] (SL) -- (REParser);
433    \path [separator] (REParser) -- (SR);
434   
435    \coordinate[left of=RECompiler, node distance=15em] (PL);
436    \coordinate[right of=RECompiler, node distance=15em] (PR);
437    \path [separator] (PL) -- (RECompiler);
438    \path [separator] (RECompiler) -- (PR);
439    %\path [separator] (PL) -- (CUCompiler);
440%    \path [separator] (CUCompiler) -- (RECompiler);
441    %\path [separator] (RECompiler) -- (PR);
442
443    \coordinate[right of=PabloCompiler, node distance=15em] (LR);
444    \coordinate[left of=PabloCompiler, node distance=15em] (LL);
445    \path [separator] (LL) -- (PabloCompiler);
446    \path [separator] (PabloCompiler) -- (LR);   
447   
448    \coordinate[right of=LLVMCompiler, node distance=15em] (OR);
449    \coordinate[left of=LLVMCompiler, node distance=15em] (OL);
450    \path [separator] (OL) -- (LLVMCompiler);
451    \path [separator] (LLVMCompiler) -- (OR);       
452   
453    % Seperator text
454    \node [draw=none,anchor=west] at ($(SL)!0.5!(PL)$) {1)~\RegularExpression{} AST};
455    \node [draw=none,anchor=west] at ($(PL)!0.5!(LL)$) {2)~\Pablo{}};
456    \node [draw=none,anchor=west] at ($(LL)!0.5!(OL)$) {3)~LLVM};
457
458    \begin{pgfonlayer}{phases}
459        \path[fill=green!20,path fading=fade down, draw=none] (SL) rectangle (PR);
460        \path[fill=blue!20,path fading=fade down, draw=none] (PL) rectangle (LR);
461        \path[fill=red!20,path fading=fade down, draw=none] (LL) rectangle (OR);
462    \end{pgfonlayer}
463
464\end{tikzpicture}
465
466\end{center}
467\end{frame}
468
469\begin{frame}[fragile]
470\frametitle{Execution Architecture}
471
472\begin{center}
473
474\pgfdeclarelayer{threads}
475\pgfdeclarelayer{components}
476\pgfsetlayers{threads,main}
477
478\tikzstyle{block} = [rectangle, draw, fill=white, text width=12em, text centered, minimum height=1.75em, thick, font=\ttfamily\bfseries, node distance=3.5em]
479\tikzstyle{line} = [draw, ->, line width=1.4pt]
480\tikzstyle{separator} = [draw, line width=0.125em, dashed]
481\tikzstyle{thread} = [rectangle, corners=rounded, draw, text width=15em, text centered, minimum height=1.75em, thick, font=\ttfamily\bfseries, node distance=3.5em]
482\tikzset{block/.append style={execute at begin node=\footnotesize}}   
483\begin{tikzpicture}[node distance=3cm, auto, >=stealth]
484
485    % Place nodes
486    \node [draw=none] (InputData) {Streaming Input Data};
487    \node [block, below of=InputData] (S2P) {Transposition};
488    \node [block, below of=S2P] (RequiredStreamsGenerator) {Required Streams Generator};
489    \node [block, below of=RequiredStreamsGenerator] (JITFunction) {Dynamic Matcher};
490    \node [block, right of=JITFunction, node distance=16em] (NamedPropertyLibrary) {Named Property Library};
491    \node [block, below of=JITFunction] (MatchScanner) {Match Scanner};
492    \node [draw=none, below of=MatchScanner, node distance=3.5em] (OutputResult) {Streaming Output Result};
493   
494    % Draw edges
495    \path [line] (InputData) -- (S2P);
496    \path [line] (S2P) -- (RequiredStreamsGenerator);
497    \path [line] (RequiredStreamsGenerator) -- (JITFunction);
498    \path [line] (NamedPropertyLibrary) -- (JITFunction);
499    \path [line] (JITFunction) -- (MatchScanner);
500    \path [line] (MatchScanner) -- (OutputResult);
501   
502    \begin{pgfonlayer}{threads}
503        \path (S2P.north west)+(-.1,.5) node (a) {};
504        \path (S2P.south east)+(+1.7,-0.1) node (b) {};
505        \path[fill=green!20,rounded corners, draw=green, solid] (a) rectangle (b);
506        \node [draw=none,above=-0.04cm of S2P.north east] (t1) {\small Transposition Thread};
507
508        \path (RequiredStreamsGenerator.north west)+(-.1,.38) node (a) {};
509        \path (RequiredStreamsGenerator.south east)+(+2,-0.1) node (b) {};
510        \path[fill=blue!20,rounded corners, draw=blue, solid] (a) rectangle (b);
511        \node [draw=none,above=-0.04cm of RequiredStreamsGenerator.north east] (t1) {\small Stream Generator Thread};
512
513        \path (JITFunction.north west)+(-.1,.38) node (a) {};
514        \path (NamedPropertyLibrary.east |- MatchScanner.south)+(+.1,-0.1) node (b) {};
515        \path[fill=red!20,rounded corners, draw=red, solid] (a) rectangle (b);
516        \node [draw=none,above=-0.04cm of JITFunction.north east] (t1) {\small Matcher Thread};
517
518    \end{pgfonlayer}{threads}
519\end{tikzpicture}
520
521\end{center}
522\end{frame}
523
524
525\begin{frame}
526\frametitle{Performance Study Set-up}
527\begin{itemize}
528\item Search for lines containing characters in computed sets.
529\item Set expressions combine general category and script properties.
530\item Set intersection, union, difference, negation.
531\item Cover all general categories and scripts.
532\item 246 test expressions in all.
533\item Evaluate icgrep 1.0 vs. ugrep541 and pcre2grep.
534\item Measure performance against a large collection of Wikimedia
535  documents.
536\end{itemize}
537\end{frame}
538
539\begin{frame}
540\frametitle{Performance Comparison}
541\begin{center}
542\pgfplotstableread[col sep = comma]{data/icgrep-scatter.csv}\icgrep
543\pgfplotstableread[col sep = comma]{data/ugrep541-scatter.csv}\ugrep
544\pgfplotstableread[col sep = comma]{data/pcre2grep-scatter.csv}\pcre
545
546\begin{tikzpicture}
547\begin{axis}[
548grid=both,
549x tick label style={/pgf/number format/1000 sep=},
550ylabel={CPU Cycles Per Byte},
551xlabel={Percentage of Matching Lines},
552minor y tick num={1},
553xmax=100,
554height=0.5\textwidth,
555legend style={at={(1.05,0.5)},
556anchor=west,legend columns=1,
557align=left,draw=none,column sep=2ex}
558]
559\addplot+[no markers,line width=2pt,color=blue!60,solid] table {\icgrep};
560\addplot+[no markers,line width=2pt,color=red!60,solid] table {\ugrep};
561\addplot+[no markers,line width=2pt,color=brown!60,solid] table {\pcre};
562\legend{icgrep1.0,ugrep541,pcre2grep}
563\end{axis}
564
565
566
567
568\end{tikzpicture}
569\end{center}
570%\caption{Matching Performance for Simple Property Expressions}\label{fig:property_test}
571\end{frame}
572
573\begin{frame}
574\frametitle{Performance Comparison (Multithread)}
575
576\begin{itemize}
577\item Pipeline parallelism
578\item Up to 40\% speedup by hiding transposition and required stream generation.
579\item Combining the AVX2 ISA with multithreading gives and average overall 61\%
580\item Future work
581\begin{itemize} 
582\item Pipeline + data parallelism
583\item AVX-512 ISA
584\end{itemize}
585\end{itemize}
586\end{frame}
587
588\section{Conclusion}
589\begin{frame}[fragile]
590\frametitle{Results}
591\begin{itemize}%[<+->]
592\item Regular expression matching using bitwise data parallelism is well-suited
593to Unicode regular expression matching requirements.
594
595\item Performance is ultra-fast and robust against all kinds of nondeterminstic
596regular expressions.
597
598\item Performance improves as SIMD processor width increases.
599
600
601\item Open source implementation available: {\tt http://parabix.costar.sfu.ca/wiki/ICgrep}
602\end{itemize}
603
604\end{frame}
605
606
607\end{document}
Note: See TracBrowser for help on using the repository browser.