source: docs/Working/icGrep/pres.tex @ 4875

Last change on this file since 4875 was 4875, checked in by cameron, 4 years ago

Final version of presentation at ICA3PP

File size: 21.7 KB
Line 
1\documentclass{beamer}
2\usepackage{default}
3\usepackage{comment}
4\usepackage{color}
5\usepackage{subfigure}
6\usepackage[T1]{fontenc} 
7\usepackage[utf8]{inputenc}
8\usepackage{CJK}
9\usepackage{verbatim}
10\usepackage{colortbl}
11\usepackage{tikz}
12\usetikzlibrary{shapes,positioning,arrows,calc,fadings}
13\usepackage{pgfplots}
14\usepackage{pgfplotstable}
15%\pgfplotsset{compat=1.10}
16
17\usepackage{bbding}
18
19\newcommand{\icGrep}[1]{icGrep}
20
21\def\RegularExpression{RegEx}
22\def\Pablo{Parabix}
23\def\CodeUnit{Code Unit}
24\def\REParser{\RegularExpression{} Parser}
25\def\CodeUnitCompiler{\CodeUnit{} Compiler}
26\def\RegularExpressionCompiler{\RegularExpression{} Compiler}
27\def\PabloCompiler{\Pablo{} Compiler}
28
29\usetheme{Madrid}
30
31
32\title[ICA3PP 2015]{Bitwise Data Parallelism with LLVM: The ICgrep Case Study}
33\author[Cameron et al]{Robert D. Cameron\and Nigel Medforth\and Dan Lin
34\and Dale Denis
35\and William N. Sumner}
36\institute[SFU]{School of Computing Science\\
37Simon Fraser University}
38\date{November 18, 2015}
39
40\begin{document}
41
42\begin{frame}
43\titlepage
44\end{frame}
45
46
47%\begin{frame}
48%\frametitle{Outline}
49%\tableofcontents%[pausesections]
50%\end{frame}
51
52\section{Motivation}
53\begin{frame}[fragile]
54\frametitle{Overview}
55\begin{block}<+->{The Parabix Regular Expression Project}
56
57\begin{itemize}
58\item Goal: building a modern, ultra-fast, Unicode-compliant, regular expression engine and
59  application framework.
60\item New algorithmic approach: bitwise data parallel matching.
61\item Performance target: Gigabyte/second regexp processing.
62\item Status: icgrep 1.0 is a full Unicode Level 1 grep replacement.
63\item Current work: Unicode Level 2 Regular Expression Support
64\item Longer-term: high-level application framework.
65\end{itemize}
66\end{block}
67\end{frame}
68
69
70\begin{frame}
71\frametitle{Performance Matters - Big Text}
72\begin{block}<+->{Text Analytics}
73\begin{itemize}
74\item 
75Text Analytics applications: very large textual data
76sets.
77\item Gigabyte to Terabyte scale.
78\item Regular expressions are often used in these applications.
79\end{itemize}
80\end{block}
81\begin{block}<+->{IBM System T Study}
82\begin{itemize}
83\item 
84Five customer workloads studied
85\item
86Four of five workloads dominated by regexp processing.
87\item 
8860\%-75\% of total execution time.
89\item
90Polig {\it et al.} 
91Giving text analytics a boost. {\em IEEE Micro}, {\bf 34},
922014.
93\end{itemize}
94\end{block}
95\end{frame}
96
97\begin{frame}[fragile]
98
99\frametitle{Much Better Performance is Possible}
100Unicode regular expression matching can be ultra-fast and robust.
101
102\begin{block}<+->{Example: email regex with Unicode properties}
103\begin{itemize}
104\item 
105\verb:([^\p{Z}<]+@[\p{L}\p{M}\p{N}.-]+\.:
106\verb:      (\p{L}\p{M}*){2,6})(>|\p{Z}|$):
107\item Compare time in CPU cycles per byte for 4 grep programs.
108\item Data source: 620 MB Wikibooks document set (15 languages)
109\item Processor: Intel Core i7-2600 CPU @ 3.40GHz
110\end{itemize}
111\end{block}
112
113%\item ugrep541: 549.5 cycles/byte
114%\item pcre2grep: 281.7 cycles/byte
115%\item icgrep4750: 4.2 cycles/byte
116%\item Speedup: 67X-130X reaches 810 MB/s
117\begin{block}<+->{Performance Results}
118\begin{center}
119\begin{tabular}{|c|r|r|r|r|} \hline
120Program & Elapsed sec & Cycles/Byte & Megabytes/sec \\ \hline
121ugrep561 &      180.7   & 1024.0 &      3.60 \\
122pcre2grep & 144.0       & 815.5 & 4.51  \\
123grep210  -P & 87.3      & 498.2 & 7.45 \\
124icgrep4750 &1.06 &      6.05 & 613.0 \\ \hline
125\end{tabular}
126\end{center}
127\end{block}
128\end{frame}
129
130
131\section{Bitwise Data Parallel Regular Expression Matching}
132\begin{frame}
133\frametitle{How It Works}
134\begin{center}
135\Huge Bitwise Data Parallel Regular Expression Matching
136\end{center}
137\end{frame}
138
139\begin{frame}[fragile]
140\frametitle{Beyond Byte-At-A-Time}
141\begin{itemize}%[<+->]
142\item Traditional regular expression technology processes one code
143  unit at a time using DFA, NFA or backtracking implementations.
144\item Instead consider a bitwise data parallel approach.
145\item Byte-oriented data is first transformed to 8 parallel bit
146  streams. Bit stream $j$ consists of bit $j$ of each byte.
147\end{itemize}
148
149\begin{tabular}{cl}\\
150\onslide<3->input data  & \verb`a45a... (01100001 00110100 00110101 01100001...)`\\
151\onslide<4->$bit0$ & \verb`0000...`\\
152\onslide<5->$bit1$ & \verb`1001...`\\
153\onslide<6->$bit2$ & \verb`1111...`\\
154\onslide<6->$bit3$ & \verb`0110...`\\
155\onslide<6->$bit4$ & \verb`0000...`\\
156\onslide<6->$bit5$ & \verb`0110...`\\
157\onslide<6->$bit6$ & \verb`0000...`\\
158\onslide<6->$bit7$ & \verb`1011...`
159\end{tabular}
160\end{frame}
161
162
163\begin{frame}
164\frametitle{Beyond Byte-At-A-Time}
165\begin{itemize}
166\item Load 128-bit SIMD registers to process 128 positions at a time
167in bitwise data parallel fashion (SSE2, ARM Neon, ...).
168\item Or use 256-bit AVX2 registers of newer Intel processors.
169\item Process using bitwise logic, shifting and addition.
170\item Parabix methods have previously been used to accelerate
171Unicode transcoding and XML parsing.
172\end{itemize}
173
174\end{frame}
175
176
177\begin{frame}[fragile]
178\frametitle{Character Class Formation}
179\begin{itemize}
180\item Combining 8 bits of a code unit gives a character class stream.
181\item CCC({\tt cc\_a = [a]})
182\item 
183\begin{semiverbatim}
184temp1 = (bit1 &~ bit0)
185temp2 = (bit2 &~ bit3)
186temp3 = (temp1 & temp2)
187temp4 = (bit4 | bit5)
188temp5 = (bit7 &~ bit6)
189temp6 = (temp5 &~ temp4)
190cc\_a = (temp3 & temp6)
191\end{semiverbatim}
192\end{itemize}
193
194\begin{tabular}{cl}\\
195input data  & \verb`a45a...`\\
196$cc\_a$ & \verb`1001...`\\
197\end{tabular}
198
199\end{frame}
200
201\begin{frame}[fragile]
202\frametitle{Character Class Ranges}
203\begin{itemize}
204\item Ranges of characters are often very simple to compute.
205\item CCC({\tt cc\_0\_9 = [0-9]})
206\item 
207\begin{semiverbatim}
208temp7 = (bit0 | bit1)
209temp8 = (bit2 & bit3)
210temp9 = (temp8 &~ temp7)
211temp10 = (bit5 | bit6)
212temp11 = (bit4 & temp10)
213cc\_0\_9 = (temp9 &~ temp11)
214\end{semiverbatim}
215\end{itemize}
216\end{frame}
217
218
219\begin{frame}[fragile]
220\frametitle{Character Class Common Subexpressions}
221\begin{itemize}
222\item Multiple definitions use common subexpressions.
223\item CCC({\tt cc\_z9 = [z9]})
224\item 
225\begin{semiverbatim}
226temp12 = (bit4 &~ bit5)
227temp13 = (temp12 & temp5)
228temp14 = (temp9 & temp13)
229temp15 = (temp1 & temp8)
230temp16 = (bit6 &~ bit7)
231temp17 = (temp12 & temp16)
232temp18 = (temp15 & temp17)
233cc\_z9 = (temp14 | temp18)
234\end{semiverbatim}
235\end{itemize}
236\end{frame}
237
238
239\begin{frame}[fragile]
240\frametitle{Marker Streams}
241\begin{itemize}
242\item Marker stream $M_i$ indicates the positions that are reachable after item $i$ in the regular expression.
243\item Each marker stream $M_i$ has one bit for every input byte in the input file.
244\item $M_i[j] = 1$ if and only if a match to the regular expression up to and
245including item $i$ in the expression occurs at position $j-1$ in the input stream.
246\item Conceptually, marker streams are computed in parallel for all positions in the file
247at once (bitwise data parallelism).
248\item In practice, marker streams are computed block-by-block, where the
249block size is the size of a SIMD register in bits.
250\end{itemize}
251\end{frame}
252
253\begin{frame}[fragile]
254\frametitle{Marker Stream Example}
255\begin{itemize}%[<+->]
256\item Consider matching regular expression \verb`a[0-9]*[z9]` against the input text below.
257\item $M_1$ marks positions after occurrences of \verb:a:.
258\item $M_2$ marks positions after occurrences of \verb:a[0-9]*:.
259\item $M_3$ marks positions after occurrences of \verb:a[0-9]*[z9]:.
260\end{itemize}
261
262\begin{tabular}{cr}\\
263input data  & \verb`a453z--b3z--az--a12949z--ca22z7--`\\
264$M_1$ & \verb`.1...........1...1.........1.....`\\
265$M_2$ & \verb`.1111........1...111111....111...`\\
266$M_3$ & \verb`.....1........1.....1.11......1..`
267\end{tabular}
268\end{frame}
269
270
271
272\begin{frame}[fragile]
273\frametitle{Matching Character Class Repetitions with MatchStar}
274\begin{itemize}%[<+->]
275\item<2->$\text{MatchStar}(M, C) = (((M \wedge C) + C)  \oplus C) \vee M$
276\item<3->Consider $M_2 = \text{MatchStar}(M_1, C)$
277\item<4->Use addition to scan each marker through the class.
278\item<5->Bits that change represent matches.
279\item<6->We also have matches at start positions in $M_1$.
280\end{itemize}
281
282\begin{tabular}{cr}\\
283\onslide<3->input data  & \verb`a453z--b3z--az--a12949z--ca22z7--`\\
284\onslide<3->$M_1$ & \verb`.1...........1...1.........1.....`\\
285\onslide<3->$C = \text{\tt [0-9]}$ & \verb`.111....1........11111.....11.1..`\\
286\onslide<4->$T_0 = M_1 \wedge C$ & \verb`.1...............1.........1.....`\\
287\onslide<4->$T_1 = T_0 + C$ & \verb`....1...1.............1......11..`\\
288\onslide<5->$T_2 = T_1 \oplus C$ & \verb`.1111............111111....111...`\\
289\onslide<6->$M_2 = T_2 \vee M_1$ & \verb`.1111........1...111111....111...`
290\end{tabular}
291\end{frame}
292
293
294\begin{frame}[fragile]
295\frametitle{Matching Equations}
296The rules for bitwise data parallel regular expression matching can
297be summarized by these equations.
298\begin{eqnarray*}
299\mbox{Match}(m, C) & = &  \mbox{Advance}(\mbox{CharClass}(C) \wedge m) \\
300\mbox{Match}(m, RS) & = &  \mbox{Match}(\mbox{Match}(m, R), S) \\
301\mbox{Match}(m, R|S) & = &  \mbox{Match}(m, R) \vee \mbox{Match}(m, S)) \\
302\mbox{Match}(m, C*) & = &  \mbox{MatchStar}(m, \mbox{CharClass}(C)) \\
303\mbox{Match}(m, R*) & = &  m \vee \mbox{Match}(\mbox{Match}(m, R), R*) \\
304\mbox{Advance}(m) & = & m+m \\
305\mbox{MatchStar}(m, C) & = & (((m \wedge C) + C)  \oplus C) \vee m
306\end{eqnarray*}
307The recursive equation is implemented with a while loop.
308\end{frame}
309
310
311\section{Unicode Regular Expression Matching with Parabix}
312\begin{frame}
313\frametitle{Unicode}
314\begin{center}
315\Huge UTF-8 Regular Expression Matching
316\end{center}
317\end{frame}
318
319\begin{frame}[fragile]
320\frametitle{UTF-8 Byte Classification and Validation}
321
322\begin{eqnarray*}
323\mbox{ASCII} & = & \mbox{CharClass}(\verb:[\x00-\x7F]:) \\
324\mbox{Prefix} & = & \mbox{CharClass}(\verb:[\xC2-\F4]:) \\
325\mbox{Pfx3or4} & = & \mbox{CharClass}(\verb:[\xE0-\xF4]:) \\
326\mbox{Prefix4} & = & \mbox{CharClass}(\verb:[\xF0-\xF4]:) \\
327\mbox{Suffix} & = & \mbox{CharClass}(\verb:[\x80-\xBF]:) \\
328\mbox{Scope} & = & \mbox{Advance}(\mbox{Prefix}) \vee \mbox{Advance}(\mbox{Pfx3or4},2) \vee \mbox{Advance}(\mbox{Prefix4}, 3) \\
329\mbox{Mismatch} & = & \mbox{Scope} \oplus \mbox{Suffix} \\
330\mbox{Initial} & = & \mbox{ASCII} \vee \mbox{Prefix} \\
331\mbox{NonFinal} & = & \mbox{Prefix} \vee \mbox{Advance}(\mbox{Pfx3or4}) \vee \mbox{Advance}(\mbox{Prefix4}, 2)
332\end{eqnarray*}
333\end{frame}
334
335
336\begin{frame}[fragile]
337\frametitle{UTF-8 Matching Operations}
338
339Advance and Matchstar can both be adapted for UTF-8 matching.
340Let U8Class($U$) be the stream marking the {\em final} UTF-8 byte position of
341occurrences of members of the class.
342
343\begin{eqnarray*}
344\mbox{Match}(m, U) & = & \mbox{Advance}(\mbox{ScanThru}(m, \mbox{NonFinal}) \wedge \mbox{U8Class}(U)) \\
345\mbox{Match}(m, U*) & = & \mbox{MatchStar}(m, \mbox{U8Class}(U) \vee \mbox{NonFinal}) \wedge \mbox{Initial}\\
346\mbox{ScanThru}(m, C) & = & (m + C) \wedge \neg C\\
347\end{eqnarray*}
348\end{frame}
349
350\begin{frame}[fragile]
351\frametitle{Simple UTF-8 Matching Operations}
352
353
354\begin{CJK}{UTF8}{gbsn}
355\begin{itemize}
356\item Search for 䜠奜 in UTF-8 text.
357\item Let ni3 represent the three byte sequence for 䜠.
358\item Let hao represent the three byte sequence for 奜。
359\item Let men represent the three byte sequence for 们。
360\end{itemize}
361\end{CJK}
362\begin{center}
363%\begin{tabular}{cr}\\
364\begin{tabular}{c@{\hspace{1em}}r}\\
365input data                                                         & \verb`ni3hao(Hello),ni3men(You),`\\
366$\text{CC}_{\text{ni3}}$                                           & \verb`..1.............1.........`\\
367$\text{CC}_{\text{hao}}$                                           & \verb`.....1....................`\\
368$m_0 = \mbox{\rm Initial}$                                         & \verb`1..1..111111111..1..111111`\\
369NonFinal                                                           & \verb`11.11.........11.11.......`\\
370$t_1 = \text{ScanThru}(m_0, \text{NonFinal})$                      & \verb`..1..111111111..1..1111111`\\
371$m_1 = \text{Advance}(t_1 \land \text{CC}_{\text{ni3}})$           & \verb`...1.............1........`\\
372$t_2 = \text{ScanThru}(m_1, \text{NonFinal})$                      & \verb`.....1.............1......`\\
373$m_2 = \text{Advance}(t_2 \land CC_{\text{hao}})$                  & \verb`......1...................`
374\end{tabular}
375\end{center}
376
377\end{frame}
378
379\begin{frame}[fragile]
380\frametitle{If Hierarchies}
381
382\begin{itemize}
383\item Unicode property classes may contain many thousands of codepoints.
384\item Evaluating all required equations may be very expensive.
385\item However, any 128-byte segment of input will involve only a few codepoint ranges.
386\item Evaluation of property classes is embedded in if-hierarchies of successively
387finer ranges.
388\item This techique greatly reduces the number of equations evaluated for each property.
389\end{itemize}
390
391\end{frame}
392
393\section{Architecture}
394\begin{frame}
395\frametitle{Putting It All Together}
396\begin{center}
397\Huge Architecture
398\end{center}
399\end{frame}
400
401\begin{frame}[fragile]
402\frametitle{Compilation Architecture}
403\begin{center}
404
405\pgfdeclarelayer{phases}
406\pgfsetlayers{phases,main}
407\tikzfading[name=fade down,
408  top color=transparent!0, bottom color=transparent!100]
409
410% Define block styles
411%\tikzstyle{decision} = [diamond, shape aspect=2, rotate=30, draw, text width=4.5em, text badly centered, inner sep=0pt, thick]
412\tikzstyle{block} = [rectangle, draw, fill=white, text width=12em, text centered, minimum height=1.5em, thick, font=\ttfamily\bfseries, node distance=2.5em]
413\tikzstyle{line} = [draw, ->, line width=1.4pt]
414\tikzstyle{separator} = [draw, line width=0.125em, dashed]
415\tikzset{block/.append style={execute at begin node=\footnotesize}}   
416\begin{tikzpicture}[node distance=3cm, auto, >=stealth]
417
418    % Place nodes
419    \node [draw=none] (RE) {\RegularExpression{}};
420    \node [block, below of=RE] (REParser) {\REParser{}};
421    \node [block, below of=REParser] (RETransform) {\RegularExpression{} Transformations};   
422%    \coordinate[below of=RETransform, node distance=3em] (Point);   
423%    \node [block, left of=Point, node distance=10em] (CUCompiler) {\CodeUnitCompiler{}};
424%    \node [block, right of=Point, node distance=10em] (RECompiler) {\RegularExpressionCompiler{}};
425    \node [block, below of=RETransform] (RECompiler) {\RegularExpressionCompiler{}};
426    \node [block, below of=RECompiler] (PabloTransform) {\Pablo{} Transformations};   
427    \node [block, below of=PabloTransform] (PabloCompiler) {\PabloCompiler{}};
428    \node [block, below of=PabloCompiler] (LLVMCompiler) {LLVM Compiler};
429    \node [draw=none, below of=LLVMCompiler, node distance=3.5em] (Matcher) {Dynamically-Generated Match Function};
430   
431    % Draw edges
432    \path [line] (RE) -- (REParser);
433    %\path [line] (RE) -- (PropertyExtraction);
434    \path [line] (REParser) -- (RETransform);
435%    \path [line] (RETransform) -| (CUCompiler);
436%    \path [line] (RETransform) -| (RECompiler);
437%    \path [line] (CUCompiler) |- (PabloTransform);
438 %   \path [line] (RECompiler) |- (PabloTransform);
439    \path [line] (RETransform) -- (RECompiler);
440    \path [line] (RECompiler) -- (PabloTransform);
441    \path [line] (PabloTransform) -- (PabloCompiler);
442    \path [line] (PabloCompiler) -- (LLVMCompiler);
443    \path [line] (LLVMCompiler) -- (Matcher);
444   
445    % Draw layers
446    \coordinate[right of=REParser, node distance=15em] (SR);
447    \coordinate[left of=REParser, node distance=15em] (SL);
448    \path [separator] (SL) -- (REParser);
449    \path [separator] (REParser) -- (SR);
450   
451    \coordinate[left of=RECompiler, node distance=15em] (PL);
452    \coordinate[right of=RECompiler, node distance=15em] (PR);
453    \path [separator] (PL) -- (RECompiler);
454    \path [separator] (RECompiler) -- (PR);
455    %\path [separator] (PL) -- (CUCompiler);
456%    \path [separator] (CUCompiler) -- (RECompiler);
457    %\path [separator] (RECompiler) -- (PR);
458
459    \coordinate[right of=PabloCompiler, node distance=15em] (LR);
460    \coordinate[left of=PabloCompiler, node distance=15em] (LL);
461    \path [separator] (LL) -- (PabloCompiler);
462    \path [separator] (PabloCompiler) -- (LR);   
463   
464    \coordinate[right of=LLVMCompiler, node distance=15em] (OR);
465    \coordinate[left of=LLVMCompiler, node distance=15em] (OL);
466    \path [separator] (OL) -- (LLVMCompiler);
467    \path [separator] (LLVMCompiler) -- (OR);       
468   
469    % Seperator text
470    \node [draw=none,anchor=west] at ($(SL)!0.5!(PL)$) {1)~\RegularExpression{} AST};
471    \node [draw=none,anchor=west] at ($(PL)!0.5!(LL)$) {2)~\Pablo{}};
472    \node [draw=none,anchor=west] at ($(LL)!0.5!(OL)$) {3)~LLVM};
473
474    \begin{pgfonlayer}{phases}
475        \path[fill=green!20,path fading=fade down, draw=none] (SL) rectangle (PR);
476        \path[fill=blue!20,path fading=fade down, draw=none] (PL) rectangle (LR);
477        \path[fill=red!20,path fading=fade down, draw=none] (LL) rectangle (OR);
478    \end{pgfonlayer}
479
480\end{tikzpicture}
481
482\end{center}
483\end{frame}
484
485\begin{frame}[fragile]
486\frametitle{Execution Architecture}
487
488\begin{center}
489
490\pgfdeclarelayer{threads}
491\pgfdeclarelayer{components}
492\pgfsetlayers{threads,main}
493
494\tikzstyle{block} = [rectangle, draw, fill=white, text width=12em, text centered, minimum height=1.75em, thick, font=\ttfamily\bfseries, node distance=3.5em]
495\tikzstyle{line} = [draw, ->, line width=1.4pt]
496\tikzstyle{separator} = [draw, line width=0.125em, dashed]
497\tikzstyle{thread} = [rectangle, corners=rounded, draw, text width=15em, text centered, minimum height=1.75em, thick, font=\ttfamily\bfseries, node distance=3.5em]
498\tikzset{block/.append style={execute at begin node=\footnotesize}}   
499\begin{tikzpicture}[node distance=3cm, auto, >=stealth]
500
501    % Place nodes
502    \node [draw=none] (InputData) {Streaming Input Data};
503    \node [block, below of=InputData] (S2P) {Transposition};
504    \node [block, below of=S2P] (RequiredStreamsGenerator) {Required Streams Generator};
505    \node [block, below of=RequiredStreamsGenerator] (JITFunction) {Dynamic Matcher};
506    \node [block, right of=JITFunction, node distance=16em] (NamedPropertyLibrary) {Named Property Library};
507    \node [block, below of=JITFunction] (MatchScanner) {Match Scanner};
508    \node [draw=none, below of=MatchScanner, node distance=3.5em] (OutputResult) {Streaming Output Result};
509   
510    % Draw edges
511    \path [line] (InputData) -- (S2P);
512    \path [line] (S2P) -- (RequiredStreamsGenerator);
513    \path [line] (RequiredStreamsGenerator) -- (JITFunction);
514    \path [line] (NamedPropertyLibrary) -- (JITFunction);
515    \path [line] (JITFunction) -- (MatchScanner);
516    \path [line] (MatchScanner) -- (OutputResult);
517   
518    \begin{pgfonlayer}{threads}
519        \path (S2P.north west)+(-.1,.5) node (a) {};
520        \path (S2P.south east)+(+1.7,-0.1) node (b) {};
521        \path[fill=green!20,rounded corners, draw=green, solid] (a) rectangle (b);
522        \node [draw=none,above=-0.04cm of S2P.north east] (t1) {\small Transposition Thread};
523
524        \path (RequiredStreamsGenerator.north west)+(-.1,.38) node (a) {};
525        \path (RequiredStreamsGenerator.south east)+(+2,-0.1) node (b) {};
526        \path[fill=blue!20,rounded corners, draw=blue, solid] (a) rectangle (b);
527        \node [draw=none,above=-0.04cm of RequiredStreamsGenerator.north east] (t1) {\small Stream Generator Thread};
528
529        \path (JITFunction.north west)+(-.1,.38) node (a) {};
530        \path (NamedPropertyLibrary.east |- MatchScanner.south)+(+.1,-0.1) node (b) {};
531        \path[fill=red!20,rounded corners, draw=red, solid] (a) rectangle (b);
532        \node [draw=none,above=-0.04cm of JITFunction.north east] (t1) {\small Matcher Thread};
533
534    \end{pgfonlayer}{threads}
535\end{tikzpicture}
536
537\end{center}
538\end{frame}
539
540
541\begin{frame}
542\frametitle{Performance Study Set-up}
543\begin{itemize}
544\item Search for lines containing characters in computed sets.
545\item Set expressions combine general category and script properties.
546\item Set intersection, union, difference, negation.
547\item Cover all general categories and scripts.
548\item 246 test expressions in all.
549\item Evaluate icgrep 1.0 vs. ugrep541 and pcre2grep.
550\item Measure performance against a large collection of Wikimedia
551  documents.
552\end{itemize}
553\end{frame}
554
555\begin{frame}
556\frametitle{Performance Comparison}
557\begin{center}
558\pgfplotstableread[col sep = comma]{data/icgrep-scatter.csv}\icgrep
559\pgfplotstableread[col sep = comma]{data/ugrep541-scatter.csv}\ugrep
560\pgfplotstableread[col sep = comma]{data/pcre2grep-scatter.csv}\pcre
561
562\begin{tikzpicture}
563\begin{axis}[ 
564grid=both,
565x tick label style={/pgf/number format/1000 sep=},
566ylabel={CPU Cycles Per Byte},
567xlabel={Percentage of Matching Lines},
568minor y tick num={1},
569xmax=100,
570height=0.5\textwidth,
571legend style={at={(1.05,0.5)},
572anchor=west,legend columns=1,
573align=left,draw=none,column sep=2ex}
574]
575\addplot+[no markers,line width=2pt,color=blue!60,solid] table {\icgrep};
576\addplot+[no markers,line width=2pt,color=red!60,solid] table {\ugrep};
577\addplot+[no markers,line width=2pt,color=brown!60,solid] table {\pcre};
578\legend{icgrep1.0,ugrep541,pcre2grep}
579\end{axis}
580
581
582
583
584\end{tikzpicture}
585\end{center}
586%\caption{Matching Performance for Simple Property Expressions}\label{fig:property_test}
587\end{frame}
588
589\begin{frame}
590\frametitle{Performance Comparison (Multithread)}
591
592\begin{itemize}
593\item Pipeline parallelism
594\item Up to 40\% speedup by hiding transposition and required stream generation.
595\item Combining the AVX2 ISA with multithreading gives and average overall 61\%
596\item Future work
597\begin{itemize}
598\item Pipeline + data parallelism
599\item AVX-512 ISA
600\end{itemize}
601\end{itemize}
602\end{frame}
603
604\section{Conclusion}
605\begin{frame}[fragile]
606\frametitle{Conclusion}
607\begin{block}<+->{Results}
608
609\begin{itemize}%[<+->]
610\item Bitwise data parallelism is well-suited
611to Unicode regular expression matching requirements.
612
613\item Dramatic speed-up over sequential methods: often 10X or better.
614
615\item Performance improves as SIMD processor width increases.
616
617\item Further performance enhancement with multithreading: pipeline,
618task and/or coarse-grained data parallelism.
619
620\end{itemize}
621\end{block}
622\begin{block}<+->{icGrep 1.0}
623
624\begin{itemize}%[<+->]
625\item Complete modern grep with full Unicode Level 1 support.
626\item Open source: {\tt
627    http://parabix.costar.sfu.ca/wiki/ICgrep}
628\item Many options for teaching and research.
629\item Download it, use it and give us your feedback!
630\end{itemize}
631\end{block}
632
633\end{frame}
634
635
636\end{document}
Note: See TracBrowser for help on using the repository browser.