source: docs/IUC39/iuc39pres.tex @ 4855

Last change on this file since 4855 was 4855, checked in by cameron, 3 years ago

IUC39 presentation

File size: 24.8 KB
Line 
1\documentclass{beamer}
2\usepackage{default}
3\usepackage{comment}
4\usepackage{color}
5\usepackage{subfigure}
6\usepackage[T1]{fontenc} 
7\usepackage[utf8]{inputenc}
8\usepackage{CJK}
9\usepackage{verbatim}
10\usepackage{colortbl}
11\usepackage{tikz}
12\usetikzlibrary{shapes,positioning,arrows,calc,fadings}
13\usepackage{pgfplots}
14\usepackage{pgfplotstable}
15%\pgfplotsset{compat=1.10}
16
17\usepackage{bbding}
18
19\newcommand{\icGrep}[1]{icGrep}
20
21\def\RegularExpression{RegEx}
22\def\Pablo{Parabix}
23\def\CodeUnit{Code Unit}
24\def\REParser{\RegularExpression{} Parser}
25\def\CodeUnitCompiler{\CodeUnit{} Compiler}
26\def\RegularExpressionCompiler{\RegularExpression{} Compiler}
27\def\PabloCompiler{\Pablo{} Compiler}
28
29\usetheme{Madrid}
30
31
32\title[International Unicode Conference 39]{Performance Matters---A New Algorithmic Approach for Fast Unicode Regular Expression Processing}
33\author[Rob Cameron]{Robert D. Cameron}
34\institute[SFU]{School of Computing Science\\
35Simon Fraser University}
36\date{October 28, 2015}
37
38\begin{document}
39
40\begin{frame}
41\titlepage
42\end{frame}
43
44
45%\begin{frame}
46%\frametitle{Outline}
47%\tableofcontents%[pausesections]
48%\end{frame}
49
50\section{Motivation}
51\begin{frame}[fragile]
52\frametitle{Overview}
53\begin{block}<+->{The Parabix Regular Expression Project}
54
55\begin{itemize}
56\item Goal: building a modern, Unicode-compliant, ultra-fast, regular expression engine and
57  application framework.
58\item New algorithmic approach: bitwise data parallel matching.
59\item Performance target: Gigabyte/second regexp processing.
60\item Status: icgrep 1.0 is a full Unicode Level 1 grep replacement.
61\item Current work: Unicode Level 2 Regular Expression Support
62\item Longer-term: high-level application framework.
63\end{itemize}
64\end{block}
65\end{frame}
66
67
68\begin{frame}[fragile]
69\frametitle{UTS \#18 - Unicode Regular Expressions}
70\begin{block}<+->{Unicode Level 1}
71\begin{itemize}
72\item General category (e.g., \verb:\p{Lu}:), script (e.g. \verb:\p{Greek}:), and other core Unicode properties.
73\item Simple Unicode line breaks, word boundaries.
74\item Set operations, e.g., \verb:[\p{Greek}&&\p{Lu}]:
75\item Simple Unicode case-insensitive matching.
76\end{itemize}
77\end{block}
78\begin{block}<+->{Unicode Level 2}
79\begin{itemize}
80\item Grapheme clusters and grapheme cluster mode.
81\item Canonical equivalents.
82\item Name property with regexp values \verb:\p{name=/AIRPLANE/}:
83\item Broad set of Unicode general, numeric,  identifier, case, normalization,
84  shaping, bidirectional, CJK and  other properties.
85\end{itemize}
86\end{block}
87\end{frame}
88
89
90\begin{frame}
91\frametitle{Performance Matters - Big Text}
92\begin{block}<+->{Text Analytics}
93\begin{itemize}
94\item 
95Text Analytics applications: very large textual data
96sets.
97\item Gigabyte to Terabyte scale.
98\item Regular expressions are often used in these applications.
99\item If performance were better, regular expressions could be applied
100  more widely.
101\end{itemize}
102\end{block}
103\begin{block}<+->{IBM System T Study}
104\begin{itemize}
105\item 
106Five customer workloads studied
107\item
108Four of five workloads dominated by regexp processing.
109\item 
11060\%-75\% of total execution time.
111\item
112Polig {\it et al.} 
113Giving text analytics a boost. {\em IEEE Micro}, {\bf 34}(4):6--14,
1142014.
115\end{itemize}
116\end{block}
117\end{frame}
118\begin{frame}[fragile]
119\frametitle{Performance Matters - Catastrophic Backtracking}
120\begin{block}<+->{Nondeterministic Regular Expression Features}
121\begin{itemize}
122\item Ambiguity: regexes may match strings in multiple ways.
123\item Ex: match(\verb:^(\p{L})+t(\p{L})+$:, ``\verb:statistics:'')
124\item (\verb:s:, \verb:atistics:)(\verb:sta:, \verb:istics:), or
125  (\verb:statis:, \verb:ics:)
126\item Backtracking engines: potential runaway behaviour.
127\item Procedural workarounds:  \verb:^(\p{L})++t(\p{L})+$:  or \verb:^(?>(\p{L})+?)t(\p{L})+$:
128\item Declarative nature of regexps diminished.
129\end{itemize}
130\end{block}
131
132\begin{block}<+->{Web Site Security}
133\begin{itemize}
134\item Regular expressions may be used in web services.
135\item Malicious agents may inject ill-formed regular expressions.
136\item Web sites may be tied up (denial-of-service) or compromised.
137\end{itemize}
138\end{block}
139\end{frame}
140
141\begin{frame}[fragile]
142
143\frametitle{Much Better Performance is Possible}
144Unicode regular expression matching can be ultra-fast and robust.
145
146\begin{block}<+->{Example: email regex with Unicode properties}
147\begin{itemize}
148\item 
149\verb:([^\p{Z}<]+@[\p{L}\p{M}\p{N}.-]+\.:
150\verb:      (\p{L}\p{M}*){2,6})(>|\p{Z}|$):
151\item Compare time in CPU cycles per byte for 4 grep programs.
152\item Data source: 620 MB Wikibooks document set (15 languages)
153\item Processor: Intel Core i7-2600 CPU @ 3.40GHz
154\end{itemize}
155\end{block}
156
157%\item ugrep541: 549.5 cycles/byte
158%\item pcre2grep: 281.7 cycles/byte
159%\item icgrep4750: 4.2 cycles/byte
160%\item Speedup: 67X-130X reaches 810 MB/s
161\begin{block}<+->{Performance Results}
162\begin{center}
163\begin{tabular}{|c|r|r|r|r|} \hline
164Program & Elapsed sec & Cycles/Byte & Megabytes/sec \\ \hline
165ugrep561 &      180.7   & 1024.0 &      3.60 \\
166pcre2grep & 144.0       & 815.5 & 4.51  \\
167grep210  -P & 87.3      & 498.2 & 7.45 \\
168icgrep4750 &1.06 &      6.05 & 613.0 \\ \hline
169\end{tabular}
170\end{center}
171\end{block}
172\end{frame}
173
174
175\section{Bitwise Data Parallel Regular Expression Matching}
176\begin{frame}
177\frametitle{How It Works}
178\begin{center}
179\Huge Bitwise Data Parallel Regular Expression Matching
180\end{center}
181\end{frame}
182
183\begin{frame}
184\frametitle{Beyond Byte-At-A-Time}
185\begin{itemize}
186\item Traditional regular expression technology processes one code
187  unit at a time using DFA, NFA or backtracking implementations.
188\item Instead consider a bitwise data parallel approach.
189\item Byte-oriented data is first transformed to 8 parallel bit
190  streams (Parabix transform).
191\item Bit stream $j$ consists of bit $j$ of each byte.
192\item Load 128-bit SIMD registers to process 128 positions at a time
193in bitwise data parallel fashion (SSE2, ARM Neon, ...).
194\item Or use 256-bit AVX2 registers of newer Intel processors.
195\item Process using bitwise logic, shifting and addition.
196\item Parabix methods have previously been used to accelerate
197Unicode transcoding and XML parsing.
198\end{itemize}
199\end{frame}
200
201
202\begin{frame}
203\frametitle{Unbounded Stream Abstraction}
204\begin{itemize}
205\item Program operations as if {\em all positions in the file are to
206    be processed simultaneously}.
207\item Unbounded bitwise parallelism.
208\item Pablo compiler technology maps to block-by-block processing.
209\item Information flows between blocks using carry bits.
210\item LLVM compiler infrastructure for Just-in-Time compilation.
211\item Custom LLVM improvements further accelerate processing.
212\end{itemize}
213\end{frame}
214
215\begin{frame}[fragile]
216\frametitle{Marker Streams}
217\begin{itemize}
218\item Marker stream $M_i$ indicates the positions that are reachable after item $i$ in the regular expression.
219\item Each marker stream $M_i$ has one bit for every input byte in the input file.
220\item $M_i[j] = 1$ if and only if a match to the regular expression up to and
221including item $i$ in the expression occurs at position $j-1$ in the input stream.
222\item Conceptually, marker streams are computed in parallel for all positions in the file
223at once (bitwise data parallelism).
224\item In practice, marker streams are computed block-by-block, where the
225block size is the size of a SIMD register in bits.
226\end{itemize}
227\end{frame}
228
229\begin{frame}[fragile]
230\frametitle{Marker Stream Example}
231\begin{itemize}[<+->]
232\item Consider matching regular expression \verb`a[0-9]*[z9]` against the input text below.
233\item<2->$M_1$ marks positions after occurrences of \verb:a:.
234\item<3->$M_2$ marks positions after occurrences of \verb:a[0-9]*:.
235\item<4->$M_3$ marks positions after occurrences of \verb:a[0-9]*[z9]:.
236\end{itemize}
237
238\begin{tabular}{cr}\\
239input data  & \verb`a453z--b3z--az--a12949z--ca22z7--`\\
240\onslide<2->$M_1$ & \verb`.1...........1...1.........1.....`\\
241\onslide<3->$M_2$ & \verb`.1111........1...111111....111...`\\
242\onslide<4->$M_3$ & \verb`.....1........1.....1.11......1..`
243\end{tabular}
244\end{frame}
245
246
247\begin{frame}[fragile]
248\frametitle{Character Class Formation}
249\begin{itemize}
250\item Combining 8 bits of a code unit gives a character class stream.
251\item CCC({\tt cc\_a = [a]})
252\item 
253\begin{semiverbatim}
254temp1 = (bit[1] &~ bit[0])
255temp2 = (bit[2] &~ bit[3])
256temp3 = (temp1 & temp2)
257temp4 = (bit[4] | bit[5])
258temp5 = (bit[7] &~ bit[6])
259temp6 = (temp5 &~ temp4)
260cc\_a = (temp3 & temp6)
261\end{semiverbatim}
262\end{itemize}
263
264\end{frame}
265
266\begin{frame}[fragile]
267\frametitle{Character Class Ranges}
268\begin{itemize}
269\item Ranges of characters are often very simple to compute.
270\item CCC({\tt cc\_0\_9 = [0-9]})
271\item 
272\begin{semiverbatim}
273temp7 = (bit[0] | bit[1])
274temp8 = (bit[2] & bit[3])
275temp9 = (temp8 &~ temp7)
276temp10 = (bit[5] | bit[6])
277temp11 = (bit[4] & temp10)
278cc\_0\_9 = (temp9 &~ temp11)
279\end{semiverbatim}
280\end{itemize}
281\end{frame}
282
283
284\begin{frame}[fragile]
285\frametitle{Character Class Common Subexpressions}
286\begin{itemize}
287\item Multiple definitions use common subexpressions.
288\item CCC({\tt cc\_z9 = [z9]})
289\item 
290\begin{semiverbatim}
291temp12 = (bit[4] &~ bit[5])
292temp13 = (temp12 & temp5)
293temp14 = (temp9 & temp13)
294temp15 = (temp1 & temp8)
295temp16 = (bit[6] &~ bit[7])
296temp17 = (temp12 & temp16)
297temp18 = (temp15 & temp17)
298cc\_z9 = (temp14 | temp18)
299\end{semiverbatim}
300\end{itemize}
301\end{frame}
302
303\begin{frame}[fragile]
304\frametitle{Matching Character Class Repetitions with MatchStar}
305\begin{itemize}[<+->]
306\item<2->$\text{MatchStar}(M, C) = (((M \wedge C) + C)  \oplus C) \vee M$
307\item<3->Consider $M_2 = \text{MatchStar}(M_1, C)$
308\item<4->Use addition to scan each marker through the class.
309\item<5->Bits that change represent matches.
310\item<6->We also have matches at start positions in $M_1$.
311\end{itemize}
312
313\begin{tabular}{cr}\\
314\onslide<3->input data  & \verb`a453z--b3z--az--a12949z--ca22z7--`\\
315\onslide<3->$M_1$ & \verb`.1...........1...1.........1.....`\\
316\onslide<3->$C = \text{\tt [0-9]}$ & \verb`.111....1........11111.....11.1..`\\
317\onslide<4->$T_0 = M_1 \wedge C$ & \verb`.1...............1.........1.....`\\
318\onslide<4->$T_1 = T_0 + C$ & \verb`....1...1.............1......11..`\\
319\onslide<5->$T_2 = T_1 \oplus C$ & \verb`.1111............111111....111...`\\
320\onslide<6->$M_2 = T_2 \vee M_1$ & \verb`.1111........1...111111....111...`
321\end{tabular}
322\end{frame}
323
324
325\begin{frame}[fragile]
326\frametitle{Matching Equations}
327The rules for bitwise data parallel regular expression matching can
328be summarized by these equations.
329\begin{eqnarray*}
330\mbox{Match}(m, C) & = &  \mbox{Advance}(\mbox{CharClass}(C) \wedge m) \\
331\mbox{Match}(m, RS) & = &  \mbox{Match}(\mbox{Match}(m, R), S) \\
332\mbox{Match}(m, R|S) & = &  \mbox{Match}(m, R) \vee \mbox{Match}(m, S)) \\
333\mbox{Match}(m, C*) & = &  \mbox{MatchStar}(m, \mbox{CharClass}(C)) \\
334\mbox{Match}(m, R*) & = &  m \vee \mbox{Match}(\mbox{Match}(m, R), R*) \\
335\mbox{Advance}(m) & = & m+m \\
336\mbox{MatchStar}(m, C) & = & (((m \wedge C) + C)  \oplus C) \vee m
337\end{eqnarray*}
338The recursive equation is implemented with a while loop.
339\end{frame}
340
341\section{Architecture}
342\begin{frame}
343\frametitle{Putting It All Together}
344\begin{center}
345\Huge Architecture
346\end{center}
347\end{frame}
348
349\begin{frame}[fragile]
350\frametitle{Compilation Architecture}
351\begin{center}
352
353\pgfdeclarelayer{phases}
354\pgfsetlayers{phases,main}
355\tikzfading[name=fade down,
356  top color=transparent!0, bottom color=transparent!100]
357
358% Define block styles
359%\tikzstyle{decision} = [diamond, shape aspect=2, rotate=30, draw, text width=4.5em, text badly centered, inner sep=0pt, thick]
360\tikzstyle{block} = [rectangle, draw, fill=white, text width=12em, text centered, minimum height=1.5em, thick, font=\ttfamily\bfseries, node distance=2.5em]
361\tikzstyle{line} = [draw, ->, line width=1.4pt]
362\tikzstyle{separator} = [draw, line width=0.125em, dashed]
363\tikzset{block/.append style={execute at begin node=\footnotesize}}   
364\begin{tikzpicture}[node distance=3cm, auto, >=stealth]
365
366    % Place nodes
367    \node [draw=none] (RE) {\RegularExpression{}};
368    \node [block, below of=RE] (REParser) {\REParser{}};
369    \node [block, below of=REParser] (RETransform) {\RegularExpression{} Transformations};   
370%    \coordinate[below of=RETransform, node distance=3em] (Point);   
371%    \node [block, left of=Point, node distance=10em] (CUCompiler) {\CodeUnitCompiler{}};
372%    \node [block, right of=Point, node distance=10em] (RECompiler) {\RegularExpressionCompiler{}};
373    \node [block, below of=RETransform] (RECompiler) {\RegularExpressionCompiler{}};
374    \node [block, below of=RECompiler] (PabloTransform) {\Pablo{} Transformations};   
375    \node [block, below of=PabloTransform] (PabloCompiler) {\PabloCompiler{}};
376    \node [block, below of=PabloCompiler] (LLVMCompiler) {LLVM Compiler};
377    \node [draw=none, below of=LLVMCompiler, node distance=3.5em] (Matcher) {Dynamically-Generated Match Function};
378   
379    % Draw edges
380    \path [line] (RE) -- (REParser);
381    %\path [line] (RE) -- (PropertyExtraction);
382    \path [line] (REParser) -- (RETransform);
383%    \path [line] (RETransform) -| (CUCompiler);
384%    \path [line] (RETransform) -| (RECompiler);
385%    \path [line] (CUCompiler) |- (PabloTransform);
386 %   \path [line] (RECompiler) |- (PabloTransform);
387    \path [line] (RETransform) -- (RECompiler);
388    \path [line] (RECompiler) -- (PabloTransform);
389    \path [line] (PabloTransform) -- (PabloCompiler);
390    \path [line] (PabloCompiler) -- (LLVMCompiler);
391    \path [line] (LLVMCompiler) -- (Matcher);
392   
393    % Draw layers
394    \coordinate[right of=REParser, node distance=15em] (SR);
395    \coordinate[left of=REParser, node distance=15em] (SL);
396    \path [separator] (SL) -- (REParser);
397    \path [separator] (REParser) -- (SR);
398   
399    \coordinate[left of=RECompiler, node distance=15em] (PL);
400    \coordinate[right of=RECompiler, node distance=15em] (PR);
401    \path [separator] (PL) -- (RECompiler);
402    \path [separator] (RECompiler) -- (PR);
403    %\path [separator] (PL) -- (CUCompiler);
404%    \path [separator] (CUCompiler) -- (RECompiler);
405    %\path [separator] (RECompiler) -- (PR);
406
407    \coordinate[right of=PabloCompiler, node distance=15em] (LR);
408    \coordinate[left of=PabloCompiler, node distance=15em] (LL);
409    \path [separator] (LL) -- (PabloCompiler);
410    \path [separator] (PabloCompiler) -- (LR);   
411   
412    \coordinate[right of=LLVMCompiler, node distance=15em] (OR);
413    \coordinate[left of=LLVMCompiler, node distance=15em] (OL);
414    \path [separator] (OL) -- (LLVMCompiler);
415    \path [separator] (LLVMCompiler) -- (OR);       
416   
417    % Seperator text
418    \node [draw=none,anchor=west] at ($(SL)!0.5!(PL)$) {1)~\RegularExpression{} AST};
419    \node [draw=none,anchor=west] at ($(PL)!0.5!(LL)$) {2)~\Pablo{}};
420    \node [draw=none,anchor=west] at ($(LL)!0.5!(OL)$) {3)~LLVM};
421
422    \begin{pgfonlayer}{phases}
423        \path[fill=green!20,path fading=fade down, draw=none] (SL) rectangle (PR);
424        \path[fill=blue!20,path fading=fade down, draw=none] (PL) rectangle (LR);
425        \path[fill=red!20,path fading=fade down, draw=none] (LL) rectangle (OR);
426    \end{pgfonlayer}
427
428\end{tikzpicture}
429
430\end{center}
431\end{frame}
432
433\begin{frame}[fragile]
434\frametitle{Execution Architecture}
435
436\begin{center}
437
438\pgfdeclarelayer{threads}
439\pgfdeclarelayer{components}
440\pgfsetlayers{threads,main}
441
442\tikzstyle{block} = [rectangle, draw, fill=white, text width=12em, text centered, minimum height=1.75em, thick, font=\ttfamily\bfseries, node distance=3.5em]
443\tikzstyle{line} = [draw, ->, line width=1.4pt]
444\tikzstyle{separator} = [draw, line width=0.125em, dashed]
445\tikzstyle{thread} = [rectangle, corners=rounded, draw, text width=15em, text centered, minimum height=1.75em, thick, font=\ttfamily\bfseries, node distance=3.5em]
446\tikzset{block/.append style={execute at begin node=\footnotesize}}   
447\begin{tikzpicture}[node distance=3cm, auto, >=stealth]
448
449    % Place nodes
450    \node [draw=none] (InputData) {Streaming Input Data};
451    \node [block, below of=InputData] (S2P) {Transposition};
452    \node [block, below of=S2P] (RequiredStreamsGenerator) {Required Streams Generator};
453    \node [block, below of=RequiredStreamsGenerator] (JITFunction) {Dynamic Matcher};
454    \node [block, right of=JITFunction, node distance=16em] (NamedPropertyLibrary) {Named Property Library};
455    \node [block, below of=JITFunction] (MatchScanner) {Match Scanner};
456    \node [draw=none, below of=MatchScanner, node distance=3.5em] (OutputResult) {Streaming Output Result};
457   
458    % Draw edges
459    \path [line] (InputData) -- (S2P);
460    \path [line] (S2P) -- (RequiredStreamsGenerator);
461    \path [line] (RequiredStreamsGenerator) -- (JITFunction);
462    \path [line] (NamedPropertyLibrary) -- (JITFunction);
463    \path [line] (JITFunction) -- (MatchScanner);
464    \path [line] (MatchScanner) -- (OutputResult);
465   
466    \begin{pgfonlayer}{threads}
467        \path (S2P.north west)+(-.1,.5) node (a) {};
468        \path (S2P.south east)+(+1.7,-0.1) node (b) {};
469        \path[fill=green!20,rounded corners, draw=green, solid] (a) rectangle (b);
470        \node [draw=none,above=-0.04cm of S2P.north east] (t1) {};
471
472        \path (RequiredStreamsGenerator.north west)+(-.1,.38) node (a) {};
473        \path (RequiredStreamsGenerator.south east)+(+2,-0.1) node (b) {};
474        \path[fill=blue!20,rounded corners, draw=blue, solid] (a) rectangle (b);
475        \node [draw=none,above=-0.04cm of RequiredStreamsGenerator.north east] (t1) {};
476
477        \path (JITFunction.north west)+(-.1,.38) node (a) {};
478        \path (NamedPropertyLibrary.east |- MatchScanner.south)+(+.1,-0.1) node (b) {};
479        \path[fill=red!20,rounded corners, draw=red, solid] (a) rectangle (b);
480        \node [draw=none,above=-0.04cm of JITFunction.north east] (t1) {};
481
482    \end{pgfonlayer}{threads}
483\end{tikzpicture}
484
485\end{center}
486\end{frame}
487
488\section{Unicode Regular Expression Matching with Parabix}
489\begin{frame}
490\frametitle{Unicode}
491\begin{center}
492\Huge UTF-8 Regular Expression Matching
493\end{center}
494\end{frame}
495
496\begin{frame}[fragile]
497\frametitle{UTF-8 Byte Classification and Validation}
498
499\begin{eqnarray*}
500\mbox{ASCII} & = & \mbox{CharClass}(\verb:[\x00-\x7F]:) \\
501\mbox{Prefix} & = & \mbox{CharClass}(\verb:[\xC2-\F4]:) \\
502\mbox{Pfx3or4} & = & \mbox{CharClass}(\verb:[\xE0-\xF4]:) \\
503\mbox{Prefix4} & = & \mbox{CharClass}(\verb:[\xF0-\xF4]:) \\
504\mbox{Suffix} & = & \mbox{CharClass}(\verb:[\x80-\xBF]:) \\
505\mbox{Scope} & = & \mbox{Advance}(\mbox{Prefix}) \vee \mbox{Advance}(\mbox{Pfx3or4},2) \vee \mbox{Advance}(\mbox{Prefix4}, 3) \\
506\mbox{Mismatch} & = & \mbox{Scope} \oplus \mbox{Suffix} \\
507\mbox{Initial} & = & \mbox{ASCII} \vee \mbox{Prefix} \\
508\mbox{NonFinal} & = & \mbox{Prefix} \vee \mbox{Advance}(\mbox{Pfx3or4}) \vee \mbox{Advance}(\mbox{Prefix4}, 2)
509\end{eqnarray*}
510\end{frame}
511
512
513\begin{frame}[fragile]
514\frametitle{UTF-8 Matching Operations}
515
516Advance and Matchstar can both be adapted for UTF-8 matching.
517Let U8Class($U$) be the stream marking the {\em final} UTF-8 byte position of
518occurrences of members of the class.
519
520\begin{eqnarray*}
521\mbox{Match}(m, U) & = & \mbox{Advance}(\mbox{ScanThru}(m, \mbox{NonFinal}) \wedge \mbox{U8Class}(U)) \\
522\mbox{Match}(m, U*) & = & \mbox{MatchStar}(m, \mbox{U8Class}(U) \vee \mbox{NonFinal}) \wedge \mbox{Initial}\\
523\mbox{ScanThru}(m, C) & = & (m + C) \wedge \neg C\\
524\end{eqnarray*}
525\end{frame}
526
527\begin{frame}[fragile]
528\frametitle{Simple UTF-8 Matching Operations}
529
530
531\begin{CJK}{UTF8}{gbsn}
532\begin{itemize}
533\item Search for 䜠奜 in UTF-8 text.
534\item Let ni3 represent the three byte sequence for 䜠.
535\item Let hao represent the three byte sequence for 奜。
536\item Let men represent the three byte sequence for 们。
537\end{itemize}
538\end{CJK}
539\begin{center}
540%\begin{tabular}{cr}\\
541\begin{tabular}{c@{\hspace{1em}}r}\\
542input data                                                         & \verb`ni3hao(Hello),ni3men(You),`\\
543$\text{CC}_{\text{ni3}}$                                           & \verb`..1.............1.........`\\
544$\text{CC}_{\text{hao}}$                                           & \verb`.....1....................`\\
545$m_0 = \mbox{\rm Initial}$                                         & \verb`1..1..111111111..1..111111`\\
546NonFinal                                                           & \verb`11.11.........11.11.......`\\
547$t_1 = \text{ScanThru}(m_0, \text{NonFinal})$                      & \verb`..1..111111111..1..1111111`\\
548$m_1 = \text{Advance}(t_1 \land \text{CC}_{\text{ni3}})$           & \verb`...1.............1........`\\
549$t_2 = \text{ScanThru}(m_1, \text{NonFinal})$                      & \verb`.....1.............1......`\\
550$m_2 = \text{Advance}(t_2 \land CC_{\text{hao}})$                  & \verb`......1...................`
551\end{tabular}
552\end{center}
553
554\end{frame}
555
556
557\begin{frame}[fragile]
558\frametitle{If Hierarchies}
559
560\begin{itemize}
561\item Unicode property classes may contain many thousands of codepoints.
562\item Evaluating all required equations may be very expensive.
563\item However, any 128-byte segment of input will involve only a few codepoint ranges.
564\item Evaluation of property classes is embedded in if-hierarchies of successively
565finer ranges.
566\item This techique greatly reduces the number of equations evaluated for each property.
567\end{itemize}
568
569\end{frame}
570
571\begin{frame}
572\frametitle{iCgrep 1.0}
573icGrep 1.0 provides full Unicode Level 1 support.
574\begin{itemize}
575\item RL1.1 Hex Notation
576\item RL1.2 Properties
577\begin{itemize}
578\item General\_Category
579\item Script and Script\_Extensions
580\item Alphabetic, Uppercase, Lowercase, White\_Space,
581  Noncharacter\_Code\_Point, Default\_Ignorable\_Code\_Point
582\end{itemize}
583\item RL1.3 Subtraction and Intersection
584\item RL1.4 Simple Word Boundaries
585\item RL 1.5 Simple Loose Matches
586\item RL1.6 Line Boundaries
587\item RL1.7 Supplementary Code Points
588\end{itemize}
589
590\end{frame}
591\begin{frame}[fragile]
592\frametitle{iCgrep-devel r4850}
593
594Current status of Unicode Level 2 support in icGrep.
595\begin{itemize}
596\item 2.1 Canonical Equivalents: TO DO (only in Grapheme Cluster Mode)
597\item 2.2 Extended Grapheme Clusters: \verb:\b{g}: implemented
598\item 2.2.1 Grapheme Cluster Mode: \verb:(?g): implemented
599\item 2.3 Default Word Boundaries: currently Level 1
600\item 2.4 Default Case Conversion: currently Level 1
601\item 2.5 Name Properties: implemented
602\item 2.6 Wildcards in Property Values: Regular Experession in Name Properties
603\item 2.7 Full Properties: Most Properties implemented
604
605\end{itemize}
606\end{frame}
607
608
609\begin{frame}[fragile]
610\frametitle{Emoji Regular Expressions!}
611\begin{itemize}
612\item Individual Emoji can be entered as characters in regular expressions.
613\item Or as hex codepoints.
614\item Even better: search by name pattern:
615\begin{itemize}
616\item Create a pattern for Emoji character names:
617  \verb`\bSMIL(E|ING)\b`
618\item Search documents for any character having the name pattern.
619\verb`icgrep '\N{\bSMIL(E|ING)\b}'`
620\end{itemize}
621\end{itemize}
622\end{frame}
623
624
625\section{Performance Study: Unicode Set Operations}
626\begin{frame}[fragile]
627\frametitle{}
628\begin{center}
629\huge Performance Study: Unicode Set Operations
630\end{center}
631\begin{center}
632\large \verb`[\p{Greek}&&\{lowercase}]`
633\end{center}
634
635\end{frame}
636
637
638\begin{frame}
639\frametitle{Performance Study Set-up}
640\begin{itemize}
641\item Search for lines containing characters in computed sets.
642\item Set expressions combine general category and script properties.
643\item Set intersection, union, difference, negation.
644\item Cover all general categories and scripts.
645\item 246 test expressions in all.
646\item Evaluate icgrep 1.0 vs. ugrep541 and pcre2grep.
647\item Measure performance against a large collection of Wikimedia
648  documents.
649\end{itemize}
650\end{frame}
651
652\begin{frame}
653\frametitle{Performance Comparison}
654\begin{center}
655\pgfplotstableread[col sep = comma]{data/icgrep-scatter.csv}\icgrep
656\pgfplotstableread[col sep = comma]{data/ugrep541-scatter.csv}\ugrep
657\pgfplotstableread[col sep = comma]{data/pcre2grep-scatter.csv}\pcre
658
659\begin{tikzpicture}
660\begin{axis}[
661grid=both,
662x tick label style={/pgf/number format/1000 sep=},
663ylabel={CPU Cycles Per Byte},
664xlabel={Percentage of Matching Lines},
665minor y tick num={1},
666xmax=100,
667height=0.5\textwidth,
668legend style={at={(1.05,0.5)},
669anchor=west,legend columns=1,
670align=left,draw=none,column sep=2ex}
671]
672\addplot+[no markers,line width=2pt,color=blue!60,solid] table {\icgrep};
673\addplot+[no markers,line width=2pt,color=red!60,solid] table {\ugrep};
674\addplot+[no markers,line width=2pt,color=brown!60,solid] table {\pcre};
675\legend{icgrep1.0,ugrep541,pcre2grep}
676\end{axis}
677
678
679\end{tikzpicture}
680\end{center}
681%\caption{Matching Performance for Simple Property Expressions}\label{fig:property_test}
682\end{frame}
683
684\section{Conclusion}
685\begin{frame}[fragile]
686\frametitle{Results}
687\begin{itemize}%[<+->]
688\item Regular expression matching using bitwise data parallelism is well-suited
689to Unicode regular expression matching requirements.
690
691\item Performance is ultra-fast and robust against all kinds of nondeterminstic
692regular expressions.
693
694\item Performance improves as SIMD processor width increases.
695
696\item Unicode level 1 requirements fully met.
697
698\item Unicode level 2 requirements partially met:
699\begin{itemize}
700\item grapheme cluster boundaries \verb:\b{g}:
701\item grapheme cluster mode: \verb:(?g):
702\item Unicode name expressions search \verb:\N{LATIN.*\b[A-G]\b}:
703\item Broad Unicode property support
704\end{itemize}
705
706\item Open source implementation available: {\tt http://parabix.costar.sfu.ca/wiki/ICgrep}
707\end{itemize}
708
709\end{frame}
710
711
712\end{document}
Note: See TracBrowser for help on using the repository browser.