source: docs/Working/re/Slides/pactslides.tex @ 4083

Last change on this file since 4083 was 4083, checked in by cameron, 5 years ago

Updates

File size: 17.8 KB
Line 
1\documentclass{beamer}
2\usepackage{default}
3\usepackage{color}
4\usepackage[latin1]{inputenc}
5\usepackage{verbatim}
6\usepackage{colortbl}
7\usepackage{graphicx}
8\usepackage{tikz}
9\usepackage{multirow}
10\usepackage{pgfplots}
11
12\usetheme{Warsaw}
13\setbeamertemplate{navigation symbols}{}
14
15\title[PACT 2014: Parabix grep]{Bitwise Data Parallelism in Regular Expression Matching}
16\author[Cameron, Shermer, Shriraman, Herdy, Lin, Hull, and Lin]{Rob Cameron, Tom Shermer, Arrvindh Shriraman, Ken Herdy, Dan Lin, Ben Hull, Meng Lin}
17\institute[CS@SFU]{School of Computing Science\\
18Simon Fraser University}
19\date{August 25, 2014}
20
21\begin{document}
22
23\begin{frame}
24\titlepage
25\end{frame}
26
27
28\begin{frame}
29\frametitle{Outline}
30\tableofcontents%[pausesections]
31\end{frame}
32
33\section{Introduction}
34
35\begin{frame}[fragile]
36\frametitle{Acceleration of Regular Expression Matching}
37
38\begin{block}<+->{Sequential Matching Problems}
39\begin{itemize}[<+->]
40\item Example: quickly find instances of \verb:(^|[ ])\p{Lu}\p{Ll}+[.!?]($|[ ]): in text.
41\item Three families of sequential algorithms: NFAs, DFAs or backtracking.
42\end{itemize}
43\end{block}
44\begin{block}<+->{Parallelization}
45\begin{itemize}
46\item Berkeley ParLab:
47\begin{itemize}
48\item FSMs are 13th dwarf: hardest to parallelize.
49\item Embarassingly sequential?
50\end{itemize}
51\item Parallel application of FSMs to multiple input streams (SIMD/GPU).
52\item Promising recent work: coalesced FSMs, principled speculation.
53\end{itemize}
54\end{block}
55\end{frame}
56
57
58\begin{frame}[fragile]
59\frametitle{Our Approach}
60
61\begin{itemize}[<+->]
62\item A fundamentally parallel approach in contrast to approaches
63that parallelize existing DFA or NFA algorithms.
64
65\item A new algorithm {\em family} for regular expression matching based
66on bitwise data parallelism.
67
68\item Builds on the Parabix methods that have been used for XML
69parsing and Unicode transcoding.
70
71\item Introduces the MatchStar primitive
72for simultaneous nondeterministic matching
73of character class repetitions using bitstream addition.
74
75\item Compilation technologies for regular expressions (new), character classes (existing),
76unbounded bitstreams (existing).
77
78\item Recent work: all compilers integrated together with LLVM for
79fully dynamic regular expression matching.
80\end{itemize}
81\end{frame}
82
83\begin{frame}[fragile]
84\frametitle{Bitwise Data Parallelism}
85
86\begin{itemize}[<+->]
87\item Parabix methods use a transform representation of text.
88\item Bitstreams are formed using one bit per input byte.
89\item Eight basis bit streams are defined for bit 0, bit 1, ... bit 7 of each byte.
90\item Perform bitwise processing with wide SIMD registers.
91\begin{itemize}
92\item Process 128 bytes at a time with SSE2, Neon, Altivec.
93\item Process 256 bytes at a time with AVX2.
94\end{itemize}
95\item Parabix transform is fast using SIMD pack operations.
96\item {\em But}: programmer view is of unbounded, arbitrary length bitstreams.
97\end{itemize}
98
99\end{frame}
100
101\begin{frame}[fragile]
102\frametitle{Impressive Results in Full Unicode Matching}
103
104\begin{itemize}%[<+->]
105\item \verb:(^|[ ])\p{Lu}\p{Ll}+[.!?]($|[ ]):
106\begin{itemize}
107\item Match against 110 MB Arabic wikimedia document.
108\item Find capitalized words at ends of sentences.
109\item Use Unicode upper/lower case categories.
110\end{itemize}
111\item \verb:egrep: 45,951,194,784 CPU cycles.
112\item \verb:pcregrep: 14,772,797,548 CPU cycles.
113\item \verb:icgrep: (Parabix) 653,530,064 CPU cycles.
114\item 20X acceleration over \verb:pcgregrep:, 70X over GNU \verb:egrep:.
115\end{itemize}
116\end{frame}
117
118
119\section{Parabix Technology}
120
121\begin{frame}
122\frametitle{Parallel Bit Streams: A Transform Representation of Text}
123\begin{itemize}
124\item Given a byte-oriented character stream $T$, e.g., ``{\tt Ab17;}''.
125\item Transpose to 8 parallel bit streams $b_0$, $b_1$, ..., $b_7$.
126\item Each stream $b_k$ comprises bit $k$ of each byte of $T$.
127\end{itemize}
128\begin{tabular}{|c|c|c|c|c|c|} \hline
129$T$ & A & b & 1 & 7 & ; \\ \hline
130ASCII & \alert<2>{0}\alert<3>{1}\alert<4>{0}00001 & \alert<2>{0}\alert<3>{1}\alert<4>{1}00010 & \alert<2>{0}\alert<3>{0}\alert<4>{1}10001 & \alert<2>{0}\alert<3>{0}\alert<4>{1}10111 & \alert<2>{0}\alert<3>{0}\alert<4>{1}11011 \pause \\ \hline
131$b_0$ & \alert<2>{0} & \alert<2>{0} & \alert<2>{0} & \alert<2>{0} & \alert<2>{0} \pause \\ \hline
132$b_1$ & \alert<3>{1} & \alert<3>{1} & \alert<3>{0} & \alert<3>{0} & \alert<3>{0} \pause \\ \hline
133$b_2$ & \alert<4>{0} & \alert<4>{1} & \alert<4>{1} & \alert<4>{1} & \alert<4>{1}  \\ \hline
134$b_3$ & 0 & 0 & 1 & 1 & 1 \\ \hline
135$b_4$ & 0 & 0 & 0 & 0 & 1 \\ \hline
136$b_5$ & 0 & 0 & 0 & 1 & 0 \\ \hline
137$b_6$ & 0 & 1 & 0 & 1 & 1 \\ \hline
138$b_7$ & 1 & 0 & 1 & 1 & 1 \\ \hline
139\end{tabular}
140\end{frame}
141
142\begin{frame}
143\frametitle{Parabix Programming}
144
145\begin{itemize}[<+->]
146\item Parabix programs written as unbounded bit stream operations.
147\item Unbounded bit streams considered as arbitrarily large integers.
148\item Fundamental operations: bitwise logic, bit-stream shifting and long-stream addition.
149\item Parabix tool chain has three components:
150\begin{itemize}
151\item Character Class Compiler (CCC) produces stream equations from character classes.
152\item Parallel Block Compiler (Pablo) converts unbounded stream programs to C++/SIMD.
153\item Portable SIMD library for multiple architectures.
154\end{itemize}
155\end{itemize}
156\end{frame}
157
158\begin{frame}[fragile]
159\frametitle{Character Class Formation}
160\begin{itemize}[<+->]
161\item Combining 8 bits of a code unit gives a character class stream.
162\item CCC({\tt cc\_a = [a]})
163\item 
164\begin{semiverbatim}
165temp1 = (bit[1] &~ bit[0])
166temp2 = (bit[2] &~ bit[3])
167temp3 = (temp1 & temp2)
168temp4 = (bit[4] | bit[5])
169temp5 = (bit[7] &~ bit[6])
170temp6 = (temp5 &~ temp4)
171cc\_a = (temp3 & temp6)
172\end{semiverbatim}
173\end{itemize}
174
175\end{frame}
176
177\begin{frame}[fragile]
178\frametitle{Character Class Ranges}
179\begin{itemize}%[<+->]
180\item Ranges of characters are often very simple to compute.
181\item CCC({\tt cc\_0\_9 = [0-9]})
182\item 
183\begin{semiverbatim}
184temp7 = (bit[0] | bit[1])
185temp8 = (bit[2] & bit[3])
186temp9 = (temp8 &~ temp7)
187temp10 = (bit[5] | bit[6])
188temp11 = (bit[4] & temp10)
189cc\_0\_9 = (temp9 &~ temp11)
190\end{semiverbatim}
191\end{itemize}
192\end{frame}
193
194
195\begin{frame}[fragile]
196\frametitle{Character Class Common Subexpressions}
197\begin{itemize}
198\item Multiple definitions use common subexpressions.
199\item CCC({\tt cc\_z9 = [z9]})
200\item 
201\begin{semiverbatim}
202temp12 = (bit[4] &~ bit[5])
203temp13 = (temp12 & temp5)
204temp14 = (temp9 & temp13)
205temp15 = (temp1 & temp8)
206temp16 = (bit[6] &~ bit[7])
207temp17 = (temp12 & temp16)
208temp18 = (temp15 & temp17)
209cc\_z9 = (temp14 | temp18)
210\end{semiverbatim}
211\end{itemize}
212\end{frame}
213
214\section{Regular Expression Matching with Parabix}
215
216\begin{frame}[fragile]
217\frametitle{Marker Streams}
218\begin{itemize}[<+->]
219\item Marker stream $M_i$ indicates the positions that are reachable after item $i$ in the regular expression.
220\item Each marker stream $M_i$ has one bit for every input byte in the input file.
221\item $M_i[j] = 1$ if and only if a match to the regular expression up to and
222including item $i$ in the expression occurs at position $j-1$ in the input stream.
223\item Conceptually, marker streams are computed in parallel for all positions in the file
224at once (bitwise data parallelism).
225\item In practice, marker streams are computed block-by-block, where the
226block size is the size of a SIMD register in bits.
227\end{itemize}
228\end{frame}
229
230\begin{frame}[fragile]
231\frametitle{Marker Stream Example}
232\begin{itemize}[<+->]
233\item Consider matching regular expression \verb`a[0-9]*[z9]` against the input text below.
234\item<2->$M_1$ marks positions after occurrences of \verb:a:.
235\item<3->$M_2$ marks positions after occurrences of \verb:a[0-9]*:.
236\item<4->$M_3$ marks positions after occurrences of \verb:a[0-9]*[z9]:.
237\end{itemize}
238
239\begin{tabular}{cr}\\
240input data  & \verb`a453z--b3z--az--a12949z--ca22z7--`\\
241\onslide<2->$M_1$ & \verb`.1...........1...1.........1.....`\\
242\onslide<3->$M_2$ & \verb`.1111........1...111111....111...`\\
243\onslide<4->$M_3$ & \verb`.....1........1.....1.11......1..`
244\end{tabular}
245\end{frame}
246
247\begin{frame}[fragile]
248\frametitle{Matching Character Class Repetitions with MatchStar}
249\begin{itemize}[<+->]
250\item<2->$\text{MatchStar}(M, C) = (((M \wedge C) + C)  \oplus C) \vee M$
251\item<3->Consider $M_2 = \text{MatchStar}(M_1, C)$
252\item<4->Use addition to scan each marker through the class.
253\item<5->Bits that change represent matches.
254\item<6->We also have matches at start positions in $M_1$.
255\end{itemize}
256
257\begin{tabular}{cr}\\
258\onslide<3->input data  & \verb`a453z--b3z--az--a12949z--ca22z7--`\\
259\onslide<3->$M_1$ & \verb`.1...........1...1.........1.....`\\
260\onslide<3->$C = \text{\tt [0-9]}$ & \verb`.111....1........11111.....11.1..`\\
261\onslide<4->$T_0 = M_1 \wedge C$ & \verb`.1...............1.........1.....`\\
262\onslide<4->$T_1 = T_0 + C$ & \verb`....1...1.............1......11..`\\
263\onslide<5->$T_2 = T_1 \oplus C$ & \verb`.1111............111111....111...`\\
264\onslide<6->$M_2 = T_2 \vee M_1$ & \verb`.1111........1...111111....111...`
265\end{tabular}
266\end{frame}
267
268
269\begin{frame}[fragile]
270\frametitle{Regular Expression Compilation}
271\begin{itemize}
272\item Our regular expression compiler produces unbounded Pablo code.
273\item RE\_compile(\verb`a[0-9]*[z9]`)
274\item
275\begin{semiverbatim}
276m0 = ~0
277m1 = pablo.Advance(m0 & cc\_a)
278m2 = pablo.MatchStar(m1, cc\_0\_9)
279m3 = pablo.Advance(m2, cc\_z9)
280\end{semiverbatim}
281\end{itemize}
282\end{frame}
283
284\begin{frame}[fragile]
285\frametitle{Alternations and Optional Terms}
286\begin{itemize}
287\item Most RE features are handled naturally.
288\item RE\_compile(\verb`a(b?|cd)`)
289\item
290\begin{semiverbatim}
291m0 = ~0
292m1 = pablo.Advance(m0 & cc\_a)
293m2 = pablo.MatchStar(m1, cc\_b)
294m3 = m1 | m2      # b is optional
295m4 = pablo.Advance(m1, cc\_c) 
296m5 = pablo.Advance(m2, cc\_d)
297m6 = m3 | m5      # two alternatives
298\end{semiverbatim}
299\end{itemize}
300\end{frame}
301
302
303\begin{frame}[fragile]
304\frametitle{Nested Repetitions Use While Loops}
305\begin{itemize}
306\item While loops are used for complex or nested repetitions.
307\item RE\_compile(\verb`(a[0-9]*[z9])*`)
308\item
309\begin{semiverbatim}
310m0 = ~0
311t = m0   # while test variable
312a = m0   # while result accumulator
313while t:
314   m1 = pablo.Advance(t & cc\_a)
315   m2 = pablo.MatchStar(m1, cc\_0\_9)
316   m3 = pablo.Advance(m2, cc\_z9)
317   t = m3 &~ a   # iterate only for new matches
318   a = a | m3
319\end{semiverbatim}
320\end{itemize}
321\end{frame}
322
323\section{Block-by-Block Processing}
324
325\begin{frame}[fragile]
326\frametitle{Pablo Compiler Compiles to Block-by-Block Form}
327\begin{block}<+->{Pablo Input}
328\begin{semiverbatim}
329m0 = ~0
330m1 = pablo.Advance(m0 & cc\_a)
331m2 = pablo.MatchStar(m1, cc\_0\_9)
332m3 = pablo.Advance(m2, cc\_z9)
333\end{semiverbatim}
334
335\end{block}
336\begin{block}<+->{Pablo Output Adds Carry Processing}
337\begin{semiverbatim}
338m0 = simd_not(simd<1>::constant<0>());
339cryQ.set(0, blkShl(blkAnd(m0, c\c_a), cryQ.get(0), m1);
340cryQ.set(1, blkMatchStar(m1, cc\_0\_9, cryQ.get(1), m2);
341cryQ.set(2, blkShl(blkAnd(m2, cc\_z9), cryQ.get(2), m3);
342cryQ.AdjustCarries(3);
343\end{semiverbatim}
344\end{block}
345\end{frame}
346
347
348\begin{frame}[fragile]
349\frametitle{Long-Stream Addition}
350\begin{itemize}[<+->]
351\item Processors limit addition to 64 bits.
352\item SIMD blocks may be 128, 256 or (soon) 512 bits.
353\item Addition traditionally requires sequential propagation of
354  carries between 64-bit fields.
355\item However, a parallel method is possible.
356\begin{itemize}
357\item Extract one carry and one ``bubble'' bit each from 64-bit
358  fields.
359\item Apply MatchStar to produce a mask of ``increment'' bits.
360\item Spread the increment bits to the 64-bit fields and add.
361\end{itemize}
362\item Potentially supports 4096 bit addition using 64-bit adders.
363\end{itemize}
364\end{frame}
365
366\begin{frame}[fragile]
367\frametitle{Long-Stream Addition}
368Compute the long-stream sum {\tt Z} = {\tt X} + {\tt Y}.
369\begin{tabular}{c||r|r|r|r|r|r|r|r||}\cline{2-9}
370{\tt X} & {\tt 19} & {\tt 31} & {\tt BA} & {\tt 4C} & {\tt 3D} & {\tt 45} & {\tt 21} & {\tt F1} \\ \cline{2-9}
371{\tt Y} & {\tt 22} & {\tt 12} & {\tt 45} & {\tt B3} & {\tt E2} & {\tt 16} & {\tt 17} & {\tt 36} \\ \cline{2-9}
372\alert<3>{\tt R} (add digits)& {\tt 3B} & {\tt 43} & \alert<3>{\tt FF} & \alert<3>{\tt FF} & {\tt 1F} & {\tt 5B} & {\tt 38} & {\tt 27} \\ \cline{2-9}
373{\tt x} = high\_bit({\tt X})& {\tt 0} & {\tt 0} & {\tt 1} & {\tt 0} & {\tt 0} & {\tt 0} & {\tt 0} & {\tt 1} \\ \cline{2-9}
374{\tt y} = high\_bit({\tt Y}) & {\tt 0} & {\tt 0} & {\tt 0} & {\tt 1} & {\tt 1} & {\tt 0} & {\tt 0} & {\tt 0} \\ \cline{2-9}
375{\tt r} = high\_bit({\tt Z}) & {\tt 0} & {\tt 0} & {\tt 1} & {\tt 1} & {\tt 0} & {\tt 0} & {\tt 0} & {\tt 0} \\ \cline{2-9}
376\alert<4>{\tt c} (carries)& {\tt 0} & {\tt 0} & {\tt 0} & {\tt 0} & \alert<4>{\tt 1} & {\tt 0} & {\tt 0} & \alert<4>{\tt 1} \\ \cline{2-9}
377{\tt c*2} (shift left 1)& {\tt 0} & {\tt 0} & {\tt 0} & {\tt 1} & {\tt 0} & {\tt 0} & {\tt 1} & {\tt 0} \\ \cline{2-9}
378{\tt b} (bubble ({\tt Z} = {\tt FF}) & {\tt 0} & {\tt 0} & \alert<5>{\tt 1} & \alert<5>{\tt 1} & {\tt 0} & {\tt 0} & {\tt 0} & {\tt 0} \\ \cline{2-9}
379\alert<6>{\tt i} = Matchstar({\tt b},{\tt c*2})& {\tt 0} & \alert<6>{\tt 1} &\alert<6> {\tt 1} & \alert<6>{\tt 1} & {\tt 0} & {\tt 0} & \alert<6>{\tt 1} & {\tt 0} \\ \cline{2-9}
380\alert<7>{\tt Z} = {\tt R} + {\tt i} & {\tt 3B} &  \alert<7>{\tt 44} & \alert<7>{\tt 0} & \alert<7>{\tt 0} & {\tt 1F} & {\tt 5B} & \alert<7>{\tt 39} & {\tt 27} \\ \cline{2-9}
381\end{tabular}
382\end{frame}
383
384
385\section{Performance}
386
387
388\begin{frame}[fragile]
389\frametitle{Test Expressions}
390
391
392{\tiny
393\begin{tabular}{|l|l|}
394\hline
395Name            & Expression    \\ \hline   
396@               & \verb`@`              \\ \hline     
397Date            & \verb`([0-9][0-9]?)/([0-9][0-9]?)/([0-9][0-9]([0-9][0-9])?)`          \\ \hline     
398Email           & \verb`([^ @]+)@([^ @]+)`              \\ \hline
399URI     & \verb`(([a-zA-Z][a-zA-Z0-9]*)://|mailto:)([^ /]+)(/[^ ]*)?|([^ @]+)@([^ @]+)`         \\ \hline     
400Hex             & \verb`[ ](0x)?([a-fA-F0-9][a-fA-F0-9])+[.:,?! ]`              \\ \hline
401StarHeight              & \verb`[A-Z]((([a-zA-Z]*a[a-zA-Z]*[ ])*[a-zA-Z]*e[a-zA-Z]*[ ])*[a-zA-Z]*s[a-zA-Z]*[ ])*[.?!]`          \\ \hline
402\end{tabular}
403}
404\end{frame}
405
406
407\begin{frame}[fragile]
408\frametitle{SSE2 Performance}
409\begin{tikzpicture}
410\begin{axis}[
411xtick=data,
412ylabel=Cycles per Byte,
413xticklabels={@,Date,Email,URI,Hex,StarHeight},
414tick label style={font=\tiny},
415enlarge x limits=0.15,
416%enlarge y limits={0.15, upper},
417ymin=0,
418legend style={at={(0.5,0.9)},
419anchor=north,legend columns=-1},
420ymax=44,
421ybar,
422bar width=7pt,
423%visualization depends on=y \as \rawy,
424]
425\addplot
426file {../data/cycles-bitstreams.dat};
427\addplot
428file {../data/cycles-nrgrep112.dat};
429\addplot
430file {../data/cycles-gre2p.dat};
431 
432\legend{bitstreams,nrgrep,gre2p,Annot}
433\end{axis}
434\end{tikzpicture}
435
436\end{frame}
437
438\begin{frame}[fragile]
439\frametitle{IPC}
440
441\begin{tikzpicture}
442\begin{axis}[
443xtick=data,
444ylabel=Instructions per Cycle,
445xticklabels={@,Date,Email,URI,Hex,StarHeight},
446tick label style={font=\tiny},
447enlarge x limits=0.15,
448%enlarge y limits={0.15, upper},
449ymin=0,
450legend style={at={(0.5,0.9)},
451anchor=north,legend columns=-1},
452ybar,
453bar width=7pt,
454]
455\addplot
456file {../data/ipc-bitstreams.dat};
457\addplot
458file {../data/ipc-nrgrep112.dat};
459\addplot
460file {../data/ipc-gre2p.dat};
461
462\legend{bitstreams,nrgrep,gre2p,Annot}
463\end{axis}
464\end{tikzpicture}
465\end{frame}
466
467
468\begin{frame}[fragile]
469\frametitle{SIMD Scalability}
470\begin{tikzpicture}
471\begin{axis}[
472xtick=data,
473ylabel=AVX2 Instruction Reduction,
474xticklabels={@,Date,Email,URI,Hex,StarHeight},
475tick label style={font=\tiny},
476enlarge x limits=0.15,
477%enlarge y limits={0.15, upper},
478ymin=0,
479legend style={at={(0.5,0.15)},
480anchor=north,legend columns=-1},
481ybar,
482bar width=7pt,
483]
484\addplot
485file {../data/sse2-avx2-instr-red-bitstreams.dat};
486\addplot
487file {../data/sse2-avx2-instr-red-nrgrep112.dat};
488\addplot
489file {../data/sse2-avx2-instr-red-gre2p.dat};
490
491\legend{bitstreams,nrgrep,gre2p,Annot}
492\end{axis}
493\end{tikzpicture}
494\end{frame}
495
496
497\begin{frame}[fragile]
498\frametitle{Speedups Achieved}
499
500\begin{center}
501\begin{tabular}{|c|c|c|c|} \hline
502\multirow{2}{*}{Expression} & \multicolumn{3}{c|}{Bitstream/AVX2 grep Speedup} \\ \cline{2-4}
503& vs. nrgrep & vs. gre2p & vs. GNU egrep \\ \hline \hline
504At & 3.5X & 34X & 1.6X\\ \hline
505Date & 0.76X & 13X & 48X\\ \hline
506Email & 9.5X & 28X & 12X\\ \hline
507URI & 6.6X & 27X & 518X\\ \hline
508Hex & 8.1X & 105X & 267X\\ \hline
509StarHeight & 1.9X & 7.6X & 97X\\ \hline
510\end{tabular}
511\end{center}
512\end{frame}
513
514
515\begin{frame}[fragile]
516\frametitle{GPU Performance}
517\begin{tikzpicture}
518\begin{axis}[
519xtick=data,
520ylabel=Running Time (ms per megabyte),
521xticklabels={@,Date,Email,URI,Hex,StarHeight},
522tick label style={font=\tiny},
523enlarge x limits=0.15,
524%enlarge y limits={0.15, upper},
525ymin=0,
526legend style={at={(0.5,0.8)},
527anchor=north,legend columns=-1},
528ybar,
529bar width=7pt,
530]
531\addplot
532file {../data/ssetime.dat};
533\addplot
534file {../data/avxtime.dat};
535\addplot
536file {../data/gputime.dat};
537
538\legend{SSE2,AVX2,GPU,Annot}
539\end{axis}
540\end{tikzpicture}
541
542
543\end{frame}
544
545\section{Conclusion}
546\begin{frame}[fragile]
547\frametitle{Results}
548\begin{itemize}%[<+->]
549\item A new class of parallel regular expression algorithms has been introduced
550based on the concept of bitwise data parallelism and MatchStar.
551\item Single core acceleration over sequential implementations can be dramatic.
552\item A long-stream addition technique has been developed to allow MatchStar to
553scale directly with SIMD instruction width.
554\item Perfect scaling in instruction count was observed with 256-bit AVX2
555technology versus 128-bit SIMD technology except for nested repetition.
556\item GPU implementations show promise, but need additional work.
557\end{itemize}
558\end{frame}
559
560\begin{frame}[fragile]
561\frametitle{Ongoing/Future Work}
562
563\begin{itemize}%[<+->]
564\item The prototype technologies have now been re-implemented in
565a single C++ executable combining 4 compilers.
566\begin{itemize}
567\item CCC: Character class compiler
568\item RE\_compile: regular expression compiler
569\item Pablo: Block-at-a-time compiler
570\item LLVM: Fully dynamic code generation.
571\end{itemize}
572\item Compilation overhead is high, but tolerable for large files.
573
574\item Unicode support has been added, including additional MatchStar
575algorithms for variable-length Unicode character classes.
576
577\item Open source implementation available: {\tt http://parabix.costar.sfu.ca/svn/icGREP/}
578\end{itemize}
579
580\end{frame}
581
582
583
584\end{document}
Note: See TracBrowser for help on using the repository browser.