Changeset 3149 for docs


Ignore:
Timestamp:
May 16, 2013, 5:21:10 PM (6 years ago)
Author:
ksherdy
Message:

Added references to additional papers, Agrep, Shift-or. Text updates.

Location:
docs/Working/re
Files:
8 edited

Legend:

Unmodified
Added
Removed
  • docs/Working/re/re-main.aux

    r3126 r3149  
    11\relax
     2\@writefile{toc}{\contentsline {section}{\numberline {1}Introduction}{1}}
     3\newlabel{Introduction}{{1}{1}}
    24\citation{aho2007}
    35\citation{abou-assaleh2004}
    46\citation{navarro2000}
    5 \@writefile{toc}{\contentsline {section}{\numberline {1}Introduction}{1}}
    6 \newlabel{Introduction}{{1}{1}}
     7\citation{cameron2008high}
     8\citation{cameron2009parallel}
     9\citation{cameron2011parallel}
     10\citation{lin2012parabix}
    711\citation{kleene1951}
    812\citation{thompson1968}
    9 \@writefile{toc}{\contentsline {section}{\numberline {2}Background}{2}}
    10 \newlabel{Background}{{2}{2}}
     13\citation{boyer1977fast}
     14\citation{navarro2002flexible}
     15\@writefile{toc}{\contentsline {section}{\numberline {2}Basic Concepts}{3}}
     16\newlabel{Basic Concepts}{{2}{3}}
     17\@writefile{toc}{\contentsline {subsection}{\numberline {2.1}Notation}{3}}
     18\@writefile{toc}{\contentsline {subsection}{\numberline {2.2}Regular Expressions}{3}}
     19\@writefile{toc}{\contentsline {section}{\numberline {3}Background}{3}}
     20\newlabel{Background}{{3}{3}}
     21\@writefile{toc}{\contentsline {subsection}{\numberline {3.1}Classical Methods}{3}}
     22\@writefile{toc}{\contentsline {subsection}{\numberline {3.2}Regular Expression and Finite Automata}{3}}
     23\citation{wu1992fast}
     24\citation{navarro1998bit}
     25\citation{navarro1998bit}
    1126\bibstyle{acm}
    1227\bibdata{reference}
    1328\bibcite{abou-assaleh2004}{1}
    1429\bibcite{aho2007}{2}
    15 \bibcite{kleene1951}{3}
    16 \bibcite{navarro2000}{4}
    17 \bibcite{thompson1968}{5}
    18 \@writefile{toc}{\contentsline {section}{\numberline {3}Parallel Bitwise Data Streams}{3}}
    19 \newlabel{Parallel Bitwise Data Streams}{{3}{3}}
    20 \@writefile{toc}{\contentsline {section}{\numberline {4}Compiler Technology}{3}}
    21 \newlabel{Compiler Technology}{{4}{3}}
    22 \@writefile{toc}{\contentsline {section}{\numberline {5}Methodology}{3}}
    23 \newlabel{Methodology}{{5}{3}}
    24 \@writefile{toc}{\contentsline {section}{\numberline {6}Experimental Results}{3}}
    25 \newlabel{Experimental Results}{{6}{3}}
    26 \@writefile{toc}{\contentsline {section}{\numberline {7}Conclusion}{3}}
    27 \newlabel{Conclusion}{{7}{3}}
     30\bibcite{boyer1977fast}{3}
     31\bibcite{cameron2009parallel}{4}
     32\@writefile{toc}{\contentsline {subsection}{\numberline {3.3}Bit-parallel Simulation of Automata}{4}}
     33\@writefile{toc}{\contentsline {subsection}{\numberline {3.4}Software Tools}{4}}
     34\@writefile{toc}{\contentsline {section}{\numberline {4}Bit-parallel Data Streams}{4}}
     35\newlabel{Bit-parallel Data Streams}{{4}{4}}
     36\@writefile{toc}{\contentsline {subsection}{\numberline {4.1}Match Star}{4}}
     37\@writefile{toc}{\contentsline {section}{\numberline {5}Compiler Technology}{4}}
     38\newlabel{Compiler Technology}{{5}{4}}
     39\@writefile{toc}{\contentsline {section}{\numberline {6}Methodology}{4}}
     40\newlabel{Methodology}{{6}{4}}
     41\@writefile{toc}{\contentsline {section}{\numberline {7}Experimental Results}{4}}
     42\newlabel{Experimental Results}{{7}{4}}
     43\@writefile{toc}{\contentsline {section}{\numberline {8}Conclusion}{4}}
     44\newlabel{Conclusion}{{8}{4}}
     45\bibcite{cameron2011parallel}{5}
     46\bibcite{cameron2008high}{6}
     47\bibcite{kleene1951}{7}
     48\bibcite{lin2012parabix}{8}
     49\bibcite{navarro2000}{9}
     50\bibcite{navarro1998bit}{10}
     51\bibcite{navarro2002flexible}{11}
     52\bibcite{thompson1968}{12}
     53\bibcite{wu1992fast}{13}
  • docs/Working/re/re-main.bbl

    r3126 r3149  
    1 \begin{thebibliography}{1}
     1\begin{thebibliography}{10}
    22
    33\bibitem{abou-assaleh2004}
     
    1111\newblock Addison Wesley, 2007.
    1212
     13\bibitem{boyer1977fast}
     14{\sc Boyer, R.~S., and Moore, J.~S.}
     15\newblock A fast string searching algorithm.
     16\newblock {\em Communications of the ACM 20}, 10 (1977), 762--772.
     17
     18\bibitem{cameron2009parallel}
     19{\sc Cameron, R., Herdy, K., and Amiri, E.}
     20\newblock Parallel bit stream technology as a foundation for xml parsing
     21  performance.
     22\newblock In {\em International Symposium on Processing XML Efficiently:
     23  Overcoming Limits on Space, Time, or Bandwidth\/} (2009), vol.~8.
     24
     25\bibitem{cameron2011parallel}
     26{\sc Cameron, R.~D., Amiri, E., Herdy, K.~S., Lin, D., Shermer, T.~C., and
     27  Popowich, F.~P.}
     28\newblock Parallel scanning with bitstream addition: An xml case study.
     29\newblock In {\em Euro-Par 2011 Parallel Processing}. Springer, 2011,
     30  pp.~2--13.
     31
     32\bibitem{cameron2008high}
     33{\sc Cameron, R.~D., Herdy, K.~S., and Lin, D.}
     34\newblock High performance xml parsing using parallel bit stream technology.
     35\newblock In {\em Proceedings of the 2008 conference of the center for advanced
     36  studies on collaborative research: meeting of minds\/} (2008), ACM, p.~17.
     37
    1338\bibitem{kleene1951}
    1439{\sc Kleene, S.~C.}
    1540\newblock Representation of events in nerve nets and finite automata.
     41
     42\bibitem{lin2012parabix}
     43{\sc Lin, D., Medforth, N., Herdy, K.~S., Shriraman, A., and Cameron, R.}
     44\newblock Parabix: Boosting the efficiency of text processing on commodity
     45  processors.
     46\newblock In {\em High Performance Computer Architecture (HPCA), 2012 IEEE 18th
     47  International Symposium on\/} (2012), IEEE, pp.~1--12.
    1648
    1749\bibitem{navarro2000}
     
    2052\newblock {\em Software Practice and Experience (SPE 31\/} (2000), 2001.
    2153
     54\bibitem{navarro1998bit}
     55{\sc Navarro, G., and Raffinot, M.}
     56\newblock A bit-parallel approach to suffix automata: Fast extended string
     57  matching.
     58\newblock In {\em Combinatorial Pattern Matching\/} (1998), Springer,
     59  pp.~14--33.
     60
     61\bibitem{navarro2002flexible}
     62{\sc Navarro, G., and Raffinot, M.}
     63\newblock {\em Flexible pattern matching in strings: practical on-line search
     64  algorithms for texts and biological sequences}.
     65\newblock Cambridge University Press, 2002.
     66
    2267\bibitem{thompson1968}
    2368{\sc Thompson, K.}
     
    2570\newblock {\em Communications of the ACM 11}, 6 (1968), 419--422.
    2671
     72\bibitem{wu1992fast}
     73{\sc Wu, S., and Manber, U.}
     74\newblock Fast text searching: allowing errors.
     75\newblock {\em Communications of the ACM 35}, 10 (1992), 83--91.
     76
    2777\end{thebibliography}
  • docs/Working/re/re-main.blg

    r3126 r3149  
    55Warning--empty institution in abou-assaleh2004
    66Warning--empty journal in kleene1951
    7 You've used 5 entries,
     7You've used 13 entries,
    88            2253 wiz_defined-function locations,
    9             556 strings with 4444 characters,
    10 and the built_in function-call counts, 1249 in all, are:
    11 = -- 116
    12 > -- 50
     9            606 strings with 5962 characters,
     10and the built_in function-call counts, 4215 in all, are:
     11= -- 398
     12> -- 191
    1313< -- 0
    14 + -- 21
    15 - -- 16
    16 * -- 86
    17 := -- 225
    18 add.period$ -- 14
    19 call.type$ -- 5
    20 change.case$ -- 23
     14+ -- 81
     15- -- 66
     16* -- 305
     17:= -- 692
     18add.period$ -- 39
     19call.type$ -- 13
     20change.case$ -- 71
    2121chr.to.int$ -- 0
    22 cite$ -- 7
    23 duplicate$ -- 48
    24 empty$ -- 104
    25 format.name$ -- 16
    26 if$ -- 250
     22cite$ -- 15
     23duplicate$ -- 176
     24empty$ -- 353
     25format.name$ -- 66
     26if$ -- 885
    2727int.to.chr$ -- 0
    28 int.to.str$ -- 5
    29 missing$ -- 5
    30 newline$ -- 27
    31 num.names$ -- 10
    32 pop$ -- 23
     28int.to.str$ -- 13
     29missing$ -- 14
     30newline$ -- 67
     31num.names$ -- 26
     32pop$ -- 89
    3333preamble$ -- 1
    34 purify$ -- 18
     34purify$ -- 59
    3535quote$ -- 0
    36 skip$ -- 26
     36skip$ -- 102
    3737stack$ -- 0
    38 substring$ -- 58
    39 swap$ -- 8
     38substring$ -- 210
     39swap$ -- 42
    4040text.length$ -- 0
    4141text.prefix$ -- 0
    4242top$ -- 0
    43 type$ -- 18
     43type$ -- 48
    4444warning$ -- 2
    45 while$ -- 12
    46 width$ -- 6
    47 write$ -- 49
     45while$ -- 42
     46width$ -- 15
     47write$ -- 134
    4848(There were 2 warnings)
  • docs/Working/re/re-main.log

    r3126 r3149  
    1 This is pdfTeX, Version 3.1415926-1.40.10 (TeX Live 2009/Debian) (format=pdflatex 2012.10.7)  10 MAY 2013 15:24
     1This is pdfTeX, Version 3.1415926-1.40.10 (TeX Live 2009/Debian) (format=pdflatex 2013.1.29)  16 MAY 2013 17:20
    22entering extended mode
    33 %&-line parsing enabled.
     
    249249\openout1 = `re-main.aux'.
    250250
    251 LaTeX Font Info:    Checking defaults for OML/cmm/m/it on input line 20.
    252 LaTeX Font Info:    ... okay on input line 20.
    253 LaTeX Font Info:    Checking defaults for T1/cmr/m/n on input line 20.
    254 LaTeX Font Info:    ... okay on input line 20.
    255 LaTeX Font Info:    Checking defaults for OT1/cmr/m/n on input line 20.
    256 LaTeX Font Info:    ... okay on input line 20.
    257 LaTeX Font Info:    Checking defaults for OMS/cmsy/m/n on input line 20.
    258 LaTeX Font Info:    ... okay on input line 20.
    259 LaTeX Font Info:    Checking defaults for OMX/cmex/m/n on input line 20.
    260 LaTeX Font Info:    ... okay on input line 20.
    261 LaTeX Font Info:    Checking defaults for U/cmr/m/n on input line 20.
    262 LaTeX Font Info:    ... okay on input line 20.
    263 LaTeX Font Info:    External font `cmex10' loaded for size
    264 (Font)              <12> on input line 23.
    265 LaTeX Font Info:    External font `cmex10' loaded for size
    266 (Font)              <8> on input line 23.
    267 LaTeX Font Info:    External font `cmex10' loaded for size
    268 (Font)              <6> on input line 23.
    269 LaTeX Font Info:    External font `cmex10' loaded for size
    270 (Font)              <7> on input line 52.
    271 LaTeX Font Info:    External font `cmex10' loaded for size
    272 (Font)              <5> on input line 52.
     251LaTeX Font Info:    Checking defaults for OML/cmm/m/it on input line 17.
     252LaTeX Font Info:    ... okay on input line 17.
     253LaTeX Font Info:    Checking defaults for T1/cmr/m/n on input line 17.
     254LaTeX Font Info:    ... okay on input line 17.
     255LaTeX Font Info:    Checking defaults for OT1/cmr/m/n on input line 17.
     256LaTeX Font Info:    ... okay on input line 17.
     257LaTeX Font Info:    Checking defaults for OMS/cmsy/m/n on input line 17.
     258LaTeX Font Info:    ... okay on input line 17.
     259LaTeX Font Info:    Checking defaults for OMX/cmex/m/n on input line 17.
     260LaTeX Font Info:    ... okay on input line 17.
     261LaTeX Font Info:    Checking defaults for U/cmr/m/n on input line 17.
     262LaTeX Font Info:    ... okay on input line 17.
     263LaTeX Font Info:    External font `cmex10' loaded for size
     264(Font)              <12> on input line 20.
     265LaTeX Font Info:    External font `cmex10' loaded for size
     266(Font)              <8> on input line 20.
     267LaTeX Font Info:    External font `cmex10' loaded for size
     268(Font)              <6> on input line 20.
     269LaTeX Font Info:    External font `cmex10' loaded for size
     270(Font)              <7> on input line 65.
     271LaTeX Font Info:    External font `cmex10' loaded for size
     272(Font)              <5> on input line 65.
    273273 [1
    274274
    275275{/var/lib/texmf/fonts/map/pdftex/updmap/pdftex.map}]
    276 
    277 LaTeX Warning: Reference `Bitwise Parallel Data Streams' on page 2 undefined on
    278  input line 100.
    279 
    280 [2] (./re-main.bbl) [3] (./re-main.aux)
    281 
    282 LaTeX Warning: There were undefined references.
    283 
    284  )
     276LaTeX Font Info:    Try loading font information for OMS+cmr on input line 123.
     277
     278
     279(/usr/share/texmf-texlive/tex/latex/base/omscmr.fd
     280File: omscmr.fd 1999/05/25 v2.5h Standard LaTeX font definitions
     281)
     282LaTeX Font Info:    Font shape `OMS/cmr/m/n' in size <10> not available
     283(Font)              Font shape `OMS/cmsy/m/n' tried instead on input line 123.
     284 [2] [3] (./re-main.bbl
     285[4]) [5] (./re-main.aux) )
    285286Here is how much of TeX's memory you used:
    286  463 strings out of 493848
    287  4615 string characters out of 1152823
     287 487 strings out of 493848
     288 5012 string characters out of 1152823
    288289 55050 words of memory out of 3000000
    289  3791 multiletter control sequences out of 15000+50000
    290  8842 words of font info for 31 fonts, out of 3000000 for 9000
     290 3812 multiletter control sequences out of 15000+50000
     291 9186 words of font info for 32 fonts, out of 3000000 for 9000
    291292 714 hyphenation exceptions out of 8191
    292  23i,6n,19p,165b,199s stack positions out of 5000i,500n,10000p,200000b,50000s
    293 </usr/share/texmf-texlive/fonts/type1/public/amsfonts/cm/cmbx10.pfb></usr/sha
    294 re/texmf-texlive/fonts/type1/public/amsfonts/cm/cmbx12.pfb></usr/share/texmf-te
    295 xlive/fonts/type1/public/amsfonts/cm/cmbx9.pfb></usr/share/texmf-texlive/fonts/
    296 type1/public/amsfonts/cm/cmcsc10.pfb></usr/share/texmf-texlive/fonts/type1/publ
    297 ic/amsfonts/cm/cmmi7.pfb></usr/share/texmf-texlive/fonts/type1/public/amsfonts/
    298 cm/cmr10.pfb></usr/share/texmf-texlive/fonts/type1/public/amsfonts/cm/cmr12.pfb
    299 ></usr/share/texmf-texlive/fonts/type1/public/amsfonts/cm/cmr17.pfb></usr/share
    300 /texmf-texlive/fonts/type1/public/amsfonts/cm/cmr7.pfb></usr/share/texmf-texliv
    301 e/fonts/type1/public/amsfonts/cm/cmr9.pfb></usr/share/texmf-texlive/fonts/type1
    302 /public/amsfonts/cm/cmti10.pfb>
    303 Output written on re-main.pdf (3 pages, 152136 bytes).
     293 23i,6n,19p,193b,209s stack positions out of 5000i,500n,10000p,200000b,50000s
     294</usr/share/texmf-texlive/fonts/type1/public/amsfonts
     295/cm/cmbx12.pfb></usr/share/texmf-texlive/fonts/type1/public/amsfonts/cm/cmbx9.p
     296fb></usr/share/texmf-texlive/fonts/type1/public/amsfonts/cm/cmcsc10.pfb></usr/s
     297hare/texmf-texlive/fonts/type1/public/amsfonts/cm/cmmi10.pfb></usr/share/texmf-
     298texlive/fonts/type1/public/amsfonts/cm/cmmi7.pfb></usr/share/texmf-texlive/font
     299s/type1/public/amsfonts/cm/cmr10.pfb></usr/share/texmf-texlive/fonts/type1/publ
     300ic/amsfonts/cm/cmr12.pfb></usr/share/texmf-texlive/fonts/type1/public/amsfonts/
     301cm/cmr17.pfb></usr/share/texmf-texlive/fonts/type1/public/amsfonts/cm/cmr7.pfb>
     302</usr/share/texmf-texlive/fonts/type1/public/amsfonts/cm/cmr9.pfb></usr/share/t
     303exmf-texlive/fonts/type1/public/amsfonts/cm/cmsy10.pfb></usr/share/texmf-texliv
     304e/fonts/type1/public/amsfonts/cm/cmti10.pfb></usr/share/texmf-texlive/fonts/typ
     305e1/public/amsfonts/cm/cmti9.pfb>
     306Output written on re-main.pdf (5 pages, 191559 bytes).
    304307PDF statistics:
    305  56 PDF objects out of 1000 (max. 8388607)
     308 70 PDF objects out of 1000 (max. 8388607)
    306309 0 named destinations out of 1000 (max. 500000)
    307310 1 words of extra memory for PDF output out of 10000 (max. 10000000)
  • docs/Working/re/re-main.tex

    r3138 r3149  
    33
    44%opening
    5 \title{Fast Regular Expression Matching using Parallel Bitstreams}
     5\title{Fast Regular Expression Matching with Bit-parallel Data Streams}
    66\author{
    77{Robert D. Cameron} \\
     
    2222\begin{abstract}
    2323
    24 A parallel regular expression matching method is introduced and studied in
    25 comparison with software tools designed for efficient on-line text searching such
    26 as the {\em grep} family.
    27 
     24An parallel regular expression matching pattern method is
     25introduced and compared with the state of the art in software tools designed for
     26efficient on-line search. %such as the {\em grep} family pattern matching tools.
    2827The method is based on the concept of bit-parallel data streams,
    2928in which parallel streams of bits are formed such
     
    3231
    3332An implementation of the method in the form of a regular expression
    34 compiler is discussed. The compiler accepts a regular expression and produces
    35 sequences of arbitrary-length bit-parallel equations.
    36 Bit-parallel equations are then transformed
    37 into a low-level C-based implementation.
    38 
    39 The C-based program accepts the text to be searched and
    40 signals a match each time the text
    41 matches the compiled regular expression.
    42 
    43 
    44 These low-level implementations take advantage of
     33compiler is discussed. The compiler accepts a regular expression and
     34forms unbounded bit-parallel data stream operations. Bit-parallel operations
     35are then transformed into a low-level C-based implementation for compilation into native
     36pattern matching applications. These low-level C-based implementations take advantage of
    4537the SIMD (single-instruction multiple-data) capabilities of commodity
    46 processors to yield a dramatic speed-up over
    47 traditional byte-at-a-time approaches.
    48 
    49 On processors supporting W-bit
    50 addition operations, the method processes W source characters 
     38processors to yield a dramatic speed-up over traditional byte-at-a-time approaches.
     39On processors supporting W-bit addition operations, the method processes W source characters
    5140in parallel and performs up to W finite state transitions per clock cycle.
    5241
    53 Additional parallelism is achieved through the introduction a new parallel
    54 scanning primitive termed {\em Match Star}. % which eliminates backtracking in Kleene Closure.
    55 
    56 
    57 We demonstrate the features and efficiency of our bit-parallel pattern
    58 matching appoach by searching text data sets for lines matching a regular expression.
    59 
    60 We evaluate the performance of our method against the widely known grep implemenations,
    61 {\em Gnu grep}, {\em agrep}, {\em nr-grep}, as well as {\em Google's re2} regular expression engine.
    62 
    63 Performance results show a dramatic speed-up over the traditional and state-of-the-art alternatives.
    64 
     42We introduce a new bit-parallel scanning primitive, {\em Match Star}.
     43which performs parallel Kleene closure over character classes
     44and without eliminates backtracking. % expand and rephrase description of Match Star
     45
     46We evaluate the performance of our method in comparison with several widely known {\em grep}
     47family implemenations, {\em Gnu grep}, {\em agrep}, {\em nr-grep},
     48and regular expression engines such as {\em Google's re2}.
     49Performance results are analyzed using the performance monitoring counters
     50of commodity hardware. Overall, our results demonstrate a dramatic speed-up
     51over publically available alternatives.
    6552
    6653\end{abstract}
     
    7158
    7259Regular expresssion matching is an extensively studied problem with application to
    73 numerous application domains. A multitude
    74 of algorithms and software tools have been developed to the address the particular
    75 demands of the various application domains.
     60text processing and bioinformatics and with numerous algorithms
     61and software tools developed to the address the particular
     62processing demands. % reword
    7663
    7764The pattern matching problem can be
     
    8168slightly different output such as the line that matches the pattern.
    8269
    83 A pattern P can be a simple string, but it can also be, a regular expression.
    84 A regular expression, is an expression that specifies a set of strings.
    85 A regular expression is composed of (i) simple strings and (ii) the
     70A pattern P can be a simple string, but it can also be a regular expression.
     71A regular expression, is an expression that specifies a set of strings
     72and is composed of (i) simple strings and (ii) the
    8673union, concatenation and Kleene closure of other regular expressions.
    8774To avoid parentheses it is assumed that the Kleene star has the highest priority,
     
    118105
    119106The remainder of this paper is organized as follows.
    120 Section~\ref{Background} presents background material on classic
    121 regular expression pattern matching techniques and provides insight into the
    122 efficiency of traditional regular expression software tools. Next,
    123 Section~\ref{Bit-parallel Data Streams} describes our parallel regular expression matching techniques.
     107Section~\ref{Basic Concepts} introduces the notations and basic concepts used throughout this paper.
     108Section~\ref{Background} presents background material on classical and state-of-the-art approaches
     109to high performance regular expression matching. In addition, this section provides insight into the
     110efficiency of traditional on-line pattern matching tools. Next,
     111Section~\ref{Bit-parallel Data Streams} describes our parallel regular expression matching techniques.
    124112Section~\ref{Compiler Technology} presents our software toolchain for constructing pattern matching applications.
    125113Section~\ref{Methodology} describes the evaluation framework and Section~\ref{Experimental Results}
    126 presents a detailed performance analysis of our data parallel bitstream techniques against
    127 Gnu grep, agrep, and nr-grep.
    128 Section~\ref{Conclusion} concludes the paper.
     114presents a detailed performance analysis of our data parallel bitstream techniques in comparison to
     115Gnu grep, agrep, and nr-grep, and re2. Section~\ref{Conclusion} concludes the paper.
     116
     117The fundamental contribution of this paper is fully general approach to regular expression matching
     118using bit-parallel data streams. The algorithmic aspects of this paper build upon
     119the fundamental concepts of our previous work
     120in some cases \cite{cameron2008high, cameron2009parallel, cameron2011parallel, lin2012parabix}.
     121 Individual contributions include:
     122\begin{itemize}
     123\item compilation of regular expressions into unbounded bit-parallel data stream equations;
     124\item documentation of character classes compilation into bit-parallel character class data streams;
     125% \item methods to select optimal subpatterns;
     126\item bit-parallel and backtrack-free scanning primitive termed Match Star;
     127\item bit-parallel data stream support for unicode.
     128\end{itemize}
     129
     130\section{Basic Concepts}
     131\label{Basic Concepts}
     132We define the notations and basic concepts used throughout this paper.
     133
     134\subsection{Notation}
     135We use the following notations. Let $P=p_{1}p_{2}\ldots{}p_{m}$
     136be a pattern of length m and $T=t_{1}t_{2}\ldots{}t_{n}$
     137be a text of length n defined over a finite alphabet $sigma$ of size $alpha$.
     138The task of regular expression matching
     139is to find all the text positions of T that follow an occurrence of P. We use
     140C notations to represent bitwise operations $\lnot{}$, $\lor{}$, $\land{}$, $\oplus{}$
     141represent bitwise NOT, OR, AND, XOR, respectively. Operators $\ll{}k$, and $\gg{}$ represent
     142logical left shift, and logical right shift, respectively and are further modulated by
     143the number of bit positions a given value shall be shifted by, for example ``shift left by n''.
     144Vacant bit-positions are filled in with zeroes.
     145
     146\subsection{Regular Expressions}
     147
     148% TODO - Define in terms of recursive defintion, include character classes, bounded (repeated) repetition
    129149
    130150\section{Background}
    131151\label{Background}
    132152%\input{background.tex}
     153
     154\subsection{Classical Methods}
     155
     156\subsection{Regular Expression and Finite Automata}
    133157
    134158The origins of regular expression matching date back to automata theory
     
    149173based on the current state and the next character read.
    150174
    151 
    152 Bit-parallel simulation of automata.
     175The Boyer-Moore family of algorithms \cite{boyer1977fast} , Horspool, ... skip characters
     176
     177Suffix automata (BDM)
    153178
    154179The ideas presented up to now aim at a good implementation of the automa-
     
    157182for the regular expression to match.
    158183
    159 
    160 Suffix automata (BDM/BNDM)
    161 
    162 Skip characters pattern length (occurence frequency and length).
    163 
    164 
    165 \section{Bit-parallel Data Streams}
    166 \label{Bit-parallel data streams}
    167 
    168 The advantage of the parallel bit stream representation is that we can
    169 use the 128-bit SIMD registers commonly found on commodity processors
    170 (e.g. SSE on Intel) to process 128 byte positions at a time using
    171 bitwise logic, shifting and other operations.
    172 
    173 Skip characters register width.
     184\subsection{Bit-parallel Simulation of Automata}
     185
     186Define bit-parallelism \cite{navarro2002flexible}
     187
     188Shift-Or \cite{wu1992fast}
     189
     190Backward Dawg Matching (BDM) \cite{navarro1998bit}
     191
     192Bit-parallel suffix automata (Backward Non-Deterministic Dawg Matching (BNDM) \cite{navarro1998bit} algorithm)
     193
     194Ability to skip characters pattern length (occurence frequency and length).
    174195
    175196\subsection{Software Tools}
     
    185206%http://pages.cs.wisc.edu/~mdant/cs520_4.html
    186207
    187 \subsection{Mask Star}
     208%Given a regular expression R and a test T the regular expression matching
     209%problem finds all ending position of substrings in Q that matches a string in
     210%the language denoted by R.
     211%The behaviour of Gnu grep, agrep, and nr-grep are differ ...
     212%Gnu grep
     213%agrep
     214%nr-grep
     215%re2
     216
     217
     218\section{Bit-parallel Data Streams}
     219\label{Bit-parallel Data Streams}
     220
     221The bit-parallel data streams use the wide
     222SIMD registers commonly found on commodity processors
     223to process  byte positions at a time using
     224bitwise logic, shifting and other operations.
     225
     226A signficant advantage of the bit-parallel
     227data stream method over other
     228pattern matching methods that rely on
     229bit-parallel automata simulation
     230is the potential to skip full register width
     231number of characters in low occurence frequency text. % reword
     232
     233
     234Skip characters register width.
     235
     236\subsection{Match Star}
    188237
    189238%Wikipedia
     
    198247%\input{methodology.tex}
    199248
    200 We compare the performance of our parallel \bitstream{} techniques against
    201 Gnu grep, agrep, and nr-grep.
    202 
    203 Given a regular expression R and a test T the regular expression matching
    204 problem finds all ending position of substrings in Q that matches a string in
    205 the language denoted by R.
    206 
    207 The behaviour of Gnu grep, agrep, and nr-grep are differ in that
    208 
    209 Gnu grep
    210 
    211 agrep
    212 
    213 nr-grep
     249We compare the performance of our bit-parallel data stream techniques against
     250Gnu grep, agrep, nr-grep, and re2.
    214251
    215252
  • docs/Working/re/re-main.tex.backup

    r3138 r3149  
    33
    44%opening
    5 \title{Fast Regular Expression Matching using Parallel Bitstreams}
     5\title{Fast Regular Expression Matching with Bit-parallel Data Streams}
    66\author{
    77{Robert D. Cameron} \\
     
    2222\begin{abstract}
    2323
    24 A parallel regular expression matching method is introduced and studied in
    25 comparison with software tools designed for efficient on-line text searching such
    26 as the {\em grep} family.
    27 
     24An parallel regular expression matching pattern method is
     25introduced and compared with the state of the art in software tools designed for
     26efficient on-line search. %such as the {\em grep} family pattern matching tools.
    2827The method is based on the concept of bit-parallel data streams,
    2928in which parallel streams of bits are formed such
     
    3231
    3332An implementation of the method in the form of a regular expression
    34 compiler is discussed. The compiler accepts a regular expression and produces
    35 sequences of arbitrary-length bit-parallel equations.
    36 Bit-parallel equations are then transformed
    37 into a low-level C-based implementation.
    38 
    39 The C-based program accepts the text to be searched and
    40 signals a match each time the text
    41 matches the compiled regular expression.
    42 
    43 
    44 These low-level implementations take advantage of
     33compiler is discussed. The compiler accepts a regular expression and
     34forms unbounded bit-parallel data stream operations. Bit-parallel operations
     35are then transformed into a low-level C-based implementation for compilation into native
     36pattern matching applications. These low-level C-based implementations take advantage of
    4537the SIMD (single-instruction multiple-data) capabilities of commodity
    46 processors to yield a dramatic speed-up over
    47 traditional byte-at-a-time approaches.
    48 
    49 On processors supporting W-bit
    50 addition operations, the method processes W source characters 
     38processors to yield a dramatic speed-up over traditional byte-at-a-time approaches.
     39On processors supporting W-bit addition operations, the method processes W source characters
    5140in parallel and performs up to W finite state transitions per clock cycle.
    5241
    53 Additional parallelism is achieved through the introduction a new parallel
    54 scanning primitive termed {\em Match Star}. % which eliminates backtracking in Kleene Closure.
    55 
    56 
    57 We demonstrate the features and efficiency of our bit-parallel pattern
    58 matching appoach by searching text data sets for lines matching a regular expression.
    59 
    60 We evaluate the performance of our method against the widely known grep implemenations,
    61 {\em Gnu grep}, {\em agrep}, {\em nr-grep}, as well as {\em Google's re2} regular expression engine.
    62 
    63 Performance results show a dramatic speed-up over the traditional and state-of-the-art alternatives.
    64 
     42We introduce a new bit-parallel scanning primitive, {\em Match Star}.
     43which performs parallel Kleene closure over character classes
     44and without eliminates backtracking. % expand and rephrase description of Match Star
     45
     46We evaluate the performance of our method in comparison with several widely known {\em grep}
     47family implemenations, {\em Gnu grep}, {\em agrep}, {\em nr-grep},
     48and regular expression engines such as {\em Google's re2}.
     49Performance results are analyzed using the performance monitoring counters
     50of commodity hardware. Overall, our results demonstrate a dramatic speed-up
     51over publically available alternatives.
    6552
    6653\end{abstract}
     
    7158
    7259Regular expresssion matching is an extensively studied problem with application to
    73 numerous application domains. A multitude
    74 of algorithms and software tools have been developed to the address the particular
    75 demands of the various application domains.
     60text processing and bioinformatics and with numerous algorithms
     61and software tools developed to the address the particular
     62processing demands. % reword
    7663
    7764The pattern matching problem can be
     
    8168slightly different output such as the line that matches the pattern.
    8269
    83 A pattern P can be a simple string, but it can also be, a regular expression.
    84 A regular expression, is an expression that specifies a set of strings.
    85 A regular expression is composed of (i) simple strings and (ii) the
     70A pattern P can be a simple string, but it can also be a regular expression.
     71A regular expression, is an expression that specifies a set of strings
     72and is composed of (i) simple strings and (ii) the
    8673union, concatenation and Kleene closure of other regular expressions.
    8774To avoid parentheses it is assumed that the Kleene star has the highest priority,
     
    118105
    119106The remainder of this paper is organized as follows.
    120 Section~\ref{Background} presents background material on classic
    121 regular expression pattern matching techniques and provides insight into the
    122 efficiency of traditional regular expression software tools. Next,
    123 Section~\ref{Bit-parallel Data Streams} describes our parallel regular expression matching techniques.
     107Section~\ref{Basic Concepts} introduces the notations and basic concepts used throughout this paper.
     108Section~\ref{Background} presents background material on classical and state-of-the-art approaches
     109to high performance regular expression matching. In addition, this section provides insight into the
     110efficiency of traditional on-line pattern matching tools. Next,
     111Section~\ref{Bit-parallel Data Streams} describes our parallel regular expression matching techniques.
    124112Section~\ref{Compiler Technology} presents our software toolchain for constructing pattern matching applications.
    125113Section~\ref{Methodology} describes the evaluation framework and Section~\ref{Experimental Results}
    126 presents a detailed performance analysis of our data parallel bitstream techniques against
    127 Gnu grep, agrep, and nr-grep.
    128 Section~\ref{Conclusion} concludes the paper.
     114presents a detailed performance analysis of our data parallel bitstream techniques in comparison to
     115Gnu grep, agrep, and nr-grep, and re2. Section~\ref{Conclusion} concludes the paper.
     116
     117The fundamental contribution of this paper is fully general approach to regular expression matching
     118using bit-parallel data streams. The algorithmic aspects of this paper build upon
     119the fundamental concepts of our previous work
     120in some cases \cite{cameron2008high, cameron2009parallel, cameron2011parallel, lin2012parabix}.
     121 Individual contributions include:
     122\begin{itemize}
     123\item compilation of regular expressions into unbounded bit-parallel data stream equations;
     124\item documentation of character classes compilation into bit-parallel character class data streams;
     125% \item methods to select optimal subpatterns;
     126\item bit-parallel and backtrack-free scanning primitive termed Match Star;
     127\item bit-parallel data stream support for unicode.
     128\end{itemize}
     129
     130\section{Basic Concepts}
     131\label{Basic Concepts}
     132We define the notations and basic concepts used throughout this paper.
     133
     134\subsection{Notation}
     135We use the following notations. Let $P=p_{1}p_{2}\ldots{}p_{m}$
     136be a pattern of length m and $T=t_{1}t_{2}\ldots{}t_{n}$
     137be a text of length n defined over a finite alphabet $sigma$ of size $alpha$.
     138The task of regular expression matching
     139is to find all the text positions of T that follow an occurrence of P. We use
     140C notations to represent bitwise operations $\lnot{}$, $\lor{}$, $\land{}$, $\oplus{}$, $\ll{}$, and $\lgg{}$
     141represent bitwise NOT, OR, AND, XOR, k-bit left shift and k-bit right shift, respectively.
     142
     143\subsection{Regular Expressions}
     144
     145% TODO
    129146
    130147\section{Background}
    131148\label{Background}
    132149%\input{background.tex}
     150
     151\subsection{Classical Methods}
     152
     153\subsection{Regular Expression and Finite Automata}
    133154
    134155The origins of regular expression matching date back to automata theory
     
    138159Following Thompson's approach, a regular expression of length m is first converted
    139160to an NFA with O(m) nodes. It is possible to search a text of length n using the
    140 NFA directly in O(mn) worst case time. Alternatively, a more efficient choice
     161NFA directly in O(mn) worst case time. Often, a more efficient choice
    141162is to convert the NFA into a DFA. A DFA has only a single active state and
    142163allows to search the text at O(n) worst-case optimal. It is well known that the
     
    146167Thompson's original work marked the beginning of a long line of
    147168regular expression implementations that
    148 process an input string, character-by-character, and that change based on the current state and the
    149 character read. Thompson created the first grep utility as a standalone adaptation
    150 of the regular expression parser he had written for the UNIX ed utility. In 1976,
    151 Aho improved upon Thompson's implementation that with a DFA-based implementation called egrep.
    152 
    153 Bit-parallel simulation of automata.
    154 
    155 Suffix automata (BDM/BNDM)
     169process an input string, character-at-a-time, and that transition patterm matching state
     170based on the current state and the next character read.
     171
     172The Boyer-Moore family of algorithms \cite{boyer1977fast} , Horspool, ... skip characters
     173
     174Suffix automata (BDM)
     175
     176The ideas presented up to now aim at a good implementation of the automa-
     177ton, but they must inspect all the text characters. In many cases, however, the
     178regular expression involves sets of relatively long substrings that must appear
     179for the regular expression to match.
     180
     181\subsection{Bit-parallel Simulation of Automata}
     182
     183Define bit-parallelism \cite{navarro2002flexible}
     184
     185Shift-Or \cite{}
     186
     187Backward Dawg Matching (BDM) \cite{navarro1998bit}
     188
     189Bit-parallel suffix automata (Backward Non-Deterministic Dawg Matching (BNDM) \cite{navarro1998bit} algorithm)
    156190
    157191Skip characters pattern length (occurence frequency and length).
     
    159193
    160194\section{Bit-parallel Data Streams}
    161 \label{Bit-parallel data streams}
    162 
    163 The advantage of the parallel bit stream representation is that we can
    164 use the 128-bit SIMD registers commonly found on commodity processors
    165 (e.g. SSE on Intel) to process 128 byte positions at a time using
     195\label{Bit-parallel Data Streams}
     196
     197The bit-parallel data streams use the wide
     198SIMD registers commonly found on commodity processors
     199to process byte positions at a time using
    166200bitwise logic, shifting and other operations.
    167201
     202A signficant advantage of the bit-parallel
     203data stream method over other
     204pattern matching methods that rely on
     205bit-parallel automata simulation
     206is the potential to skip full register width
     207number of characters in low occurence frequency text. % reword
     208
     209
    168210Skip characters register width.
    169211
    170 \subsection{Mask Star}
     212\subsection{Software Tools}
     213
     214%Thompson created the first grep (UNIX grep) as a standalone adaptation
     215%of the regular expression parser he had written for the UNIX ed utility.
     216%In 1976, Aho improved upon Thompson's implementation that with a DFA-based implementation called egrep.
     217%Egrep was faster then grep for simple patterns but for more complex searches it lagged behind because of the
     218%time it took to build a complete finite automaton for the regular expression before it could even start searching.
     219%Since grep used a nondeterministic finite automaton it took less time to build the state machine but more time
     220%to evaluate strings with it. Aho later used a technique called cached lazy evaluation to improve the performance of egrep.
     221%It took zero set-up time and just one additional test in the inner loop.
     222%http://pages.cs.wisc.edu/~mdant/cs520_4.html
     223
     224%Given a regular expression R and a test T the regular expression matching
     225%problem finds all ending position of substrings in Q that matches a string in
     226%the language denoted by R.
     227%The behaviour of Gnu grep, agrep, and nr-grep are differ ...
     228%Gnu grep
     229%agrep
     230%nr-grep
     231%re2
     232
     233\subsection{Match Star}
    171234
    172235%Wikipedia
     
    181244%\input{methodology.tex}
    182245
    183 We compare the performance of our parallel \bitstream{} techniques against
    184 Gnu grep, agrep, and nr-grep.
    185 
    186 Given a regular expression R and a test T the regular expression matching
    187 problem finds all ending position of substrings in Q that matches a string in
    188 the language denoted by R.
    189 
    190 The behaviour of Gnu grep, agrep, and nr-grep are differ in that
    191 
    192 Gnu grep
    193 
    194 agrep
    195 
    196 nr-grep
     246We compare the performance of our bit-parallel data stream techniques against
     247Gnu grep, agrep, nr-grep, and re2.
    197248
    198249
  • docs/Working/re/reference.bib

    r3126 r3149  
    3838    pages = {2001}
    3939}
     40
     41@inproceedings{navarro1998bit,
     42  title={A bit-parallel approach to suffix automata: Fast extended string matching},
     43  author={Navarro, Gonzalo and Raffinot, Mathieu},
     44  booktitle={Combinatorial Pattern Matching},
     45  pages={14--33},
     46  year={1998},
     47  organization={Springer}
     48}
     49
     50@book{navarro2002flexible,
     51  title={Flexible pattern matching in strings: practical on-line search algorithms for texts and biological sequences},
     52  author={Navarro, Gonzalo and Raffinot, Mathieu},
     53  year={2002},
     54  publisher={Cambridge University Press}
     55}
     56
     57@inproceedings{cameron2008high,
     58  title={High performance XML parsing using parallel bit stream technology},
     59  author={Cameron, Robert D and Herdy, Kenneth S and Lin, Dan},
     60  booktitle={Proceedings of the 2008 conference of the center for advanced studies on collaborative research: meeting of minds},
     61  pages={17},
     62  year={2008},
     63  organization={ACM}
     64}
     65
     66@inproceedings{cameron2009parallel,
     67  title={Parallel bit stream technology as a foundation for XML parsing performance},
     68  author={Cameron, Rob and Herdy, Ken and Amiri, Ehsan},
     69  booktitle={International Symposium on Processing XML Efficiently: Overcoming Limits on Space, Time, or Bandwidth},
     70  volume={8},
     71  year={2009}
     72}
     73
     74@incollection{cameron2011parallel,
     75  title={Parallel scanning with bitstream addition: An xml case study},
     76  author={Cameron, Robert D and Amiri, Ehsan and Herdy, Kenneth S and Lin, Dan and Shermer, Thomas C and Popowich, Fred P},
     77  booktitle={Euro-Par 2011 Parallel Processing},
     78  pages={2--13},
     79  year={2011},
     80  publisher={Springer}
     81}
     82
     83@inproceedings{lin2012parabix,
     84  title={Parabix: Boosting the efficiency of text processing on commodity processors},
     85  author={Lin, Dan and Medforth, Nigel and Herdy, Kenneth S and Shriraman, Arrvindh and Cameron, Rob},
     86  booktitle={High Performance Computer Architecture (HPCA), 2012 IEEE 18th International Symposium on},
     87  pages={1--12},
     88  year={2012},
     89  organization={IEEE}
     90}
     91
     92@article{boyer1977fast,
     93  title={A fast string searching algorithm},
     94  author={Boyer, Robert S and Moore, J Strother},
     95  journal={Communications of the ACM},
     96  volume={20},
     97  number={10},
     98  pages={762--772},
     99  year={1977},
     100  publisher={ACM}
     101}
     102
     103@article{wu1992agrep,
     104  title={‘Agrep—A Fast Approximate Pattern-Matching Tool},
     105  author={Wu, Sun and Manber, Udi},
     106  journal={Usenix Winter 1992},
     107  pages={153--162},
     108  year={1992}
     109}
     110
     111@article{wu1992fast,
     112  title={Fast text searching: allowing errors},
     113  author={Wu, Sun and Manber, Udi},
     114  journal={Communications of the ACM},
     115  volume={35},
     116  number={10},
     117  pages={83--91},
     118  year={1992},
     119  publisher={ACM}
     120}
Note: See TracChangeset for help on using the changeset viewer.