Changeset 3614 for proto/RE


Ignore:
Timestamp:
Jan 19, 2014, 8:49:23 AM (5 years ago)
Author:
cameron
Message:

Add regexp parser; move compiler into REcompile

Location:
proto/RE/Haskell
Files:
2 added
1 edited

Legend:

Unmodified
Added
Removed
  • proto/RE/Haskell/CanonicalRE.hs

    r3606 r3614  
    1 -- Module Canonical_RE is a research compiler for prototyping
    2 -- algorithms and ideas in regular expression processing using
    3 -- parallel bit streams and MatchStar.
    4 -- It defines a canonical representation for regular expressions
     1-- Module CanonicalRE defines a canonical representation for regular expressions
    52-- that uses a small number of alternative forms (e.g., combining
    63-- all forms of repetition (Kleene star, Kleene plus, ?, {m,n}) into
     
    96-- Robert D. Cameron, 2013
    107
    11 module CanonicalRE (RE(..), RepLimit(..), compile) where
    12        
    13 import Data.Char
    14 import RunPablo
     8module CanonicalRE (RE(..), RepLimit(..)) where
    159
    1610-- RE is the data type for regular expressions
     
    3529-- Alt [] represents the empty set, i.e., matching nothing.
    3630
    37 --
    38 -- The Symbol Environment keeps track of the symbols generated
    39 -- during compilation, so that new unique symbols can be
    40 -- generated (gensym) as needed.   All symbols are formed from
    41 -- a string prefix and a numeric suffix.
    42 --
    43 -- For example, if the symbols "marker0", "marker1", "condition0"
    44 -- have been generated and no others have, the environment is
    45 -- [("marker", 1), ("condition", 0)]
    46 --
    47 type Env = [(String, Int)]
    4831
    49 gensym :: (Env, String) -> (Env, String)
    50 gensym ([], s) = ([(s, 0)], s ++ show 0)
    51 gensym ((prefix1, count1):env, s)
    52   | prefix1 == s = ((s, count1+1):env, s ++ show(count1+1))
    53   | otherwise    = ((prefix1, count1):newenv, newsym)
    54   where (newenv, newsym) = gensym(env, s)
    55 
    56 
    57 --
    58 -- To keep track of the current context for code generation during
    59 -- compilation, we define a CodeGenState that represents the output
    60 -- at each compilation step.   This state consists of an environment
    61 -- defining the symbols used so far, the sequence of generated PabloS
    62 -- statements generated so far and the name of the variable that
    63 -- holds the match result for the overall expression so far.
    64 --
    65 type CodeGenState = (Env, [PabloS], String)
    66 
    67 --
    68 -- In compiling to regular expression form, we use a helper function
    69 -- that takes a regular expression and a code generation state, and
    70 -- returns the code generation state.   The main compile routine
    71 -- initializes the process by setting an initial marker stream
    72 -- of all ones.
    73 --
    74 --
    75 compile :: RE -> CodeGenState
    76 re2pablo_helper :: (RE, CodeGenState) -> CodeGenState
    77 
    78 -- We also define helper functions for the working through
    79 -- the RE lists in Seq and Alt structures, as well as the
    80 -- iteration required for Rep structures.
    81 seq_helper :: ([RE], CodeGenState) -> CodeGenState
    82 alt_helper :: ([RE], CodeGenState) -> CodeGenState
    83 rep_helper :: (RE, Int, RepLimit, CodeGenState) -> CodeGenState
    84 
    85 compile(re) = re2pablo_helper(re, (env, [Assign(marker, All(1))], marker))
    86   where
    87     (env, marker) = gensym([], "start_marker")
    88 
    89 -- To match a character class, use bitwise and of the marker and
    90 -- character class, then advance 1. 
    91 
    92 re2pablo_helper(CC(c), (env, stmts, last_marker)) = (newenv, stmts ++ [new_stmt], newsym)
    93    where
    94      (newenv, newsym) = gensym(env, "marker")
    95      new_stmt = Assign (newsym, Advance(And(Var(last_marker), CharClass(c))))
    96 
    97 -- To match "^" we must be at the start of line, i.e., one past
    98 -- a newline character or at the beginning of file.  We use the
    99 -- trick of advancing the negated newline stream to determine these
    100 -- positions.
    101 --
    102 re2pablo_helper(Start, (env, stmts, last_marker)) = (newenv, stmts ++ [new_stmt], newsym)
    103    where
    104      (newenv, newsym) = gensym(env, "start_of_line_marker")
    105      new_stmt = Assign (newsym, And(Var(last_marker), Not(Advance(Not(CharClass("\n"))))))
    106 
    107 -- To match "$" we must have reached end of line.
    108 --
    109 re2pablo_helper(End, (env, stmts, last_marker)) = (newenv, stmts ++ [new_stmt], newsym)
    110    where
    111      (newenv, newsym) = gensym(env, "end_of_line_marker")
    112      new_stmt = Assign (newsym, And(Var(last_marker), CharClass("\n")))
    113 
    114 -- For the structured types (Seq, Alt, Rep), just call the specific helper.
    115 --
    116 re2pablo_helper(Seq rs, cg_state) = seq_helper(rs, cg_state)
    117 re2pablo_helper(Alt rs, cg_state) = alt_helper(rs, cg_state)
    118 re2pablo_helper(Rep (r, lb, ub), cg_state) = rep_helper(r, lb, ub, cg_state)
    119 
    120 
    121 -- Seq [] is the empty regexp which matches the empty string. 
    122 -- We just leave the current marker unchanged, with no new statements
    123 -- needed to compute it.
    124 --
    125 seq_helper([], cg_state) = cg_state
    126 -- Seq (r1:rs): generate code to match r1, the first subexpression,
    127 -- then to match the rest, concatenating the statements.
    128 seq_helper((r1:rs), cg_state) = seq_helper(rs, re2pablo_helper(r1, cg_state))
    129 
    130 -- Alt[r] has a single alternative r to match, just match
    131 -- it with no other possibility.
    132 alt_helper([r], cg_state) = re2pablo_helper(r, cg_state)
    133 
    134 -- For completeness, we define Alt[] as the regular expression that
    135 -- always fails (since no alternatives will match).
    136 alt_helper([], (env, stmts, last_marker)) = (newenv, stmts ++ [new_stmt], newsym)
    137    where
    138      (newenv, newsym) = gensym(env, "always_fail_marker")
    139      new_stmt = Assign (newsym, All(0))
    140 
    141 -- Alt (r1:rs): we have r1 and some other alternatives, the
    142 -- match succeeds if either r1 matches or any of the others
    143 -- does.
    144 alt_helper(r1:rs, (env, stmts, last_marker)) = (newenv, new_stmts, newsym)
    145   where
    146    (e1, s1, alt1_marker) = re2pablo_helper(r1, (env, stmts, last_marker))
    147    (e2, s2, alt2_marker) = alt_helper(rs, (e1, s1, last_marker))
    148    (newenv, newsym) = gensym(e2, "alt_marker")
    149    new_stmts = s2 ++ [Assign (newsym, Or(Var(alt1_marker), Var(alt2_marker)))]
    150 
    151 -- For Kleene Star character class repetition, use MatchStar
    152 rep_helper(CC c, 0, Unbounded, (env, stmts, last_marker)) = (newenv, stmts ++ [new_stmt], newsym)
    153    where
    154      (newenv, newsym) = gensym(env, "marker")
    155      new_stmt = Assign (newsym, MatchStar(Var(last_marker), CharClass(c)))
    156 
    157 -- For general Kleene Star, we need a while loop.
    158 rep_helper(r, 0, Unbounded, (env, stmts, last_marker)) = (newenv, stmts ++ while_init ++ [loop_stmt], while_accum)
    159   where
    160     (e1, while_test) = gensym(env, "while_test")
    161     (e2, while_accum) = gensym(e1, "while_accum")
    162     (newenv, loop_body_stmts, loop_result) = re2pablo_helper(r, (e2, [], while_test))
    163     while_init = [Assign(while_test, Var(last_marker)), Assign(while_accum, Var(last_marker))]
    164     repeat = [Assign(while_test, And(Var(loop_result), Not(Var(while_accum)))),
    165               Assign(while_accum, Or(Var(while_accum), Var(loop_result)))]
    166     loop_stmt = While(Var(while_test), loop_body_stmts ++ repeat)
    167 
    168 rep_helper(r, lb, Unbounded, cg_state) = rep_helper(r, lb-1, Unbounded, cg1_state)
    169   where
    170    cg1_state = re2pablo_helper(r, cg_state)
    171 
    172 -- Now Bounded Repetition: use multiple copies
    173    
    174 rep_helper(r, 0, UpperBound 0, cg_state) = cg_state
    175 
    176 rep_helper(r, 0, UpperBound ub, (env, stmts, last_marker)) = (newenv, new_stmts, newsym)
    177   where
    178    (e1, s1, rep1_marker) = re2pablo_helper(r, (env, stmts, last_marker))
    179    (e2, s2, rep2plus_marker) = rep_helper(r, 0, UpperBound(ub-1), (e1, s1, rep1_marker))
    180    (newenv, newsym) = gensym(e2, "alt_marker")
    181    new_stmts = s2 ++ [Assign (newsym, Or(Var(last_marker), Var(rep2plus_marker)))]
    182 
    183 rep_helper(r, lb, UpperBound ub, cg_state) = rep_helper(r, lb-1, UpperBound(ub-1), cg1_state)
    184   where
    185    cg1_state = re2pablo_helper(r, cg_state)
    186 
    187 ---------------
    188 
    189 searchRE :: (RE, String) -> PabloVal
    190 searchRE(regexp, str) = evalPabloS(compiled, marker, str)
    191   where (e, compiled, marker) = compile(regexp)
    192        
Note: See TracChangeset for help on using the changeset viewer.