Changeset 3596


Ignore:
Timestamp:
Dec 26, 2013, 12:19:10 PM (6 years ago)
Author:
cameron
Message:

New code generator state, gensym function

File:
1 edited

Legend:

Unmodified
Added
Removed
  • proto/RE/doc/CanonicalRE.hs

    r3595 r3596  
    1212import Data.Char
    1313
    14 data RepLimit = UpperBound Int | UnBounded deriving Show
     14data RepLimit = UpperBound Int | Unbounded deriving Show
    1515upper_bound_dec :: RepLimit -> RepLimit
    1616upper_bound_dec (UpperBound j) = UpperBound (j - 1)
    17 upper_bound_dec UnBounded = UnBounded
     17upper_bound_dec Unbounded = Unbounded
    1818
    1919data RE = CC String | Seq [RE] | Alt [RE] | Rep RE Int RepLimit |
     
    2121-- Rep r i j represents UpperBound repetition, with lower bound i and
    2222-- upper bound j.   We use the convention that any negative value of
    23 -- j represents UnBounded repetition.
     23-- j represents Unbounded repetition.
    2424
    2525canonicalize :: Proto.RE -> RE
     
    3232canonicalize (Proto.Join rs) = Seq (map canonicalize rs)
    3333canonicalize (Proto.Alt rs) = Alt (map canonicalize rs)
    34 canonicalize (Proto.Opt r) = Rep (canonicalize r) 0 (UpperBound 1)
    35 canonicalize (Proto.Kstar r) = Rep (canonicalize r) 0 UnBounded
    36 canonicalize (Proto.Kplus r) = Rep (canonicalize r) 1 UnBounded
    37 canonicalize (Proto.OrMore i r) = Rep (canonicalize r) i UnBounded
     34canonicalize (Proto.Opt r) = Rep (canonicalize r) 0 (UpperBound 1) 
     35canonicalize (Proto.Kstar r) = Rep (canonicalize r) 0 Unbounded
     36canonicalize (Proto.Kplus r) = Rep (canonicalize r) 1 Unbounded
     37canonicalize (Proto.OrMore i r) = Rep (canonicalize r) i Unbounded
    3838canonicalize (Proto.Bounded i j r) = Rep (canonicalize r) i (UpperBound j)
    3939
     
    7777-- Regular Expression Compiler
    7878
    79 data PabloE = Marker(Int) | And(PabloE, PabloE) | Or(PabloE, PabloE) | Not(PabloE)
     79data PabloE = All(Int) | Var(String) | And(PabloE, PabloE) | Or(PabloE, PabloE) | Not(PabloE)
    8080              | CharClass(String) | Advance(PabloE) | MatchStar(PabloE, PabloE)
    8181   deriving Show
    8282
    83 data PabloS = SetMarker(Int, PabloE) | ReturnMarker(Int)
    84               | If (PabloE, [PabloS])| While (PabloE, [PabloS])
     83data PabloS = Assign(String, PabloE) |  If (PabloE, [PabloS])| While (PabloE, [PabloS])
    8584   deriving Show
    8685
    87 compile :: RE -> [PabloS]
    88 
    89 compile(e) = stmts ++ [ReturnMarker m]
    90   where (m, stmts) = re2pablo_helper(e, 0)
    91 
    92 re2pablo_helper :: (RE, Int) -> (Int, [PabloS])
     86
     87--
     88-- The Symbol Environment keeps track of the symbols generated
     89-- during compilation, so that new unique symbols can be
     90-- generated (gensym) as needed.   All symbols are formed from
     91-- a string prefix and a numeric suffix.
     92--
     93-- For example, if the symbols "marker0", "marker1", "condition0"
     94-- have been generated and no others have, the environment is
     95-- [("marker", 1), ("condition", 0)]
     96--
     97type Env = [(String, Int)]
     98
     99gensym :: (Env, String) -> (Env, String)
     100gensym ([], s) = ([(s, 0)], s ++ show 0)
     101gensym ((prefix1, count1):env, s)
     102  | prefix1 == s = ((s, count1+1):env, s ++ show(count1+1))
     103  | otherwise    = ((prefix1, count1):newenv, newsym)
     104  where (newenv, newsym) = gensym(env, s)
     105
     106
     107--
     108-- To keep track of the current context for code generation during
     109-- compilation, we define a CodeGenState that represents the output
     110-- at each compilation step.   This state consists of an environment
     111-- defining the symbols used so far, the sequence of generated PabloS
     112-- statements generated so far and the name of the variable that
     113-- holds the match result for the overall expression so far.
     114--
     115type CodeGenState = (Env, [PabloS], String)
     116
     117--
     118-- In compiling to regular expression form, we use a helper function
     119-- that takes a regular expression and a code generation state, and
     120-- returns the code generation state.   The main compile routine
     121-- initializes the process by setting an initial marker stream
     122-- of all ones.
     123--
     124--
     125compile :: RE -> CodeGenState
     126re2pablo_helper :: (RE, CodeGenState) -> CodeGenState
     127
     128
     129compile(re) = re2pablo_helper(re, (env, [Assign(marker, All(1))], marker))
     130  where
     131    (env, marker) = gensym([], "start_marker")
    93132
    94133-- To match a character class, use bitwise and of the marker and
    95 -- character class, then advance 1.  The output marker variable
    96 -- is the next one in sequence.
    97 
    98 re2pablo_helper(CC(c), m) =
    99    (m + 1, [SetMarker (m + 1, Advance(And(Marker(m), CharClass(c))))])
     134-- character class, then advance 1. 
     135
     136re2pablo_helper(CC(c), (env, stmts, last_marker)) = (newenv, stmts ++ [new_stmt], newsym)
     137   where
     138     (newenv, newsym) = gensym(env, "marker")
     139     new_stmt = Assign (newsym, Advance(And(Var(last_marker), CharClass(c))))
    100140
    101141-- To match "^" we must be at the start of line, i.e., one past
     
    104144-- positions.
    105145--
    106 re2pablo_helper(Start, m) =
    107   (m+1, [SetMarker(m+1, And(Marker m, Not(Advance(Not(CharClass("\n"))))))])
     146re2pablo_helper(Start, (env, stmts, last_marker)) = (newenv, stmts ++ [new_stmt], newsym)
     147   where
     148     (newenv, newsym) = gensym(env, "start_of_line_marker")
     149     new_stmt = Assign (newsym, And(Var(last_marker), Not(Advance(Not(CharClass("\n"))))))
    108150
    109151-- To match "$" we must have reached end of line.
    110152--
    111 re2pablo_helper(End, m) =
    112   (m+1, [SetMarker(m+1, And(Marker m, CharClass("\n")))])
     153re2pablo_helper(End, (env, stmts, last_marker)) = (newenv, stmts ++ [new_stmt], newsym)
     154   where
     155     (newenv, newsym) = gensym(env, "end_of_line_marker")
     156     new_stmt = Assign (newsym, And(Var(last_marker), CharClass("\n")))
    113157
    114158-- Seq [] is the empty regexp which matches the empty string. 
    115 -- We just return the current marker unchanged, with no statements
     159-- We just leave the current marker unchanged, with no new statements
    116160-- needed to compute it.
    117161--
    118 re2pablo_helper(Seq [], m) = (m, [])
    119 
    120 -- Seq (r1:rs): match r1, the first subexpression, then match
    121 -- the rest, concatenating the statements.
    122 re2pablo_helper(Seq (r1:rs), m) = (m2, s1 ++ s2)
    123   where
    124    (m1, s1) = re2pablo_helper(r1, m)
    125    (m2, s2) = re2pablo_helper(Seq rs, m1)
    126 
     162re2pablo_helper(Seq [], cg_state) = cg_state
     163
     164-- Seq (r1:rs): generate code to match r1, the first subexpression,
     165-- then to match the rest, concatenating the statements.
     166re2pablo_helper(Seq (r1:rs), cg_state) = re2pablo_helper(Seq rs, re2pablo_helper(r1, cg_state))
     167 
    127168-- Alt[r] has a single alternative r to match, just match
    128169-- it with no other possibility.
    129 re2pablo_helper(Alt [r], m) = re2pablo_helper(r, m)
     170re2pablo_helper(Alt [r], cg_state) = re2pablo_helper(r, cg_state)
     171
     172-- For completeness, we define Alt[] as the regular expression that
     173-- always fails (since no alternatives will match).
     174-- it with no other possibility.
     175re2pablo_helper(Alt [], (env, stmts, last_marker)) = (newenv, stmts ++ [new_stmt], newsym)
     176   where
     177     (newenv, newsym) = gensym(env, "always_fail_marker")
     178     new_stmt = Assign (newsym, All(0))
    130179
    131180-- Alt (r1:rs): we have r1 and some other alternatives, the
    132181-- match succeeds if either r1 matches or any of the others
    133182-- does.
    134 
    135 re2pablo_helper(Alt (r1:rs), m) = (m2 + 1, save ++ s1 ++ s2 ++ final)
    136   where
    137    (m1, s1) = re2pablo_helper(r1, m)
    138    (m2, s2) = re2pablo_helper(Alt rs, m1+1)
    139    save = [SetMarker(m1 + 1, Marker(m))]
    140    final = [SetMarker(m2 + 1, Or(Marker(m1), Marker(m2)))]
    141 
    142 re2pablo_helper(Rep (CC c) 0 UnBounded, m) =
    143    (m + 1, [SetMarker(m + 1, MatchStar(Marker(m), CharClass(c)))])
    144 
    145 re2pablo_helper(Rep r 0 UnBounded, m) =
    146   (m1+1, save ++ [While (Marker(m+1), s1 ++ loopfinal)])
    147   where
    148    (m1, s1) = re2pablo_helper(r, m+1)
    149    save = [SetMarker(m+1, Marker(m)), SetMarker(m1+1, Marker(m))]
    150    loopfinal = [SetMarker(m+1, And(Marker(m1), Not(Marker(m1+1)))), SetMarker(m1+1, Or(Marker(m1+1), Marker(m1)))]
     183re2pablo_helper(Alt (r1:rs), (env, stmts, last_marker)) = (newenv, new_stmts, newsym)
     184  where
     185   (e1, s1, alt1_marker) = re2pablo_helper(r1, (env, stmts, last_marker))
     186   (e2, s2, alt2_marker) = re2pablo_helper(Alt rs, (e1, s1, last_marker))
     187   (newenv, newsym) = gensym(e2, "alt_marker")
     188   new_stmts = s2 ++ [Assign (newsym, Or(Var(alt1_marker), Var(alt2_marker)))]
     189
     190-- Repetition Matching Rules
     191--
     192-- For Kleene Star character class repetition, use MatchStar
     193re2pablo_helper(Rep (CC c) 0 Unbounded, (env, stmts, last_marker)) = (newenv, stmts ++ [new_stmt], newsym)
     194   where
     195     (newenv, newsym) = gensym(env, "marker")
     196     new_stmt = Assign (newsym, MatchStar(Var(last_marker), CharClass(c)))
     197
     198-- For general Kleene Star, we need a while loop.
     199re2pablo_helper(Rep r 0 Unbounded, (env, stmts, last_marker)) = (newenv, stmts ++ while_init ++ [loop_stmt], while_accum)
     200  where
     201    (e1, while_test) = gensym(env, "while_test")
     202    (e2, while_accum) = gensym(e1, "while_accum")
     203    (newenv, loop_body_stmts, loop_result) = re2pablo_helper(r, (e2, [], while_test))
     204    while_init = [Assign(while_test, Var(last_marker)), Assign(while_accum, Var(last_marker))]
     205    repeat = [Assign(while_test, And(Var(loop_result), Not(Var(while_accum)))),
     206              Assign(while_accum, Or(Var(while_accum), Var(loop_result)))]
     207    loop_stmt = While(Var(while_test), loop_body_stmts ++ repeat)
     208
     209-- Now Bounded Repetition: use multiple copies
    151210   
    152 re2pablo_helper(Rep r 0 (UpperBound 0), m) = (m, [])
    153 
    154 re2pablo_helper(Rep r 0 (UpperBound ub), m) =
    155   (m2 + 1, s1 ++ s2 ++ [SetMarker (m2 + 1, Or(Marker m1, Marker m2))])
    156   where
    157    (m1, s1) = re2pablo_helper(r, m)
    158    (m2, s2) = re2pablo_helper(Rep r 0 (UpperBound(ub-1)), m1)
    159 
    160 re2pablo_helper(Rep r lb (UpperBound ub), m) =  (m2, s1 ++ s2)
    161   where
    162    (m1, s1) = re2pablo_helper(r, m)
    163    (m2, s2) = re2pablo_helper(Rep r (lb-1) (UpperBound(ub-1)), m1)
    164 
     211re2pablo_helper(Rep r 0 (UpperBound 0), cg_state) = cg_state
     212
     213re2pablo_helper(Rep r 0 (UpperBound ub), (env, stmts, last_marker)) = (newenv, new_stmts, newsym)
     214  where
     215   (e1, s1, rep1_marker) = re2pablo_helper(r, (env, stmts, last_marker))
     216   (e2, s2, rep2plus_marker) = re2pablo_helper(Rep r 0 (UpperBound(ub-1)), (e1, s1, rep1_marker))
     217   (newenv, newsym) = gensym(e2, "alt_marker")
     218   new_stmts = s2 ++ [Assign (newsym, Or(Var(last_marker), Var(rep2plus_marker)))]
     219
     220re2pablo_helper(Rep r lb (UpperBound ub), cg_state) = re2pablo_helper(Rep r (lb-1) (UpperBound(ub-1)), cg1_state)
     221  where
     222   cg1_state = re2pablo_helper(r, cg_state)
     223
     224
Note: See TracChangeset for help on using the changeset viewer.