Changeset 3596
 Timestamp:
 Dec 26, 2013, 12:19:10 PM (6 years ago)
 File:

 1 edited
Legend:
 Unmodified
 Added
 Removed

proto/RE/doc/CanonicalRE.hs
r3595 r3596 12 12 import Data.Char 13 13 14 data RepLimit = UpperBound Int  Un Bounded deriving Show14 data RepLimit = UpperBound Int  Unbounded deriving Show 15 15 upper_bound_dec :: RepLimit > RepLimit 16 16 upper_bound_dec (UpperBound j) = UpperBound (j  1) 17 upper_bound_dec Un Bounded = UnBounded17 upper_bound_dec Unbounded = Unbounded 18 18 19 19 data RE = CC String  Seq [RE]  Alt [RE]  Rep RE Int RepLimit  … … 21 21  Rep r i j represents UpperBound repetition, with lower bound i and 22 22  upper bound j. We use the convention that any negative value of 23  j represents Un Bounded repetition.23  j represents Unbounded repetition. 24 24 25 25 canonicalize :: Proto.RE > RE … … 32 32 canonicalize (Proto.Join rs) = Seq (map canonicalize rs) 33 33 canonicalize (Proto.Alt rs) = Alt (map canonicalize rs) 34 canonicalize (Proto.Opt r) = Rep (canonicalize r) 0 (UpperBound 1) 35 canonicalize (Proto.Kstar r) = Rep (canonicalize r) 0 Un Bounded36 canonicalize (Proto.Kplus r) = Rep (canonicalize r) 1 Un Bounded37 canonicalize (Proto.OrMore i r) = Rep (canonicalize r) i Un Bounded34 canonicalize (Proto.Opt r) = Rep (canonicalize r) 0 (UpperBound 1) 35 canonicalize (Proto.Kstar r) = Rep (canonicalize r) 0 Unbounded 36 canonicalize (Proto.Kplus r) = Rep (canonicalize r) 1 Unbounded 37 canonicalize (Proto.OrMore i r) = Rep (canonicalize r) i Unbounded 38 38 canonicalize (Proto.Bounded i j r) = Rep (canonicalize r) i (UpperBound j) 39 39 … … 77 77  Regular Expression Compiler 78 78 79 data PabloE = Marker(Int)  And(PabloE, PabloE)  Or(PabloE, PabloE)  Not(PabloE)79 data PabloE = All(Int)  Var(String)  And(PabloE, PabloE)  Or(PabloE, PabloE)  Not(PabloE) 80 80  CharClass(String)  Advance(PabloE)  MatchStar(PabloE, PabloE) 81 81 deriving Show 82 82 83 data PabloS = SetMarker(Int, PabloE)  ReturnMarker(Int) 84  If (PabloE, [PabloS]) While (PabloE, [PabloS]) 83 data PabloS = Assign(String, PabloE)  If (PabloE, [PabloS]) While (PabloE, [PabloS]) 85 84 deriving Show 86 85 87 compile :: RE > [PabloS] 88 89 compile(e) = stmts ++ [ReturnMarker m] 90 where (m, stmts) = re2pablo_helper(e, 0) 91 92 re2pablo_helper :: (RE, Int) > (Int, [PabloS]) 86 87  88  The Symbol Environment keeps track of the symbols generated 89  during compilation, so that new unique symbols can be 90  generated (gensym) as needed. All symbols are formed from 91  a string prefix and a numeric suffix. 92  93  For example, if the symbols "marker0", "marker1", "condition0" 94  have been generated and no others have, the environment is 95  [("marker", 1), ("condition", 0)] 96  97 type Env = [(String, Int)] 98 99 gensym :: (Env, String) > (Env, String) 100 gensym ([], s) = ([(s, 0)], s ++ show 0) 101 gensym ((prefix1, count1):env, s) 102  prefix1 == s = ((s, count1+1):env, s ++ show(count1+1)) 103  otherwise = ((prefix1, count1):newenv, newsym) 104 where (newenv, newsym) = gensym(env, s) 105 106 107  108  To keep track of the current context for code generation during 109  compilation, we define a CodeGenState that represents the output 110  at each compilation step. This state consists of an environment 111  defining the symbols used so far, the sequence of generated PabloS 112  statements generated so far and the name of the variable that 113  holds the match result for the overall expression so far. 114  115 type CodeGenState = (Env, [PabloS], String) 116 117  118  In compiling to regular expression form, we use a helper function 119  that takes a regular expression and a code generation state, and 120  returns the code generation state. The main compile routine 121  initializes the process by setting an initial marker stream 122  of all ones. 123  124  125 compile :: RE > CodeGenState 126 re2pablo_helper :: (RE, CodeGenState) > CodeGenState 127 128 129 compile(re) = re2pablo_helper(re, (env, [Assign(marker, All(1))], marker)) 130 where 131 (env, marker) = gensym([], "start_marker") 93 132 94 133  To match a character class, use bitwise and of the marker and 95  character class, then advance 1. The output marker variable 96  is the next one in sequence. 97 98 re2pablo_helper(CC(c), m) = 99 (m + 1, [SetMarker (m + 1, Advance(And(Marker(m), CharClass(c))))]) 134  character class, then advance 1. 135 136 re2pablo_helper(CC(c), (env, stmts, last_marker)) = (newenv, stmts ++ [new_stmt], newsym) 137 where 138 (newenv, newsym) = gensym(env, "marker") 139 new_stmt = Assign (newsym, Advance(And(Var(last_marker), CharClass(c)))) 100 140 101 141  To match "^" we must be at the start of line, i.e., one past … … 104 144  positions. 105 145  106 re2pablo_helper(Start, m) = 107 (m+1, [SetMarker(m+1, And(Marker m, Not(Advance(Not(CharClass("\n"))))))]) 146 re2pablo_helper(Start, (env, stmts, last_marker)) = (newenv, stmts ++ [new_stmt], newsym) 147 where 148 (newenv, newsym) = gensym(env, "start_of_line_marker") 149 new_stmt = Assign (newsym, And(Var(last_marker), Not(Advance(Not(CharClass("\n")))))) 108 150 109 151  To match "$" we must have reached end of line. 110 152  111 re2pablo_helper(End, m) = 112 (m+1, [SetMarker(m+1, And(Marker m, CharClass("\n")))]) 153 re2pablo_helper(End, (env, stmts, last_marker)) = (newenv, stmts ++ [new_stmt], newsym) 154 where 155 (newenv, newsym) = gensym(env, "end_of_line_marker") 156 new_stmt = Assign (newsym, And(Var(last_marker), CharClass("\n"))) 113 157 114 158  Seq [] is the empty regexp which matches the empty string. 115  We just return the current marker unchanged, with nostatements159  We just leave the current marker unchanged, with no new statements 116 160  needed to compute it. 117 161  118 re2pablo_helper(Seq [], m) = (m, []) 119 120  Seq (r1:rs): match r1, the first subexpression, then match 121  the rest, concatenating the statements. 122 re2pablo_helper(Seq (r1:rs), m) = (m2, s1 ++ s2) 123 where 124 (m1, s1) = re2pablo_helper(r1, m) 125 (m2, s2) = re2pablo_helper(Seq rs, m1) 126 162 re2pablo_helper(Seq [], cg_state) = cg_state 163 164  Seq (r1:rs): generate code to match r1, the first subexpression, 165  then to match the rest, concatenating the statements. 166 re2pablo_helper(Seq (r1:rs), cg_state) = re2pablo_helper(Seq rs, re2pablo_helper(r1, cg_state)) 167 127 168  Alt[r] has a single alternative r to match, just match 128 169  it with no other possibility. 129 re2pablo_helper(Alt [r], m) = re2pablo_helper(r, m) 170 re2pablo_helper(Alt [r], cg_state) = re2pablo_helper(r, cg_state) 171 172  For completeness, we define Alt[] as the regular expression that 173  always fails (since no alternatives will match). 174  it with no other possibility. 175 re2pablo_helper(Alt [], (env, stmts, last_marker)) = (newenv, stmts ++ [new_stmt], newsym) 176 where 177 (newenv, newsym) = gensym(env, "always_fail_marker") 178 new_stmt = Assign (newsym, All(0)) 130 179 131 180  Alt (r1:rs): we have r1 and some other alternatives, the 132 181  match succeeds if either r1 matches or any of the others 133 182  does. 134 135 re2pablo_helper(Alt (r1:rs), m) = (m2 + 1, save ++ s1 ++ s2 ++ final) 136 where 137 (m1, s1) = re2pablo_helper(r1, m) 138 (m2, s2) = re2pablo_helper(Alt rs, m1+1) 139 save = [SetMarker(m1 + 1, Marker(m))] 140 final = [SetMarker(m2 + 1, Or(Marker(m1), Marker(m2)))] 141 142 re2pablo_helper(Rep (CC c) 0 UnBounded, m) = 143 (m + 1, [SetMarker(m + 1, MatchStar(Marker(m), CharClass(c)))]) 144 145 re2pablo_helper(Rep r 0 UnBounded, m) = 146 (m1+1, save ++ [While (Marker(m+1), s1 ++ loopfinal)]) 147 where 148 (m1, s1) = re2pablo_helper(r, m+1) 149 save = [SetMarker(m+1, Marker(m)), SetMarker(m1+1, Marker(m))] 150 loopfinal = [SetMarker(m+1, And(Marker(m1), Not(Marker(m1+1)))), SetMarker(m1+1, Or(Marker(m1+1), Marker(m1)))] 183 re2pablo_helper(Alt (r1:rs), (env, stmts, last_marker)) = (newenv, new_stmts, newsym) 184 where 185 (e1, s1, alt1_marker) = re2pablo_helper(r1, (env, stmts, last_marker)) 186 (e2, s2, alt2_marker) = re2pablo_helper(Alt rs, (e1, s1, last_marker)) 187 (newenv, newsym) = gensym(e2, "alt_marker") 188 new_stmts = s2 ++ [Assign (newsym, Or(Var(alt1_marker), Var(alt2_marker)))] 189 190  Repetition Matching Rules 191  192  For Kleene Star character class repetition, use MatchStar 193 re2pablo_helper(Rep (CC c) 0 Unbounded, (env, stmts, last_marker)) = (newenv, stmts ++ [new_stmt], newsym) 194 where 195 (newenv, newsym) = gensym(env, "marker") 196 new_stmt = Assign (newsym, MatchStar(Var(last_marker), CharClass(c))) 197 198  For general Kleene Star, we need a while loop. 199 re2pablo_helper(Rep r 0 Unbounded, (env, stmts, last_marker)) = (newenv, stmts ++ while_init ++ [loop_stmt], while_accum) 200 where 201 (e1, while_test) = gensym(env, "while_test") 202 (e2, while_accum) = gensym(e1, "while_accum") 203 (newenv, loop_body_stmts, loop_result) = re2pablo_helper(r, (e2, [], while_test)) 204 while_init = [Assign(while_test, Var(last_marker)), Assign(while_accum, Var(last_marker))] 205 repeat = [Assign(while_test, And(Var(loop_result), Not(Var(while_accum)))), 206 Assign(while_accum, Or(Var(while_accum), Var(loop_result)))] 207 loop_stmt = While(Var(while_test), loop_body_stmts ++ repeat) 208 209  Now Bounded Repetition: use multiple copies 151 210 152 re2pablo_helper(Rep r 0 (UpperBound 0), m) = (m, []) 153 154 re2pablo_helper(Rep r 0 (UpperBound ub), m) = 155 (m2 + 1, s1 ++ s2 ++ [SetMarker (m2 + 1, Or(Marker m1, Marker m2))]) 156 where 157 (m1, s1) = re2pablo_helper(r, m) 158 (m2, s2) = re2pablo_helper(Rep r 0 (UpperBound(ub1)), m1) 159 160 re2pablo_helper(Rep r lb (UpperBound ub), m) = (m2, s1 ++ s2) 161 where 162 (m1, s1) = re2pablo_helper(r, m) 163 (m2, s2) = re2pablo_helper(Rep r (lb1) (UpperBound(ub1)), m1) 164 211 re2pablo_helper(Rep r 0 (UpperBound 0), cg_state) = cg_state 212 213 re2pablo_helper(Rep r 0 (UpperBound ub), (env, stmts, last_marker)) = (newenv, new_stmts, newsym) 214 where 215 (e1, s1, rep1_marker) = re2pablo_helper(r, (env, stmts, last_marker)) 216 (e2, s2, rep2plus_marker) = re2pablo_helper(Rep r 0 (UpperBound(ub1)), (e1, s1, rep1_marker)) 217 (newenv, newsym) = gensym(e2, "alt_marker") 218 new_stmts = s2 ++ [Assign (newsym, Or(Var(last_marker), Var(rep2plus_marker)))] 219 220 re2pablo_helper(Rep r lb (UpperBound ub), cg_state) = re2pablo_helper(Rep r (lb1) (UpperBound(ub1)), cg1_state) 221 where 222 cg1_state = re2pablo_helper(r, cg_state) 223 224
Note: See TracChangeset
for help on using the changeset viewer.