source: proto/RE/doc/CanonicalRE.hs @ 3598

Last change on this file since 3598 was 3598, checked in by cameron, 5 years ago

Minor updates

File size: 9.4 KB
Line 
1-- Module Canonical_RE is a canonical representation for regular
2-- expressions that uses a minimum number of alternative forms,
3-- for example, in comparison with module RE_Proto.
4-- Use of this canonical representation should simplify transformation and
5-- analysis algorithms. 
6-- Compare for example, RE_proto.minMatchLen with CanonicalRE.minMatchLen
7
8
9module CanonicalRE (RE(..), canonicalize, minMatchLen, removeNullablePrefix) where
10       
11import qualified RE_Proto as Proto
12import Data.Char
13
14data RepLimit = UpperBound Int | Unbounded deriving Show
15upper_bound_dec :: RepLimit -> RepLimit
16upper_bound_dec (UpperBound j) = UpperBound (j - 1)
17upper_bound_dec Unbounded = Unbounded
18
19data RE = CC String | Seq [RE] | Alt [RE] | Rep RE Int RepLimit |
20          Start | End  -- lookbehind/lookahead assertions for newline
21          deriving Show
22-- Rep r i j represents UpperBound repetition, with lower bound i and
23-- upper bound j.   We use the convention that any negative value of
24-- j represents Unbounded repetition.
25
26canonicalize :: Proto.RE -> RE
27canonicalize (Proto.CC s) = CC s
28canonicalize (Proto.CCnot s) =  CC (filter (\c -> not(elem c s)) (map chr ([0..9]++[11..127])))
29canonicalize (Proto.Lit s) = Seq (map (\c -> CC [c]) s)
30canonicalize Proto.Start = Start
31canonicalize Proto.End = End
32canonicalize Proto.Any = CC (map chr ([0..9]++[11..127]))
33canonicalize (Proto.Join rs) = Seq (map canonicalize rs)
34canonicalize (Proto.Alt rs) = Alt (map canonicalize rs)
35canonicalize (Proto.Opt r) = Rep (canonicalize r) 0 (UpperBound 1) 
36canonicalize (Proto.Kstar r) = Rep (canonicalize r) 0 Unbounded
37canonicalize (Proto.Kplus r) = Rep (canonicalize r) 1 Unbounded
38canonicalize (Proto.OrMore i r) = Rep (canonicalize r) i Unbounded
39canonicalize (Proto.Bounded i j r) = Rep (canonicalize r) i (UpperBound j)
40
41
42-- Write a minimal match length function that determines the minimum length
43-- string that can be matched by an RE, considering that Start and End match
44-- single \n characterProto.
45
46minMatchLen :: RE -> Int
47minMatchLen (CC s) = 1
48minMatchLen Start = 1
49minMatchLen End = 1
50minMatchLen (Seq []) = 0
51minMatchLen (Seq (r:rs)) = minMatchLen r + (minMatchLen (Seq rs))
52minMatchLen (Alt [r]) = minMatchLen r
53minMatchLen (Alt (r:rs)) = min (minMatchLen r) (minMatchLen (Alt rs))
54minMatchLen (Rep r i j) = i * (minMatchLen r)
55
56--- removeNullablePrefix takes a regular expression and returns
57--- the version of it that has been transformed to remove any nullable
58--- prefixeProto.  A prefix p is nullable if it can possibly match the empty
59--- string, i.e., minMatchLen(p) = 0.  For example, in the regexp
60--- [a-z]*-[0-9]+ the prefix [a-z]* is nullable.
61
62removeNullablePrefix :: RE -> RE
63removeNullablePrefix (CC s) = (CC s)
64removeNullablePrefix Start = Start
65removeNullablePrefix End = End
66removeNullablePrefix (Seq []) = Seq []
67removeNullablePrefix (Seq (r:rs))
68   | minMatchLen(r) == 0  = removeNullablePrefix(Seq rs)
69   | otherwise            = Seq ((removeNullablePrefix r):rs)
70removeNullablePrefix (Alt rs) = Alt (map removeNullablePrefix rs)
71removeNullablePrefix (Rep r 0 ub) = Seq []
72removeNullablePrefix (Rep r lb ub)
73   | minMatchLen(r) == 0  = Seq []
74   | otherwise            = Seq [removeNullablePrefix(r), Rep r (lb-1) (upper_bound_dec ub)]
75
76
77--
78-- Regular Expression Compiler
79
80data PabloE = All(Int) | Var(String) | And(PabloE, PabloE) | Or(PabloE, PabloE) | Not(PabloE)
81              | CharClass(String) | Advance(PabloE) | MatchStar(PabloE, PabloE) 
82   deriving Show 
83
84data PabloS = Assign(String, PabloE) |  If (PabloE, [PabloS])| While (PabloE, [PabloS]) 
85   deriving Show
86
87
88--
89-- The Symbol Environment keeps track of the symbols generated
90-- during compilation, so that new unique symbols can be
91-- generated (gensym) as needed.   All symbols are formed from
92-- a string prefix and a numeric suffix.
93--
94-- For example, if the symbols "marker0", "marker1", "condition0"
95-- have been generated and no others have, the environment is
96-- [("marker", 1), ("condition", 0)]
97--
98type Env = [(String, Int)]
99
100gensym :: (Env, String) -> (Env, String)
101gensym ([], s) = ([(s, 0)], s ++ show 0)
102gensym ((prefix1, count1):env, s) 
103  | prefix1 == s = ((s, count1+1):env, s ++ show(count1+1))
104  | otherwise    = ((prefix1, count1):newenv, newsym)
105  where (newenv, newsym) = gensym(env, s)
106
107
108--
109-- To keep track of the current context for code generation during
110-- compilation, we define a CodeGenState that represents the output
111-- at each compilation step.   This state consists of an environment
112-- defining the symbols used so far, the sequence of generated PabloS
113-- statements generated so far and the name of the variable that
114-- holds the match result for the overall expression so far.
115--
116type CodeGenState = (Env, [PabloS], String)
117
118--
119-- In compiling to regular expression form, we use a helper function
120-- that takes a regular expression and a code generation state, and
121-- returns the code generation state.   The main compile routine
122-- initializes the process by setting an initial marker stream
123-- of all ones.
124--
125--
126compile :: RE -> CodeGenState
127re2pablo_helper :: (RE, CodeGenState) -> CodeGenState
128
129
130compile(re) = re2pablo_helper(re, (env, [Assign(marker, All(1))], marker))
131  where 
132    (env, marker) = gensym([], "start_marker")
133
134-- To match a character class, use bitwise and of the marker and
135-- character class, then advance 1. 
136
137re2pablo_helper(CC(c), (env, stmts, last_marker)) = (newenv, stmts ++ [new_stmt], newsym)
138   where 
139     (newenv, newsym) = gensym(env, "marker")
140     new_stmt = Assign (newsym, Advance(And(Var(last_marker), CharClass(c))))
141
142-- To match "^" we must be at the start of line, i.e., one past
143-- a newline character or at the beginning of file.  We use the
144-- trick of advancing the negated newline stream to determine these
145-- positions.
146--
147re2pablo_helper(Start, (env, stmts, last_marker)) = (newenv, stmts ++ [new_stmt], newsym)
148   where 
149     (newenv, newsym) = gensym(env, "start_of_line_marker")
150     new_stmt = Assign (newsym, And(Var(last_marker), Not(Advance(Not(CharClass("\n"))))))
151
152-- To match "$" we must have reached end of line.
153--
154re2pablo_helper(End, (env, stmts, last_marker)) = (newenv, stmts ++ [new_stmt], newsym)
155   where 
156     (newenv, newsym) = gensym(env, "end_of_line_marker")
157     new_stmt = Assign (newsym, And(Var(last_marker), CharClass("\n")))
158
159-- Seq [] is the empty regexp which matches the empty string. 
160-- We just leave the current marker unchanged, with no new statements
161-- needed to compute it.
162--
163re2pablo_helper(Seq [], cg_state) = cg_state
164
165-- Seq (r1:rs): generate code to match r1, the first subexpression,
166-- then to match the rest, concatenating the statements.
167re2pablo_helper(Seq (r1:rs), cg_state) = re2pablo_helper(Seq rs, re2pablo_helper(r1, cg_state))
168 
169-- Alt[r] has a single alternative r to match, just match
170-- it with no other possibility.
171re2pablo_helper(Alt [r], cg_state) = re2pablo_helper(r, cg_state)
172
173-- For completeness, we define Alt[] as the regular expression that
174-- always fails (since no alternatives will match).
175re2pablo_helper(Alt [], (env, stmts, last_marker)) = (newenv, stmts ++ [new_stmt], newsym)
176   where 
177     (newenv, newsym) = gensym(env, "always_fail_marker")
178     new_stmt = Assign (newsym, All(0))
179
180-- Alt (r1:rs): we have r1 and some other alternatives, the
181-- match succeeds if either r1 matches or any of the others
182-- does.
183re2pablo_helper(Alt (r1:rs), (env, stmts, last_marker)) = (newenv, new_stmts, newsym)
184  where 
185   (e1, s1, alt1_marker) = re2pablo_helper(r1, (env, stmts, last_marker))
186   (e2, s2, alt2_marker) = re2pablo_helper(Alt rs, (e1, s1, last_marker))
187   (newenv, newsym) = gensym(e2, "alt_marker")
188   new_stmts = s2 ++ [Assign (newsym, Or(Var(alt1_marker), Var(alt2_marker)))]
189
190-- Repetition Matching Rules
191--
192-- For Kleene Star character class repetition, use MatchStar
193re2pablo_helper(Rep (CC c) 0 Unbounded, (env, stmts, last_marker)) = (newenv, stmts ++ [new_stmt], newsym)
194   where 
195     (newenv, newsym) = gensym(env, "marker")
196     new_stmt = Assign (newsym, MatchStar(Var(last_marker), CharClass(c))) 
197
198-- For general Kleene Star, we need a while loop.
199re2pablo_helper(Rep r 0 Unbounded, (env, stmts, last_marker)) = (newenv, stmts ++ while_init ++ [loop_stmt], while_accum)
200  where
201    (e1, while_test) = gensym(env, "while_test")
202    (e2, while_accum) = gensym(e1, "while_accum")
203    (newenv, loop_body_stmts, loop_result) = re2pablo_helper(r, (e2, [], while_test))
204    while_init = [Assign(while_test, Var(last_marker)), Assign(while_accum, Var(last_marker))]
205    repeat = [Assign(while_test, And(Var(loop_result), Not(Var(while_accum)))),
206              Assign(while_accum, Or(Var(while_accum), Var(loop_result)))]
207    loop_stmt = While(Var(while_test), loop_body_stmts ++ repeat)
208
209re2pablo_helper(Rep r lb Unbounded, cg_state) = re2pablo_helper(Rep r (lb-1) Unbounded, cg1_state)
210  where 
211   cg1_state = re2pablo_helper(r, cg_state)
212
213-- Now Bounded Repetition: use multiple copies
214   
215re2pablo_helper(Rep r 0 (UpperBound 0), cg_state) = cg_state
216
217re2pablo_helper(Rep r 0 (UpperBound ub), (env, stmts, last_marker)) = (newenv, new_stmts, newsym)
218  where 
219   (e1, s1, rep1_marker) = re2pablo_helper(r, (env, stmts, last_marker))
220   (e2, s2, rep2plus_marker) = re2pablo_helper(Rep r 0 (UpperBound(ub-1)), (e1, s1, rep1_marker))
221   (newenv, newsym) = gensym(e2, "alt_marker")
222   new_stmts = s2 ++ [Assign (newsym, Or(Var(last_marker), Var(rep2plus_marker)))]
223
224re2pablo_helper(Rep r lb (UpperBound ub), cg_state) = re2pablo_helper(Rep r (lb-1) (UpperBound(ub-1)), cg1_state)
225  where 
226   cg1_state = re2pablo_helper(r, cg_state)
227
Note: See TracBrowser for help on using the repository browser.