source: proto/RE/doc/CanonicalRE.hs @ 3594

Last change on this file since 3594 was 3594, checked in by cameron, 6 years ago

Fix syntax errors

File size: 5.9 KB
Line 
1-- Module Canonical_RE is a canonical representation for regular
2-- expressions that uses a minimum number of alternative forms,
3-- for example, in comparison with module RE_Proto.
4-- Use of this canonical representation should simplify transformation and
5-- analysis algorithms. 
6-- Compare for example, RE_proto.minMatchLen with CanonicalRE.minMatchLen
7
8
9module CanonicalRE (RE(..), canonicalize, minMatchLen, removeNullablePrefix) where
10       
11import qualified RE_Proto as Proto
12import Data.Char
13
14data RepLimit = UpperBound Int | Unbounded deriving Show
15upper_bound_dec (UpperBound j) = UpperBound (j - 1)
16upper_bound_dec Unbounded = Unbounded
17
18data RE = CC String | Seq [RE] | Alt [RE] | Rep RE Int RepLimit |
19          Start | End  -- lookbehind/lookahead assertions for newline
20-- Rep r i j represents bounded repetition, with lower bound i and
21-- upper bound j.   We use the convention that any negative value of
22-- j represents unbounded repetition.
23
24canonicalize :: Proto.RE -> RE
25canonicalize (Proto.CC s) = CC s
26canonicalize (Proto.CCnot s) =  CC (filter (\c -> not(elem c s)) (map chr ([0..9]++[11..127])))
27canonicalize (Proto.Lit s) = Seq (map (\c -> CC [c]) s)
28canonicalize Proto.Start = Start
29canonicalize Proto.End = End
30canonicalize Proto.Any = CC (map chr ([0..9]++[11..127]))
31canonicalize (Proto.Join rs) = Seq (map canonicalize rs)
32canonicalize (Proto.Alt rs) = Alt (map canonicalize rs)
33canonicalize (Proto.Opt r) = Rep (canonicalize r) 0 (UpperBound 1)
34canonicalize (Proto.Kstar r) = Rep (canonicalize r) 0 Unbounded
35canonicalize (Proto.Kplus r) = Rep (canonicalize r) 1 Unbounded
36canonicalize (Proto.OrMore i r) = Rep (canonicalize r) i Unbounded
37canonicalize (Proto.Bounded i j r) = Rep (canonicalize r) i (UpperBound j)
38
39
40-- Write a minimal match length function that determines the minimum length
41-- string that can be matched by an RE, considering that Start and End match
42-- single \n characterProto.
43
44minMatchLen :: RE -> Int
45minMatchLen (CC s) = 1
46minMatchLen Start = 1
47minMatchLen End = 1
48minMatchLen (Seq []) = 0
49minMatchLen (Seq (r:rs)) = minMatchLen r + (minMatchLen (Seq rs))
50minMatchLen (Alt [r]) = minMatchLen r
51minMatchLen (Alt (r:rs)) = min (minMatchLen r) (minMatchLen (Alt rs))
52minMatchLen (Rep r i j) = i * (minMatchLen r)
53
54--- removeNullablePrefix takes a regular expression and returns
55--- the version of it that has been transformed to remove any nullable
56--- prefixeProto.  A prefix p is nullable if it can possibly match the empty
57--- string, i.e., minMatchLen(p) = 0.  For example, in the regexp
58--- [a-z]*-[0-9]+ the prefix [a-z]* is nullable.
59
60removeNullablePrefix :: RE -> RE
61removeNullablePrefix (CC s) = (CC s)
62removeNullablePrefix Start = Start
63removeNullablePrefix End = End
64removeNullablePrefix (Seq []) = Seq []
65removeNullablePrefix (Seq (r:rs))
66   | minMatchLen(r) == 0  = removeNullablePrefix(Seq rs)
67   | otherwise            = Seq ((removeNullablePrefix r):rs)
68removeNullablePrefix (Alt rs) = Alt (map removeNullablePrefix rs)
69removeNullablePrefix (Rep r 0 ub) = Seq []
70removeNullablePrefix (Rep r lb ub)
71   | minMatchLen(r) == 0  = Seq []
72   | otherwise            = Seq [removeNullablePrefix(r), Rep r (lb-1) (upper_bound_dec ub)]
73
74
75--
76-- Regular Expression Compiler
77
78data PabloE = Marker(Int) | And(PabloE, PabloE) | Or(PabloE, PabloE) | Not(PabloE)
79              | CharClass(String) | Advance(PabloE) | MatchStar(PabloE, PabloE) 
80   deriving Show 
81
82data PabloS = SetMarker(Int, PabloE) | ReturnMarker(Int)
83              | If (PabloE, [PabloS])| While (PabloE, [PabloS]) 
84   deriving Show
85
86compile :: RE -> [PabloS] 
87
88compile(e) = stmts ++ [ReturnMarker m]
89  where (m, stmts) = re2pablo_helper(e, 0)
90
91re2pablo_helper :: (RE, Int) -> (Int, [PabloS])
92
93-- To match a character class, use bitwise and of the marker and
94-- character class, then advance 1.  The output marker variable
95-- is the next one in sequence.
96
97mre2pablo_helper(CC(c), m) = 
98   (m + 1, [SetMarker (m + 1, Advance(And(Marker(m), CharClass(c))))])
99
100-- To match "^" we must be at the start of line, i.e., one past
101-- a newline character or at the beginning of file.  We use the
102-- trick of advancing the negated newline stream to determine these
103-- positions.
104--
105re2pablo_helper(Start, m) =
106  (m+1, [SetMarker(m+1, And(Marker m, Not(Advance(Not(CharClass("\n"))))))])
107
108-- To match "$" we must have reached end of line.
109--
110re2pablo_helper(End, m) = 
111  (m+1, [SetMarker(m+1, And(Marker m, CharClass("\n")))])
112
113-- Seq [] is the empty regexp which matches the empty string. 
114-- We just return the current marker unchanged, with no statements
115-- needed to compute it.
116--
117re2pablo_helper(Seq [], m) = (m, [])
118
119-- Seq (r1:rs): match r1, the first subexpression, then match
120-- the rest, concatenating the statements.
121re2pablo_helper(Seq (r1:rs), m) = (m2, s1 ++ s2)
122  where 
123   (m1, s1) = mre2pablo_helper(r1, m)
124   (m2, s2) = mre2pablo_helper(Seq rs, m1)
125
126-- Alt[r] has a single alternative r to match, just match
127-- it with no other possibility.
128re2pablo_helper(Alt [r], m) = re2pablo_helper(r, m)
129
130-- Alt (r1:rs): we have r1 and some other alternatives, the
131-- match succeeds if either r1 matches or any of the others
132-- does.
133
134re2pablo_helper(Alt (r1:rs), m) = (m2 + 1, s1 ++ s2 ++ [SetMarker (m2 + 1, Or(Marker m1, Marker m2))])
135  where 
136   (m1, s1) = mre2pablo_helper(r1, m)
137   (m2, s2) = mre2pablo_helper(Alt rs, m1)
138
139re2pablo_helper(Rep (CC c) 0 Unbounded, m) = 
140   (m + 1, [SetMarker(m + 1, MatchStar(Marker(m), CharClass(c)))]) 
141
142re2pablo_helper(Rep r 0 ub, m)
143  | ub < 0 = (m1, [SetMarker(m + 1, Marker m), While (Marker (m+1), s1 ++ [SetMarker(m + 1, And(Marker m1 ,Not(Marker (m+1))))] )])
144  | otherwise = (m + 1, s1 ++ s2 ++ [SetMarker (m + 1, Or(Marker m1, Marker m2))])
145  where 
146   (m1, s1) = mre2pablo_helper(r, m+1)
147   (m2, s2) = mre2pablo_helper(Rep r 0 (upper_bound_dec ub), m1)
148
149re2pablo_helper(Rep r lb ub, m) =  (m2, s1 ++ s2)
150  where 
151   (m1, s1) = mre2pablo_helper(r, m)
152   (m2, s2) = mre2pablo_helper(Rep r (lb - 1) (upper_bound_dec ub), m1)
153
Note: See TracBrowser for help on using the repository browser.