source: proto/RE/doc/CanonicalRE.hs @ 3589

Last change on this file since 3589 was 3589, checked in by cameron, 5 years ago

Canonical RE demo

File size: 2.7 KB
Line 
1-- Module Canonical_RE is a canonical representation for regular
2-- expressions that uses a minimum number of alternative forms,
3-- for example, in comparison with module RE_Proto.
4-- Use of this canonical representation should simplify transformation and
5-- analysis algorithms. 
6-- Compare for example, RE_proto.minMatchLen with CanonicalRE.minMatchLen
7
8
9module CanonicalRE (RE(..), canonicalize, minMatchLen, removeNullablePrefix) where
10       
11import qualified RE_Proto as Proto
12import Data.Char
13
14data RE = CC String | Seq [RE] | Alt [RE] | Rep RE Int Int |
15          Start | End  -- lookbehind/lookahead assertions for newline
16
17
18canonicalize :: Proto.RE -> RE
19canonicalize (Proto.CC s) = CC s
20canonicalize (Proto.CCnot s) =  CC (filter (\c -> not(elem c s)) (map chr ([0..9]++[11..127])))
21canonicalize (Proto.Lit s) = Seq (map (\c -> CC [c]) s)
22canonicalize Proto.Start = Start
23canonicalize Proto.End = End
24canonicalize Proto.Any = CC (map chr ([0..9]++[11..127]))
25canonicalize (Proto.Join rs) = Seq (map canonicalize rs)
26canonicalize (Proto.Alt rs) = Alt (map canonicalize rs)
27canonicalize (Proto.Opt r) = Rep (canonicalize r) 0 1
28canonicalize (Proto.Kstar r) = Rep (canonicalize r) 0 (-1)
29canonicalize (Proto.Kplus r) = Rep (canonicalize r) 1 (-1)
30canonicalize (Proto.OrMore i r) = Rep (canonicalize r) i (-1)
31canonicalize (Proto.Bounded i j r) = Rep (canonicalize r) i j
32
33
34-- Write a minimal match length function that determines the minimum length
35-- string that can be matched by an RE, considering that Start and End match
36-- single \n characterProto.
37
38minMatchLen :: RE -> Int
39minMatchLen (CC s) = 1
40minMatchLen Start = 1
41minMatchLen End = 1
42minMatchLen (Seq []) = 0
43minMatchLen (Seq (r:rs)) = minMatchLen r + (minMatchLen (Seq rs))
44minMatchLen (Alt [r]) = minMatchLen r
45minMatchLen (Alt (r:rs)) = min (minMatchLen r) (minMatchLen (Alt rs))
46minMatchLen (Rep r i j) = i * (minMatchLen r)
47
48--- removeNullablePrefix takes a regular expression and returns
49--- the version of it that has been transformed to remove any nullable
50--- prefixeProto.  A prefix p is nullable if it can possibly match the empty
51--- string, i.e., minMatchLen(p) = 0.  For example, in the regexp
52--- [a-z]*-[0-9]+ the prefix [a-z]* is nullable.
53
54removeNullablePrefix :: RE -> RE
55removeNullablePrefix (CC s) = (CC s)
56removeNullablePrefix Start = Start
57removeNullablePrefix End = End
58removeNullablePrefix (Seq []) = Seq []
59removeNullablePrefix (Seq (r:rs))
60   | minMatchLen(r) == 0  = removeNullablePrefix(Seq rs)
61   | otherwise            = Seq ((removeNullablePrefix r):rs)
62removeNullablePrefix (Alt rs) = Alt (map removeNullablePrefix rs)
63removeNullablePrefix (Rep r i j)
64   | minMatchLen(r) == 0  = Seq []
65   | i == 0               = Seq []
66   | otherwise            = Seq [removeNullablePrefix(r), Rep r (i-1) (j-1)]
67
Note: See TracBrowser for help on using the repository browser.