source: proto/RE/doc/CanonicalRE.hs @ 3588

Last change on this file since 3588 was 3588, checked in by cameron, 6 years ago

Canonical RE demo

File size: 2.7 KB
Line 
1-- Module Canonical_RE is a canonical representation for regular
2-- expressions that uses a minimum number of alternative forms,
3-- for example, in comparison with module RE_Proto.
4-- Use of this canonical representation should simplify transformation and
5-- analysis algorithms. 
6-- Compare for example, RE_proto.minMatchLen with CanonicalRE.minMatchLen
7
8
9module CanonicalRE (RE(..), minMatchLen, removeNullablePrefix) where
10       
11import qualified RE_Proto as Proto
12import Data.Char
13
14data RE = CC String | Seq [RE] | Alt [RE] | Rep RE Int Int |
15          Start | End  -- lookbehind/lookahead assertions for newline
16
17
18canonicalize :: Proto.RE -> RE
19canonicalize (Proto.CC s) = CC s
20canonicalize (Proto.CCnot s) =  CC (filter (\c -> not(elem c s)) (map chr ([0..9]++[11..127])))
21canonicalize (Proto.Lit s) = Seq (map (\c -> CC [c]) s)
22canonicalize Proto.Start = Start
23canonicalize Proto.End = End
24canonicalize Proto.Any = CC (map chr ([0..9]++[11..127]))
25canonicalize (Proto.Join rs) = Seq (map canonicalize rs)
26canonicalize (Proto.Alt rs) = Alt (map canonicalize rs)
27canonicalize (Proto.Opt r) = Rep (canonicalize r) 0 1
28canonicalize (Proto.Kstar r) = Rep (canonicalize r) 0 (-1)
29canonicalize (Proto.Kplus r) = Rep (canonicalize r) 1 (-1)
30canonicalize (Proto.OrMore i r) = Rep (canonicalize r) i (-1)
31canonicalize (Proto.Bounded i j r) = Rep (canonicalize r) i j
32
33
34-- Write a minimal match length function that determines the minimum length
35-- string that can be matched by an RE, considering that Start and End match
36-- single \n characterProto.
37
38minMatchLen :: RE -> Int
39minMatchLen (CC s) = 1
40minMatchLen (Seq []) = 0
41minMatchLen (Seq (r:rs)) = minMatchLen r + (minMatchLen (Seq rs))
42minMatchLen (Alt [r]) = minMatchLen r
43minMatchLen (Alt (r:rs)) = min (minMatchLen r) (minMatchLen (Alt rs))
44minMatchLen (Rep r i j) = i * (minMatchLen r)
45
46--- removeNullablePrefix takes a regular expression and returns
47--- the version of it that has been transformed to remove any nullable
48--- prefixeProto.  A prefix p is nullable if it can possibly match the empty
49--- string, i.e., minMatchLen(p) = 0.  For example, in the regexp
50--- [a-z]*-[0-9]+ the prefix [a-z]* is nullable.
51
52removeNullablePrefix :: RE -> RE
53removeNullablePrefix (CC s) = (CC s)
54removeNullablePrefix Start = Start
55removeNullablePrefix End = End
56removeNullablePrefix (Seq []) = Seq []
57removeNullablePrefix (Seq (r:rs))
58   | minMatchLen(r) == 0  = removeNullablePrefix(Seq rs)
59   | otherwise            = Seq ((removeNullablePrefix r):rs)
60removeNullablePrefix (Alt rs) = Alt (map removeNullablePrefix rs)
61removeNullablePrefix (Rep r i j)
62   | minMatchLen(r) == 0  = Seq []
63   | i == 0               = Seq []
64   | otherwise            = Seq [removeNullablePrefix(r), Rep r (i-1) (j-1)]
65
Note: See TracBrowser for help on using the repository browser.