source: proto/RE/doc/re_proto.hs @ 3427

Last change on this file since 3427 was 3427, checked in by cameron, 6 years ago

removeNullablePrefix

File size: 3.7 KB
RevLine 
[3420]1-- Module re_proto is a research compiler for prototyping
2-- algorithms and ideas in regular expression processing using
3-- parallel bit streams and MatchStar.
4
5module RE_Proto (RE, minMatchLen) where
6
7-- RE is the ADT type for practical regular expressions equivalent
8-- to the POSIX ERE forms, except for substring capture. 
9
10data RE = CC String | CCnot String | Lit String | Start | End | Any |
11          Join [RE] | Alt [RE] | Opt RE | Kstar RE | Kplus RE | OrMore Int RE | Bounded Int Int RE
12   deriving Show
13-- CC "abcd" represents the character class with the 4 characters a, b, c and d, i.e., [a-d].
14-- CCnot "abcd" represents the class of all characters except a, b, c and d, i.e., [^a-d]
15-- Lit "abcd" represents the regular expression for the 4-character string "abcd".
16-- Start represents the ^ metacharacter for start of line or string matching
17-- End represents the $ metacharacter for end of line or string matching
18-- Any represents the . metacharacter that matches any character (except end of line)
19-- Join [CC "abcd", Lit "efgh", CC "ijkl"] represents the regexp [a-d]efgh[i-l]
20-- Alt [Lit "red", Lit "green", Lit "|", Lit "blue"] represents red|green|\||blue
21-- Opt (Lit "a") represents a?
22-- Kstar (Lit "abc") represents (abc)*, (except that there is no substring capture)
23-- Kplus (CC "abcedefghijklmnopqrstuvwxyz") represents [a-z]+
24-- OrMore 5 (CC "ab") represents [ab]{5,}
25-- Bounded 5 10 (CC "ha") represents [ha]{5,10}
26-- Note: none of the characters of a CC, CCnot or Lit is every considered a metachar.
27-- Thus Lit "a|b.$" means the regexp "a\|b\.\$", for the regexp "a|b.$" use Alt[Lit "a",  Join ["b", Any, End]]
28
29-- Write a minimal match length function that determines the minimum length
30-- string that can be matched by an RE, considering that Start and End match
31-- single \n characters.
32
33minMatchLen :: RE -> Int
34minMatchLen (CC s) = 1
35minMatchLen (CCnot s) = 1
36minMatchLen (Lit s) = length s
37minMatchLen Start = 1
38minMatchLen End = 1
39minMatchLen Any = 1
40minMatchLen (Join []) = 0
41minMatchLen (Join (r:rs)) = minMatchLen r + (minMatchLen (Join rs))
42minMatchLen (Alt [r]) = minMatchLen r
43minMatchLen (Alt (r:rs)) = min (minMatchLen r) (minMatchLen (Alt rs))
44minMatchLen (Opt r) = 0
45minMatchLen (Kstar r) = 0
46minMatchLen (Kplus r) = minMatchLen r
47minMatchLen (OrMore i r) = i * (minMatchLen r)
48minMatchLen (Bounded i j r) = i * (minMatchLen r)
49
[3427]50--- removeNullablePrefix takes a regular expression and returns
51--- the version of it that has been transformed to remove any nullable
52--- prefixes.  A prefix p is nullable if it can possibly match the empty
53--- string, i.e., minMatchLen(p) = 0.  For example, in the regexp
54--- [a-z]*-[0-9]+ the prefix [a-z]* is nullable.
[3420]55
[3427]56removeNullablePrefix :: RE -> RE
57removeNullablePrefix (CC s) = (CC s)
58removeNullablePrefix (CCnot s) = (CCnot s)
59removeNullablePrefix (Lit s) = (Lit s)
60removeNullablePrefix Start = Start
61removeNullablePrefix End = End
62removeNullablePrefix Any = Any
63removeNullablePrefix (Join []) = Join []
64removeNullablePrefix (Join (r:rs))
65   | minMatchLen(r) == 0  = removeNullablePrefix(Join rs)
66   | otherwise            = Join (r:rs)
67removeNullablePrefix (Alt rs) = Alt (map removeNullablePrefix rs)
68removeNullablePrefix (Opt r) = Join []
69removeNullablePrefix (Kstar r) = Join []
70removeNullablePrefix (Kplus r)
71   | minMatchLen(r) == 0  = Join []
72   | otherwise            = Join [removeNullablePrefix(r), Kstar r]
73removeNullablePrefix (OrMore i r)
74   | minMatchLen(r) == 0  = Join []
75   | i == 0               = Join []
76   | otherwise            = Join [removeNullablePrefix(r), OrMore (i-1) r]
77removeNullablePrefix (Bounded i j r)
78   | minMatchLen(r) == 0  = Join []
79   | i == 0               = Join []
80   | otherwise            = Join [removeNullablePrefix(r), Bounded (i-1) (j-1) r]
[3420]81
Note: See TracBrowser for help on using the repository browser.