source: proto/RE/Haskell/CanonicalRE.hs @ 4204

Last change on this file since 4204 was 3903, checked in by cameron, 5 years ago

Revert inadvertent check-in

File size: 3.6 KB
Line 
1-- Module CanonicalRE defines a canonical representation for regular expressions
2-- that uses a small number of alternative forms (e.g., combining
3-- all forms of repetition (Kleene star, Kleene plus, ?, {m,n}) into
4-- a single Rep structure. 
5
6-- Robert D. Cameron, 2013
7
8module CanonicalRE (RE(..), unboundedRep, mkSeq, mkAlt, mkRep, simplify) where
9
10import SparseCharSet
11
12-- RE is the data type for regular expressions
13
14data RE = CC SparseCharClass | Start | End | Seq [RE] | Alt [RE] | Rep (RE, Int, Int)
15          deriving Show
16
17--   unbounded repetition is represented by -1
18unboundedRep :: Int
19unboundedRep = -1
20
21
22-- In the following comments, CC String is used instead of CC SparseCharClass for
23-- illustrative purposes.
24--
25-- CC "abcd" represents the character class with the 4 characters a, b, c and d, i.e., [a-d].
26-- Start represents the ^ metacharacter for start of line or string matching
27-- End represents the $ metacharacter for end of line or string matching
28-- Seq [CC "abcd", CC "e", CC "f", CC "ghkl"] represents the regexp [a-d]ef[ghkl]
29-- Alt [Seq[CC "r", CC "e", CC "d"],  Seq[CC "b", CC "l", CC "u", CC "e"]] represents red|blue
30-- Rep (r, lb, ub) represents repetition of at least lb and at most ub occurrences of r
31-- Rep (CC "a", 0, 1) represents a?
32-- Rep (Seq[CC "a", CC "b", CC "c"], 0, unboundedRep)  represents (abc)*, (without substring capture)
33-- Rep (CC "abcedefghijklmnopqrstuvwxyz", 1, unboundedRep) represents [a-z]+
34-- Rep (CC "ab", 5, unboundedRep) represents [ab]{5,}
35-- Rep (CC "ha", 5, 10) represents [ha]{5,10}
36
37-- Special cases:
38-- Seq [] represents the empty regular expression, matching only the empty string.
39-- Alt [] represents the empty set, i.e., matching nothing.
40
41
42-- Optimizing constructors
43-- mkSeq - make a sequence, but flatten.  Result might not be a Seq.
44-- If there is only one component for a sequence, simply return that.
45mkSeq [r] = r
46mkSeq rs = Seq (mkSeqList(rs))
47
48-- Build a list for Seq, flattening subsequences
49mkSeqList [] = []
50mkSeqList ((Seq rs): more) = mkSeqList(rs ++ more)
51mkSeqList (r:rs) = r:(mkSeqList(rs))
52
53-- mkAlt make a list of alternatives, but flatten, and combine character classes
54-- If there is only one alternative, simply return that.
55mkAlt [r] = r
56mkAlt rs = Alt (mkAltList(rs))
57
58-- Build a list for Alt, flattening alternative subgroups,
59-- and combining character classes.   We move character
60-- classes towards the end of the list to ensure that
61-- all combinations are found.
62mkAltList [] = []
63mkAltList ((Alt rs): more) = mkAltList(rs ++ more)
64mkAltList (CC(cs1):CC(cs2):more) = mkAltList(CC(joinCharSets(cs1, cs2)): more)
65mkAltList (CC(cs1):a2:more) = mkAltList(a2:CC(cs1):more)
66mkAltList (r:rs) = r:(mkAltList(rs))
67
68ubCombine (h1, h2)
69  |  h1 == unboundedRep   = unboundedRep
70  |  h2 == unboundedRep   = unboundedRep
71  |  otherwise            = h1 * h2
72
73mkRep(Seq [], lb, ub) = Seq []
74mkRep(Rep(r, lb1, ub1), lb2, ub2)
75   | ub1 == unboundedRep && lb2 > 0    = Rep(r, lb1 * lb2, unboundedRep)
76   | ub1 == unboundedRep && lb1 <= 1   = Rep(r, lb1 * lb2, unboundedRep)
77   | ub1 == unboundedRep && lb2 == 0   = Rep(Rep(r, lb1, unboundedRep), 0, 1)
78   | lb2 == ub2                        = mkRep(r, lb1 * lb2, ub1 * ub2)
79   | ub1 * lb2 >= lb1 * (lb2 + 1) - 1  = mkRep(r, lb1 * lb2, ubCombine(ub1, ub2))
80   | otherwise = Rep(Rep(r, lb1, ub1), lb2, ub2)
81mkRep(r, lb, ub) 
82   | lb == 0 && ub == 0  = Seq []
83   | lb == 1 && ub == 1  = r
84   | otherwise           = Rep(r, lb, ub)
85
86
87-- Recursive simplifier
88simplify (Alt as) = mkAlt(map simplify as) 
89simplify (Seq as) = mkSeq(map simplify as)
90simplify (Rep (r, lb, ub)) = mkRep(simplify(r), lb, ub)
91simplify r = r
Note: See TracBrowser for help on using the repository browser.