source: proto/RE/Haskell/CanonicalRE.hs @ 3902

Last change on this file since 3902 was 3902, checked in by cameron, 5 years ago

Avoid duplicating the repeated expression, if it can't be simplified

File size: 3.8 KB
Line 
1-- Module CanonicalRE defines a canonical representation for regular expressions
2-- that uses a small number of alternative forms (e.g., combining
3-- all forms of repetition (Kleene star, Kleene plus, ?, {m,n}) into
4-- a single Rep structure. 
5
6-- Robert D. Cameron, 2013
7
8module CanonicalRE (RE(..), unboundedRep, mkSeq, mkAlt, mkRep, simplify) where
9
10import SparseCharSet
11
12-- RE is the data type for regular expressions
13
14data RE = CC (Bool, SparseCharClass) | Start | End | Seq [RE] | Alt [RE] | Rep (RE, Int, Int)
15          deriving Show
16
17--   unbounded repetition is represented by -1
18unboundedRep :: Int
19unboundedRep = -1
20
21
22-- In the following comments, CC String is used instead of CC SparseCharClass for
23-- illustrative purposes.
24--
25-- CC (True, [CharRange(97,100)]) represents the character class with the 4 characters a, b, c and d, i.e., [a-d].
26-- CC (False, [CharRange(97,100)]) represents the character class with all characters but a, b, c and d, i.e., [^a-d].
27-- Start represents the ^ metacharacter for start of line or string matching
28-- End represents the $ metacharacter for end of line or string matching
29-- Seq [CC "abcd", CC "e", CC "f", CC "ghkl"] represents the regexp [a-d]ef[ghkl]
30-- Alt [Seq[CC "r", CC "e", CC "d"],  Seq[CC "b", CC "l", CC "u", CC "e"]] represents red|blue
31-- Rep (r, lb, ub) represents repetition of at least lb and at most ub occurrences of r
32-- Rep (CC "a", 0, 1) represents a?
33-- Rep (Seq[CC "a", CC "b", CC "c"], 0, unboundedRep)  represents (abc)*, (without substring capture)
34-- Rep (CC "abcedefghijklmnopqrstuvwxyz", 1, unboundedRep) represents [a-z]+
35-- Rep (CC "ab", 5, unboundedRep) represents [ab]{5,}
36-- Rep (CC "ha", 5, 10) represents [ha]{5,10}
37
38-- Special cases:
39-- Seq [] represents the empty regular expression, matching only the empty string.
40-- Alt [] represents the empty set, i.e., matching nothing.
41
42
43-- Optimizing constructors
44-- mkSeq - make a sequence, but flatten.  Result might not be a Seq.
45-- If there is only one component for a sequence, simply return that.
46mkSeq [r] = r
47mkSeq rs = Seq (mkSeqList(rs))
48
49-- Build a list for Seq, flattening subsequences
50mkSeqList [] = []
51mkSeqList ((Seq rs): more) = mkSeqList(rs ++ more)
52mkSeqList (r:rs) = r:(mkSeqList(rs))
53
54-- mkAlt make a list of alternatives, but flatten, and combine character classes
55-- If there is only one alternative, simply return that.
56mkAlt [r] = r
57mkAlt rs = Alt (mkAltList(rs))
58
59-- Build a list for Alt, flattening alternative subgroups,
60-- and combining character classes.   We move character
61-- classes towards the end of the list to ensure that
62-- all combinations are found.
63mkAltList [] = []
64mkAltList ((Alt rs): more) = mkAltList(rs ++ more)
65mkAltList (CC(True, cs1):CC(True, cs2):more) = mkAltList(CC(True, joinCharSets(cs1, cs2)): more)
66mkAltList (CC(False, cs1):CC(False, cs2):more) = mkAltList(CC(False, joinCharSets(cs1, cs2)): more)
67mkAltList (CC(cs1):a2:more) = mkAltList(a2:CC(cs1):more)
68mkAltList (r:rs) = r:(mkAltList(rs))
69
70ubCombine (h1, h2)
71  |  h1 == unboundedRep   = unboundedRep
72  |  h2 == unboundedRep   = unboundedRep
73  |  otherwise            = h1 * h2
74
75mkRep(Seq [], lb, ub) = Seq []
76mkRep(Rep(r, lb1, ub1), lb2, ub2)
77   | ub1 == unboundedRep && lb2 > 0    = Rep(r, lb1 * lb2, unboundedRep)
78   | ub1 == unboundedRep && lb1 <= 1   = Rep(r, lb1 * lb2, unboundedRep)
79   | ub1 == unboundedRep && lb2 == 0   = Rep(Rep(r, lb1, unboundedRep), 0, 1)
80   | lb2 == ub2                        = mkRep(r, lb1 * lb2, ub1 * ub2)
81   | ub1 * lb2 >= lb1 * (lb2 + 1) - 1  = mkRep(r, lb1 * lb2, ubCombine(ub1, ub2))
82   | otherwise = Rep(Rep(r, lb1, ub1), lb2, ub2)
83mkRep(r, lb, ub) 
84   | lb == 0 && ub == 0  = Seq []
85   | lb == 1 && ub == 1  = r
86   | otherwise           = Rep(r, lb, ub)
87
88
89-- Recursive simplifier
90simplify (Alt as) = mkAlt(map simplify as) 
91simplify (Seq as) = mkSeq(map simplify as)
92simplify (Rep (r, lb, ub)) = mkRep(simplify(r), lb, ub)
93simplify r = r
Note: See TracBrowser for help on using the repository browser.