source: proto/RE/Haskell/CanonicalRE.hs @ 3827

Last change on this file since 3827 was 3827, checked in by cameron, 5 years ago

Initial prototype conversion to UTF8

File size: 1.6 KB
1-- Module CanonicalRE defines a canonical representation for regular expressions
2-- that uses a small number of alternative forms (e.g., combining
3-- all forms of repetition (Kleene star, Kleene plus, ?, {m,n}) into
4-- a single Rep structure. 
6-- Robert D. Cameron, 2013
8module CanonicalRE (RE(..), RepLimit(..)) where
10import SparseCharSet
12-- RE is the data type for regular expressions
14data RE = CC SparseCharClass | Start | End | Seq [RE] | Alt [RE] | Rep (RE, Int, RepLimit)
15          deriving Show
16data RepLimit = UpperBound Int | Unbounded deriving Show
18-- In the following comments, CC String is used instead of CC SparseCharClass for
19-- illustrative purposes.
21-- CC "abcd" represents the character class with the 4 characters a, b, c and d, i.e., [a-d].
22-- Start represents the ^ metacharacter for start of line or string matching
23-- End represents the $ metacharacter for end of line or string matching
24-- Seq [CC "abcd", CC "e", CC "f", CC "ghkl"] represents the regexp [a-d]ef[ghkl]
25-- Alt [Seq[CC "r", CC "e", CC "d"],  Seq[CC "b", CC "l", CC "u", CC "e"]] represents red|blue
26-- Rep (CC "a", 0, UpperBound 1) represents a?
27-- Rep (Seq[CC "a", CC "b", CC "c"], 0, Unbounded)  represents (abc)*, (without substring capture)
28-- Rep (CC "abcedefghijklmnopqrstuvwxyz", 1, Unbounded) represents [a-z]+
29-- Rep (CC "ab", 5, Unbounded) represents [ab]{5,}
30-- Rep (CC "ha", 5, UpperBound 10) represents [ha]{5,10}
32-- Special cases:
33-- Seq [] represents the empty regular expression, matching only the empty string.
34-- Alt [] represents the empty set, i.e., matching nothing.
Note: See TracBrowser for help on using the repository browser.