source: proto/RE/Haskell/CanonicalRE.hs @ 3614

Last change on this file since 3614 was 3614, checked in by cameron, 5 years ago

Add regexp parser; move compiler into REcompile

File size: 1.4 KB
1-- Module CanonicalRE defines a canonical representation for regular expressions
2-- that uses a small number of alternative forms (e.g., combining
3-- all forms of repetition (Kleene star, Kleene plus, ?, {m,n}) into
4-- a single Rep structure).
6-- Robert D. Cameron, 2013
8module CanonicalRE (RE(..), RepLimit(..)) where
10-- RE is the data type for regular expressions
-- Canonical syntax tree for regular expressions.  All repetition
-- operators (Kleene star, Kleene plus, ?, {m,n}) are expressed through
-- the single Rep constructor.
data RE
  = CC String                -- character class, given as the string of its member characters
  | Start                    -- the ^ metacharacter (start of line or string)
  | End                      -- the $ metacharacter (end of line or string)
  | Seq [RE]                 -- concatenation; Seq [] matches only the empty string
  | Alt [RE]                 -- alternation; Alt [] matches nothing
  | Rep (RE, Int, RepLimit)  -- repetition: (operand, lower bound, upper limit)
  deriving (Show)
-- Upper limit on the repetition count carried by a Rep node.
data RepLimit
  = UpperBound Int  -- at most this many repetitions, e.g. the 10 in {5,10}
  | Unbounded       -- no upper limit, e.g. *, + or {5,}
  deriving (Show)
16-- CC "abcd" represents the character class with the 4 characters a, b, c and d, i.e., [a-d].
17-- Start represents the ^ metacharacter for start of line or string matching
18-- End represents the $ metacharacter for end of line or string matching
19-- Seq [CC "abcd", CC "e", CC "f", CC "ghkl"] represents the regexp [a-d]ef[ghkl]
20-- Alt [Seq[CC "r", CC "e", CC "d"],  Seq[CC "b", CC "l", CC "u", CC "e"]] represents red|blue
21-- Rep (CC "a", 0, UpperBound 1) represents a?
22-- Rep (Seq[CC "a", CC "b", CC "c"], 0, Unbounded) represents (abc)* (without substring capture)
23-- Rep (CC "abcdefghijklmnopqrstuvwxyz", 1, Unbounded) represents [a-z]+
24-- Rep (CC "ab", 5, Unbounded) represents [ab]{5,}
25-- Rep (CC "ha", 5, UpperBound 10) represents [ha]{5,10}
27-- Special cases:
28-- Seq [] represents the empty regular expression, matching only the empty string.
29-- Alt [] represents the empty set, i.e., matching nothing.
Note: See TracBrowser for help on using the repository browser.