source: proto/RE/Haskell/REparse.hs @ 3616

Last change on this file since 3616 was 3614, checked in by cameron, 6 years ago

Add regexp parser; move compiler into REcompile

File size: 8.3 KB
Line 
1-- Module REparse provides parsers for parsing regular expressions
2-- in different representations, converting to the CanonicalRE
3-- representation.
4
5-- Robert D. Cameron, 2014
6
7module REparse (parseRE, ParseResult(..)) where
8       
9import Data.Char
10import CanonicalRE
11
12data ParseResult = ParseSuccess RE | ParseFailure String deriving Show
13
14parseRE :: String -> ParseResult
15parseRE_helper :: String -> (ParseResult, String)
16
17-- parsing is accomplished using a recursive-descent style
18-- parsing functions, which return a partial result and a
19-- remaining input string.  At the top-level, the entire
20-- input must be consumed or the parse fails.
21
22parseRE(s) =
23  case parseRE_helper(s) of
24    (result, "")         -> result
25    (ParseSuccess _, remaining) -> ParseFailure "Junk remaining" 
26    (failresult, remain) -> failresult
27--
28-- A Simple Grammar of Regular Expressions
29--
30-- A regular expression can consist of one or more alternative
31-- forms, separated by vertical bars ("|").
32-- <RE> ::= <RE_form> { "|" <RE_form> }
33--
34-- Each alternative form is a concatenation of one or more items.
35-- <RE_form> ::= <RE_item> {<RE_items>}
36--
37-- Each item is basic unit, or a repeated item.
38-- <RE_item> ::= <RE_unit> | <RE_item> <repeat_specification>
39-- <repeat_specification> ::= '?' | '*' | '+' | '{' <int> ["," [<int>]} '}'
40-- <int> ::= <digit> {<digit>}
41--
42-- Each unit is a parenthesized RE, a character class item or
43-- a start-of-line or end-of-line assertion.
44-- <RE_unit> ::= '(' <RE> ')' | <CC> | '^' | '$'
45--
46-- A character class unit is either an ordinary character, the "any"
47-- character class represented by '.', an escaped character class
48-- or a bracketted character class.
49-- <CC> ::= <non-metacharacter> | '.'  | <escaped_CC> | <bracketted_CC>
50--
51-- An escaped character class is an escaped metacharacter or
52-- a named escape class such as \s for whitespace, \w for word...
53-- <escaped_CC> ::= '\' <metacharacter> | '\' <letter>
54--
55-- A bracketted character class is either a positive or negative
56-- opener followed by a character class body.
57--
58-- <bracketted_CC> ::= '[' <CC_body> ']' | '[^' <CC_body> ']'
59-- The character class body is a list of individual characters
60-- or ranges specified with hyphens.  "]" and "-" may not
61-- be specified as body characters except that either may
62-- occur as the first character of a class and "-" may occur
63-- as the final character.
64-- <CC_body> ::= <any_char> ['-' <body_char>] { <body_char> ['-' <body_char>]} ['-']
65--
66--
67
68parseRE_helper(s) =
69  case parseRE_alt_form_list(s) of
70    ([], _)            -> (ParseFailure "No regular expression found", s)
71    ([a], remaining)   -> (ParseSuccess a, remaining) -- a single alternative, just return it
72    (forms, remaining) -> (ParseSuccess (Alt forms), remaining)
73
74parseRE_alt_form_list :: String -> ([RE], String)
75parseRE_alt_form_list(s) =
76  case parseRE_form(s) of
77    (ParseSuccess form1, '|' : more) -> let (more_results, after_more) = parseRE_alt_form_list(more)
78                                        in (form1:more_results, after_more)
79    (ParseSuccess form1, remaining) -> ([form1], remaining)
80    _ -> ([], s)
81             
82parseRE_form :: String -> (ParseResult, String)
83parseRE_form(s) =
84   case parseRE_item_list(s) of
85     ([], _) -> (ParseFailure "No regular expression found", s) 
86     ([item], remaining) -> (ParseSuccess item, remaining)
87     (items, remaining) -> (ParseSuccess (Seq items), remaining)
88
89
90parseRE_item_list :: String -> ([RE], String)
91parseRE_item_list(s) =
92  case parseRE_item(s) of
93    (ParseSuccess form1, more) -> let (more_results, after_more) = parseRE_item_list(more)
94                                        in (form1:more_results, after_more)
95    _ -> ([], s)
96
97parseRE_item :: String -> (ParseResult, String)
98parseRE_item(s) =
99  case parseRE_unit(s) of
100    (ParseSuccess form1, remaining) -> extend_item(form1, remaining)
101    failure -> failure
102
103extend_item :: (RE, String) -> (ParseResult, String)
104extend_item(r, '*':more) = extend_item(Rep(r, 0, Unbounded), more)
105extend_item(r, '?':more) = extend_item(Rep(r, 0, UpperBound 1), more)
106extend_item(r, '+':more) = extend_item(Rep(r, 1, Unbounded), more)
107extend_item(r, '{':more) = 
108  case parseInt(more) of
109    (Just i, '}' : even_more) -> extend_item(Rep(r, i, UpperBound i), even_more)
110    (Just i, ',':'}': even_more) -> extend_item(Rep(r, i, Unbounded), even_more)
111    (Just i, ',': even_more) -> 
112      case parseInt(even_more) of
113        (Just j, '}' : remaining) -> extend_item(Rep(r, i, UpperBound j), remaining)
114        _ -> (ParseFailure "Bad upper bound", even_more)
115    _ -> (ParseFailure "Bad lower bound", more)
116-- default if we cannot extend
117extend_item(r, remaining) = (ParseSuccess r, remaining)
118
119
120parseRE_unit :: String -> (ParseResult, String)
121parseRE_unit([]) = (ParseFailure "Incomplete regular expression", "")
122parseRE_unit ('(':more) = 
123  case parseRE_helper(more) of
124    (ParseSuccess r, ')': remaining) -> (ParseSuccess r, remaining)
125    _ -> (ParseFailure "Bad parenthesized RE", more)
126parseRE_unit('^':more) = (ParseSuccess Start, more)
127parseRE_unit('$':more) = (ParseSuccess End, more)
128-- Now look for a single character or character class
129parseRE_unit(s) = parseCC(s)
130
131-- parseCC deals with individual characters (unitary character classes)
132-- and all other forms specifying classes of characters.
133parseCC('.':more) = (ParseSuccess (CC (map chr ([0..9]++[11..127]))), more)
134--
135-- Any of the RE metacharacters may be represented using a backslash escape.
136--
137parseCC('\\':'?':more) = (ParseSuccess (CC("?")), more)
138parseCC('\\':'+':more) = (ParseSuccess (CC("+")), more)
139parseCC('\\':'*':more) = (ParseSuccess (CC("*")), more)
140parseCC('\\':'(':more) = (ParseSuccess (CC("(")), more)
141parseCC('\\':')':more) = (ParseSuccess (CC(")")), more)
142parseCC('\\':'{':more) = (ParseSuccess (CC("{")), more)
143parseCC('\\':'}':more) = (ParseSuccess (CC("}")), more)
144parseCC('\\':'[':more) = (ParseSuccess (CC("[")), more)
145parseCC('\\':']':more) = (ParseSuccess (CC("]")), more)
146parseCC('\\':'|':more) = (ParseSuccess (CC("|")), more)
147parseCC('\\':'.':more) = (ParseSuccess (CC(".")), more)
148parseCC('\\':'\\':more) = (ParseSuccess (CC("\\")), more)
149--
150-- Any other use of backslash is an error.
151--
152parseCC('\\':more) = (ParseFailure "Illegal backslash escape", more)
153--
154
155parseCC('[':'^':more) = negateCharClassResult(parseCC_body(more))
156parseCC('[':more) = parseCC_body(more)
157--
158-- Now just have a single character, but it cannot be a metacharacter.
159parseCC(c:more)
160   | elem c "?+*(){}[]|"   = (ParseFailure "Metacharacter alone", c:more)
161   | otherwise             = (ParseSuccess (CC [c]), more)
162
163
164--
165-- To parse the body of a character class, we use helpers
166-- parseCC_body1(c, s)
167parseCC_body :: String -> (ParseResult, String)
168parseCC_body1 :: (Char, String, String) -> (ParseResult, String)
169parseCC_body0 :: (String, String) -> (ParseResult, String)
170
171parseCC_body([]) = (ParseFailure "Unclosed character class", [])
172parseCC_body(a:more) = parseCC_body1(a, more, [])
173
174
175
176parseCC_body1(a, [], ccSoFar) = (ParseFailure "Unclosed character class", [])
177parseCC_body1(a, ']':more, ccSoFar) = (ParseSuccess (CC (ccSoFar ++ [a])), more)
178parseCC_body1(a, [_], ccSoFar) = (ParseFailure "Unclosed character class", [])
179parseCC_body1(a, '-':']':more, ccSoFar) = (ParseSuccess (CC [a, '-']), more)
180parseCC_body1(a, '-':b:more, ccSoFar) = parseCC_body0(more, ccSoFar ++ (map chr [ord(a)..ord(b)]))
181parseCC_body1(a, b:more, ccSoFar) = parseCC_body1(b, more, ccSoFar ++ [a])
182
183parseCC_body0([], ccSoFar) = (ParseFailure "Unclosed character class", [])
184parseCC_body0(']':more, ccSoFar) = (ParseSuccess (CC ccSoFar), more)
185parseCC_body0('-':']':more, ccSoFar) = (ParseSuccess (CC (ccSoFar ++ ['-'])), more)
186parseCC_body0('-':more, ccSoFar) = (ParseFailure "Bad range in character class", more)
187parseCC_body0(a:more, ccSoFar) = parseCC_body1(a, more, ccSoFar)
188
189
190negateCharClassResult(ParseSuccess (CC s), remaining) = 
191   (ParseSuccess (CC (filter (\c -> not(elem c s)) (map chr ([0..9]++[11..127])))), remaining)
192negateCharClassResult(failureResult, remaining) = (failureResult, remaining)
193
194
195
196parseInt1 :: (String, Int) -> (Int, String)
197parseInt1([],numSoFar) = (numSoFar, [])
198parseInt1(d:more,numSoFar) 
199  | isDigit(d)    = parseInt1(more, numSoFar * 10 + (ord(d) - 48))
200  | otherwise     = (numSoFar, d:more)
201
202parseInt :: String -> (Maybe Int, String)
203parseInt([]) = (Nothing, [])
204parseInt(d:more)
205  | isDigit(d)    = let (i, remaining) = parseInt1(more, ord(d) - 48) in (Just i, remaining)
206  | otherwise     = (Nothing, d:more)
207 
Note: See TracBrowser for help on using the repository browser.