source: proto/RE/Haskell/REparse.hs @ 4204

Last change on this file since 4204 was 3903, checked in by cameron, 5 years ago

Revert inadvertent check-in

File size: 8.7 KB
Line 
1-- Module REparse provides parsers for parsing regular expressions
2-- in different representations, converting to the CanonicalRE
3-- representation.
4
5-- Robert D. Cameron, 2014
6
7module REparse (parseRE, ParseResult(..)) where
8       
9import Data.Char
10import SparseCharSet
11import CanonicalRE
12
13data ParseResult = ParseSuccess RE | ParseFailure String deriving Show
14
15parseRE :: String -> ParseResult
16parseRE_helper :: String -> (ParseResult, String)
17
18-- parsing is accomplished using a recursive-descent style
19-- parsing functions, which return a partial result and a
20-- remaining input string.  At the top-level, the entire
21-- input must be consumed or the parse fails.
22
23parseRE(s) =
24  case parseRE_helper(s) of
25    (result, "")         -> result
26    (ParseSuccess _, remaining) -> ParseFailure "Junk remaining" 
27    (failresult, remain) -> failresult
28--
29-- A Simple Grammar of Regular Expressions
30--
31-- A regular expression can consist of one or more alternative
32-- forms, separated by vertical bars ("|").
33-- <RE> ::= <RE_form> { "|" <RE_form> }
34--
35-- Each alternative form is a concatenation of one or more items.
36-- <RE_form> ::= <RE_item> {<RE_items>}
37--
38-- Each item is basic unit, or a repeated item.
39-- <RE_item> ::= <RE_unit> | <RE_item> <repeat_specification>
40-- <repeat_specification> ::= '?' | '*' | '+' | '{' <int> ["," [<int>]} '}'
41-- <int> ::= <digit> {<digit>}
42--
43-- Each unit is a parenthesized RE, a character class item or
44-- a start-of-line or end-of-line assertion.
45-- <RE_unit> ::= '(' <RE> ')' | <CC> | '^' | '$'
46--
47-- A character class unit is either an ordinary character, the "any"
48-- character class represented by '.', an escaped character class
49-- or a bracketted character class.
50-- <CC> ::= <non-metacharacter> | '.'  | <escaped_CC> | <bracketted_CC>
51--
52-- An escaped character class is an escaped metacharacter or
53-- a named escape class such as \s for whitespace, \w for word...
54-- <escaped_CC> ::= '\' <metacharacter> | '\' <letter>
55--
56-- A bracketted character class is either a positive or negative
57-- opener followed by a character class body.
58--
59-- <bracketted_CC> ::= '[' <CC_body> ']' | '[^' <CC_body> ']'
60-- The character class body is a list of individual characters
61-- or ranges specified with hyphens.  "]" and "-" may not
62-- be specified as body characters except that either may
63-- occur as the first character of a class and "-" may occur
64-- as the final character.
65-- <CC_body> ::= <any_char> ['-' <body_char>] { <body_char> ['-' <body_char>]} ['-']
66--
67--
68
69parseRE_helper(s) =
70  case parseRE_alt_form_list(s) of
71    ([], _)            -> (ParseFailure "No regular expression found", s)
72    ([a], remaining)   -> (ParseSuccess a, remaining) -- a single alternative, just return it
73    (forms, remaining) -> (ParseSuccess (Alt forms), remaining)
74
75parseRE_alt_form_list :: String -> ([RE], String)
76parseRE_alt_form_list(s) =
77  case parseRE_form(s) of
78    (ParseSuccess form1, '|' : more) -> let (more_results, after_more) = parseRE_alt_form_list(more)
79                                        in (form1:more_results, after_more)
80    (ParseSuccess form1, remaining) -> ([form1], remaining)
81    _ -> ([], s)
82             
83parseRE_form :: String -> (ParseResult, String)
84parseRE_form(s) =
85   case parseRE_item_list(s) of
86     ([], _) -> (ParseFailure "No regular expression found", s) 
87     ([item], remaining) -> (ParseSuccess item, remaining)
88     (items, remaining) -> (ParseSuccess (Seq items), remaining)
89
90
91parseRE_item_list :: String -> ([RE], String)
92parseRE_item_list(s) =
93  case parseRE_item(s) of
94    (ParseSuccess form1, more) -> let (more_results, after_more) = parseRE_item_list(more)
95                                        in (form1:more_results, after_more)
96    _ -> ([], s)
97
98parseRE_item :: String -> (ParseResult, String)
99parseRE_item(s) =
100  case parseRE_unit(s) of
101    (ParseSuccess form1, remaining) -> extend_item(form1, remaining)
102    failure -> failure
103
104extend_item :: (RE, String) -> (ParseResult, String)
105extend_item(r, '*':more) = extend_item(Rep(r, 0, unboundedRep), more)
106extend_item(r, '?':more) = extend_item(Rep(r, 0, 1), more)
107extend_item(r, '+':more) = extend_item(Rep(r, 1, unboundedRep), more)
108extend_item(r, '{':more) = 
109  case parseInt(more) of
110    (Just i, '}' : even_more) -> extend_item(Rep(r, i, i), even_more)
111    (Just i, ',':'}': even_more) -> extend_item(Rep(r, i, unboundedRep), even_more)
112    (Just i, ',': even_more) -> 
113      case parseInt(even_more) of
114        (Just j, '}' : remaining) -> extend_item(Rep(r, i, j), remaining)
115        _ -> (ParseFailure "Bad upper bound", even_more)
116    _ -> (ParseFailure "Bad lower bound", more)
117-- default if we cannot extend
118extend_item(r, remaining) = (ParseSuccess r, remaining)
119
120
121parseRE_unit :: String -> (ParseResult, String)
122parseRE_unit([]) = (ParseFailure "Incomplete regular expression", "")
123parseRE_unit ('(':more) = 
124  case parseRE_helper(more) of
125    (ParseSuccess r, ')': remaining) -> (ParseSuccess r, remaining)
126    _ -> (ParseFailure "Bad parenthesized RE", more)
127parseRE_unit('^':more) = (ParseSuccess Start, more)
128parseRE_unit('$':more) = (ParseSuccess End, more)
129-- Now look for a single character or character class
130parseRE_unit(s) = parseCC(s)
131
132-- Make a character class from a single character
133cc1 c = let v = ord c in CC [CharRange(v, v)]
134
135-- parseCC deals with individual characters (unitary character classes)
136-- and all other forms specifying classes of characters.
137parseCC('.':more) = (ParseSuccess (CC [CharRange(0,9), CharRange(11,127)]), more)
138--
139-- Any of the RE metacharacters may be represented using a backslash escape.
140--
141parseCC('\\':'?':more) = (ParseSuccess (cc1('?')), more)
142parseCC('\\':'+':more) = (ParseSuccess (cc1('+')), more)
143parseCC('\\':'*':more) = (ParseSuccess (cc1('*')), more)
144parseCC('\\':'(':more) = (ParseSuccess (cc1('(')), more)
145parseCC('\\':')':more) = (ParseSuccess (cc1(')')), more)
146parseCC('\\':'{':more) = (ParseSuccess (cc1('{')), more)
147parseCC('\\':'}':more) = (ParseSuccess (cc1('}')), more)
148parseCC('\\':'[':more) = (ParseSuccess (cc1('[')), more)
149parseCC('\\':']':more) = (ParseSuccess (cc1(']')), more)
150parseCC('\\':'|':more) = (ParseSuccess (cc1('|')), more)
151parseCC('\\':'.':more) = (ParseSuccess (cc1('.')), more)
152parseCC('\\':'\\':more) = (ParseSuccess (cc1('\\')), more)
153--
154-- Any other use of backslash is an error.
155--
156parseCC('\\':more) = (ParseFailure "Illegal backslash escape", more)
157--
158
159parseCC('[':'^':more) = negateCharClassResult(parseCC_body(more))
160parseCC('[':more) = parseCC_body(more)
161--
162-- Now just have a single character, but it cannot be a metacharacter.
163parseCC(c:more)
164   | elem c "?+*(){}[]|"   = (ParseFailure "Metacharacter alone", c:more)
165   | otherwise             = (ParseSuccess (cc1 c), more)
166
167
168--
169-- To parse the body of a character class, we use helpers
170-- parseCC_body1(c, s)
171parseCC_body :: String -> (ParseResult, String)
172parseCC_body1 :: (Char, String, SparseCharClass) -> (ParseResult, String)
173parseCC_body0 :: (String, SparseCharClass) -> (ParseResult, String)
174
175parseCC_body([]) = (ParseFailure "Unclosed character class", [])
176parseCC_body(a:more) = parseCC_body1(a, more, [])
177
178
179
180parseCC_body1(a, [], ccSoFar) = (ParseFailure "Unclosed character class", [])
181parseCC_body1(a, ']':more, ccSoFar) = (ParseSuccess (CC (insert1(ord(a), ccSoFar))), more)
182parseCC_body1(a, [_], ccSoFar) = (ParseFailure "Unclosed character class", [])
183parseCC_body1(a, '-':']':more, ccSoFar) = (ParseSuccess (CC (insert1(ord(a), insert1(ord('-'), ccSoFar)))), more)
184parseCC_body1(a, '-':b:more, ccSoFar) = parseCC_body0(more, insertRange(ord(a), ord(b), ccSoFar))
185parseCC_body1(a, b:more, ccSoFar) = parseCC_body1(b, more, insert1(ord(a), ccSoFar))
186
187parseCC_body0([], ccSoFar) = (ParseFailure "Unclosed character class", [])
188parseCC_body0(']':more, ccSoFar) = (ParseSuccess (CC ccSoFar), more)
189parseCC_body0('-':']':more, ccSoFar) = (ParseSuccess (CC (insert1(ord('-'), ccSoFar))), more)
190parseCC_body0('-':more, ccSoFar) = (ParseFailure "Bad range in character class", more)
191parseCC_body0(a:more, ccSoFar) = parseCC_body1(a, more, ccSoFar)
192
193
194negateCharClassResult(ParseSuccess (CC s), remaining) = 
195-- should be the following, but there are issues
196--   (ParseSuccess (CC (negateClass s)), remaining)
197-- We use the following hack for ASCII for now
198   (ParseSuccess (CC (negateClass (insert1(10, insertRange(128, 0x10FFFF, s))))), remaining) 
199negateCharClassResult(failureResult, remaining) = (failureResult, remaining)
200
201
202
203parseInt1 :: (String, Int) -> (Int, String)
204parseInt1([],numSoFar) = (numSoFar, [])
205parseInt1(d:more,numSoFar) 
206  | isDigit(d)    = parseInt1(more, numSoFar * 10 + (ord(d) - 48))
207  | otherwise     = (numSoFar, d:more)
208
209parseInt :: String -> (Maybe Int, String)
210parseInt([]) = (Nothing, [])
211parseInt(d:more)
212  | isDigit(d)    = let (i, remaining) = parseInt1(more, ord(d) - 48) in (Just i, remaining)
213  | otherwise     = (Nothing, d:more)
214 
Note: See TracBrowser for help on using the repository browser.