source: proto/REgen/abnf2re.py @ 1495

Last change on this file since 1495 was 892, checked in by cameron, 9 years ago

REgen: regexp generator for ABNF

File size: 6.5 KB
Line 
1from REbuild import *
#
#  abnf2re - generate regular expressions from ABNF
#
#  Robert D. Cameron, May 5, 2003
#
#
#-----------------------------------------------------------
#
# ABNF parsing equations - autogenerated by ABNF-bootstrap.py
#                          then by abnf2re abnf.abnf
#   Except:
#     - order of alternatives within repeat has been reversed.
#     - CRLF changed to \n
#
# Regular-expression fragments for the ABNF core rules and the ABNF
# meta-grammar.  Each name holds pattern *text* (not a compiled regex) so the
# fragments can be concatenated into larger patterns below.
prose_val = "<[ -=?-~]*>"
ALPHA = "[A-Za-z]"
BIT = "[01]"
CHAR = "[\x01-\x7f]"
CR = "\x0d"
CTL = "[\x00-\x1f\x7f]"
DIGIT = "[0-9]"
DQUOTE = "\""
HEXDIG = "[0-9A-Fa-f]"
HTAB = "\x09"
LF = "\x0a"
OCTET = "[\x00-\xff]"
SP = " "
VCHAR = "[!-~]"
WSP = "[\x09 ]"
# Regex backslashes are written doubled so the string literals contain no
# invalid escape sequences; the resulting pattern text is unchanged.
rulename = ALPHA + "[\\-0-9A-Za-z]*"
# The starred form is tried first; the generated order (DIGIT+ first) would
# match only the leading digits of something like "1*3".
repeat = DIGIT + "*\\*" + DIGIT + "*|" + DIGIT + "+"
char_val = DQUOTE + "[ !#-~]*" + DQUOTE
bin_val = "[bB]" + BIT + "+(?:(?:\\." + BIT + "+)+|(?:-" + BIT + "+))?"
dec_val = "[dD]" + DIGIT + "+(?:(?:\\." + DIGIT + "+)+|(?:-" + DIGIT + "+))?"
hex_val = "[xX]" + HEXDIG + "+(?:(?:\\." + HEXDIG + "+)+|(?:-" + HEXDIG + "+))?"
# CRLF is deliberately a bare newline here (the generated form was CR + LF).
CRLF = '\n'
comment = ";[\x09 -~]*" + CRLF
num_val = "%(?:" + bin_val + "|" + dec_val + "|" + hex_val + ")"
LWSP = "(?:[\x09 ]|" + CRLF + WSP + ")*"
# NOTE: c_nl is a bare alternation; wrap it in (?:...) before appending
# anything to it.
c_nl = comment + "|" + CRLF
c_wsp = "[\x09 ]|(?:(?:" + c_nl + ")" + WSP + ")"
# "=/" must be tried before "=": regex alternation takes the first branch
# that matches, so with "=" first the incremental-alternative operator "=/"
# could never be recognised.
defined_as = "(?:" + c_wsp + ")*(?:=/|=)(?:" + c_wsp + ")*"
46#
47#-----------------------------------------------------------
48
49
class ParseFailure(Exception):
    """Signals that the input could not be matched at the current position.

    The value given to the constructor is stored on the ``str`` attribute
    and is returned unchanged by ``__str__``.
    """

    def __init__(self, str):
        # The parameter name (which shadows the builtin) is part of the
        # existing signature and is kept as-is.
        Exception.__init__(self, str)
        self.str = str

    def __str__(self):
        return self.str
55
def parse_by_RE(regexp, str):
    """Match compiled pattern *regexp* at the start of *str*.

    Returns a ``(matched_text, remainder)`` pair.  Raises ParseFailure,
    constructed with the whole of *str*, when the pattern does not match.
    """
    found = regexp.match(str)
    if found is None:
        raise ParseFailure(str)
    cut = found.end(0)
    return (found.group(0), str[cut:])
60
def charCode(codeStr, base):
    """Convert the digit string *codeStr*, written in *base*, to a character.

    Bases 10 and 2 go through ``chr``; base 16 returns whatever
    ``binascii.unhexlify`` yields for the (left-padded) digits.  Any other
    base falls through and returns None.
    """
    if base == 10:
        return chr(int(codeStr))
    if base == 2:
        # Accumulate the value most-significant bit first.
        value = 0
        for bit in codeStr:
            value = 2 * value + int(bit)
        return chr(value)
    if base == 16:
        # unhexlify needs an even number of hex digits, so pad on the left.
        padded = codeStr
        if len(padded) % 2 == 1:
            padded = '0' + padded
        return binascii.unhexlify(padded)
68
def processNumVal(elem):
    """Translate an ABNF numeric value (``%b..``, ``%d..`` or ``%x..``).

    *elem* is the matched text including the leading '%'.  A hyphenated
    form such as ``%x30-39`` becomes a CharRange over the two end points;
    otherwise the dot-separated codes are converted one by one with
    charCode and concatenated into a single Literal.
    """
    # The num_val patterns accept the base letter in either case, so
    # normalise it before dispatching (an upper-case 'D' or 'B' would
    # otherwise be treated as hexadecimal).
    marker = elem[1].lower()
    if marker == 'd':
        base = 10
    elif marker == 'b':
        base = 2
    else:
        base = 16
    bounds = elem[2:].split('-')
    if len(bounds) == 2:
        return CharRange(charCode(bounds[0], base), charCode(bounds[1], base))
    codes = bounds[0].split('.')
    # Join on the empty string: string.join() defaults to a single-space
    # separator, which put a space between every pair of characters.
    return Literal(''.join([charCode(code, base) for code in codes]))
79       
80   
81   
simple_element = rulename + '|' + char_val  + '|' + num_val + '|' + prose_val
def parseElement(s):
    """Parse one ABNF element from the front of *s*.

    Dispatches on the first character: '(' starts a group and '[' an
    option; anything else must match simple_element.  Returns a
    ``(node, remainder)`` pair.  Raises ParseFailure when nothing matches,
    including when *s* is empty.
    """
    # Slice instead of indexing so that empty input reaches parse_by_RE and
    # is reported as ParseFailure (which callers catch) rather than
    # IndexError.
    lead = s[:1]
    if lead == '(':
        return parseGroup(s)
    if lead == '[':
        return parseOption(s)
    elem, afterelem = parse_by_RE(re.compile(simple_element), s)
    # Every alternative of simple_element starts with a required character,
    # so elem is never empty here.
    kind = elem[0]
    if kind == '"':
        # Quoted string: drop the surrounding quotes.
        return (anycase_Literal(elem[1:-1]), afterelem)
    if kind == '<':
        # Prose value: kept verbatim, angle brackets included.
        return (Literal(elem), afterelem)
    if kind == '%':
        return (processNumVal(elem), afterelem)
    return (Var(elem), afterelem)
92
group_start = '\((?:' + c_wsp + ')*'
group_end = '(?:' + c_wsp + ')*\)'

def parseGroup(s):
    """Parse a parenthesised group: '(' alternation ')'.

    Returns ``(Group(inner), remainder)``; ParseFailure from any step
    propagates to the caller.
    """
    _, rest = parse_by_RE(re.compile(group_start), s)
    body, rest = parseAlternation(rest)
    _, rest = parse_by_RE(re.compile(group_end), rest)
    return (Group(body), rest)
101
option_start = '\[(?:' + c_wsp + ')*'
option_end = '(?:' + c_wsp + ')*\]'

def parseOption(s):
    """Parse a bracketed optional element: '[' alternation ']'.

    Returns ``(Opt(inner), remainder)``; ParseFailure from any step
    propagates to the caller.
    """
    _, rest = parse_by_RE(re.compile(option_start), s)
    body, rest = parseAlternation(rest)
    _, rest = parse_by_RE(re.compile(option_end), rest)
    return (Opt(body), rest)
110
alternation_sep = re.compile('(?:' + c_wsp + ')*/(?:' + c_wsp + ')*')
def parseAlternation(s):
    """Parse one or more concatenations separated by '/'.

    Returns ``(node, remainder)``.  A single alternative is returned as-is;
    two or more are wrapped in an AltList.  A ParseFailure raised while
    parsing the concatenation that follows a '/' is not caught here.
    """
    first, rest = parseConcatenation(s)
    branches = [first]
    while True:
        try:
            _, after_slash = parse_by_RE(alternation_sep, rest)
        except ParseFailure:
            # No further '/' separator: the alternation ends here.
            break
        branch, rest = parseConcatenation(after_slash)
        branches.append(branch)
    if len(branches) == 1:
        return (branches[0], rest)
    return (AltList(branches), rest)
126   
127
concatenation_sep = re.compile('(?:' + c_wsp + ')+')
def parseConcatenation(s):
    """Parse one or more repetitions separated by whitespace.

    Returns ``(node, remainder)``.  A single item is returned as-is; two or
    more are wrapped in a JoinList.  When a separator matches but no
    repetition follows it, the separator is not consumed: the remainder
    starts right after the last item that parsed.
    """
    first, rest = parseRepetition(s)
    parts = [first]
    while True:
        try:
            _, after_gap = parse_by_RE(concatenation_sep, rest)
            # rest is only rebound if this call succeeds.
            part, rest = parseRepetition(after_gap)
        except ParseFailure:
            break
        parts.append(part)
    if len(parts) > 1:
        return (JoinList(parts), rest)
    return (first, rest)
141           
def parseRepetition(s):
    """Parse an element with an optional repeat prefix (``n``, ``*``, ``a*b``).

    Without a prefix this is just parseElement.  With one, the result is
    built from the bounds:
      - ``n``       -> Repeat(elem, n, n)
      - ``*``       -> Star(elem)
      - ``1*``      -> Plus(elem)
      - ``a*``      -> Repeat(elem, a, -1)
      - ``a*b``     -> Repeat(elem, a, b)   (a missing lower bound is 0)
    Returns ``(node, remainder)``.
    """
    try:
        rep, rest = parse_by_RE(re.compile(repeat), s)
    except ParseFailure:
        return parseElement(s)
    elem, rest = parseElement(rest)
    bounds = rep.split('*')
    lower = 0
    if bounds[0] != '':
        lower = int(bounds[0])
    if len(bounds) == 1:
        # No '*' at all: an exact count.
        return (Repeat(elem, lower, lower), rest)
    if bounds[1] != '':
        return (Repeat(elem, lower, int(bounds[1])), rest)
    # Open-ended upper bound.
    if lower == 0:
        return (Star(elem), rest)
    if lower == 1:
        return (Plus(elem), rest)
    return (Repeat(elem, lower, -1), rest)
156
157
def parseRule(s):
    """Parse one rule: rulename, defined-as, alternation, end of line.

    Returns ``(Rule(Var(name), body), remainder)``.  Raises ParseFailure if
    any part is missing.
    """
    name, rest = parse_by_RE(re.compile(rulename), s)
    _, rest = parse_by_RE(re.compile(defined_as), rest)
    body, rest = parseAlternation(rest)
    # c_nl is itself an alternation (comment | newline).  It has to be
    # grouped here: appended bare, the pattern reads as
    # "(c_wsp)* comment  |  newline", so a rule with trailing blanks before
    # the newline (and no comment) could not be terminated.
    rule_end = '(?:' + c_wsp + ')*(?:' + c_nl + ')'
    _, rest = parse_by_RE(re.compile(rule_end), rest)
    return (Rule(Var(name), body), rest)
165
# NOTE(review): c_wsp and c_nl are spliced in without their own groups, so
# their '|' operators split this pattern into alternatives inside the outer
# group.  Left exactly as it was; confirm whether that is intended.
rule_sep = re.compile('(?:' + c_wsp + '*' + c_nl + ')*')
def parseGrammar(s):
    """Parse as many rules as possible from *s* and return them as a list.

    Parsing stops at the first ParseFailure.  Any input left over at that
    point is printed; the rules collected so far are returned either way.
    """
    rules = []
    remaining = s
    try:
        while True:
            rule, remaining = parseRule(remaining)
            rules.append(rule)
            _, remaining = parse_by_RE(rule_sep, remaining)
    except ParseFailure:
        # remaining still holds the text after the last successful step.
        pass
    if remaining != '':
        print(remaining)
    return rules
178       
179#
180#
181#  Main code.
182#
183
def main():
    """Read an ABNF grammar and write the generated regular expressions.

    Usage: abnf2re.py [infile [outfile]].  Input defaults to stdin and
    output to stdout.  The regular subgrammar is written first, then a
    separator line, then the recursive subgrammar.
    """
    import sys
    # Read all input before touching the output, so a failure to open or
    # parse the input never leaves a truncated output file behind.
    if len(sys.argv) >= 2:
        infile = open(sys.argv[1])
        try:
            text = infile.read()
        finally:
            infile.close()
    else:
        text = sys.stdin.read()
    theGrammar = simplify_grammar(parseGrammar(text))

    opened_output = len(sys.argv) >= 3
    if opened_output:
        outfile = open(sys.argv[2], 'w')
    else:
        outfile = sys.stdout
    try:
        outfile.write(grammar_gen(regular_subgrammar(theGrammar), pySyntax))
        outfile.write('------------------------------------------\n')
        outfile.write(grammar_gen(recursive_subgrammar(theGrammar), pySyntax))
    finally:
        # Only close a file this function opened; sys.stdout is just flushed.
        if opened_output:
            outfile.close()
        else:
            outfile.flush()

if __name__ == "__main__": main()
Note: See TracBrowser for help on using the repository browser.