source: proto/RE2Pablo/ref/re2pbs/antlr/antlr_python_runtime-3.1.3/build/lib.linux-i686-2.7/antlr3/recognizers.py @ 2226

Last change on this file since 2226 was 2226, checked in by ksherdy, 7 years ago

Initial check in.

File size: 50.5 KB
1"""ANTLR3 runtime package"""
2
3# begin[licence]
4#
5# [The "BSD licence"]
6# Copyright (c) 2005-2008 Terence Parr
7# All rights reserved.
8#
9# Redistribution and use in source and binary forms, with or without
10# modification, are permitted provided that the following conditions
11# are met:
12# 1. Redistributions of source code must retain the above copyright
13#    notice, this list of conditions and the following disclaimer.
14# 2. Redistributions in binary form must reproduce the above copyright
15#    notice, this list of conditions and the following disclaimer in the
16#    documentation and/or other materials provided with the distribution.
17# 3. The name of the author may not be used to endorse or promote products
18#    derived from this software without specific prior written permission.
19#
20# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
21# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
22# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
23# IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
24# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
25# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
29# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30#
31# end[licence]
32
33import sys
34import inspect
35
36from antlr3 import runtime_version, runtime_version_str
37from antlr3.constants import DEFAULT_CHANNEL, HIDDEN_CHANNEL, EOF, \
38     EOR_TOKEN_TYPE, INVALID_TOKEN_TYPE
39from antlr3.exceptions import RecognitionException, MismatchedTokenException, \
40     MismatchedRangeException, MismatchedTreeNodeException, \
41     NoViableAltException, EarlyExitException, MismatchedSetException, \
42     MismatchedNotSetException, FailedPredicateException, \
43     BacktrackingFailed, UnwantedTokenException, MissingTokenException
44from antlr3.tokens import CommonToken, EOF_TOKEN, SKIP_TOKEN
45from antlr3.compat import set, frozenset, reversed
46
47
48class RecognizerSharedState(object):
49    """
50    The set of fields needed by an abstract recognizer to recognize input
51    and recover from errors etc...  As a separate state object, it can be
52    shared among multiple grammars; e.g., when one grammar imports another.
53
54    These fields are publicly visible, but the actual state pointer per
55    parser is protected.
56    """
57
58    def __init__(self):
59        # Track the set of token types that can follow any rule invocation.
60        # Stack grows upwards.
61        self.following = []
62
63        # This is true when we see an error and before having successfully
64        # matched a token.  Prevents generation of more than one error message
65        # per error.
66        self.errorRecovery = False
67
68        # The index into the input stream where the last error occurred.
69        # This is used to prevent infinite loops where an error is found
70        # but no token is consumed during recovery...another error is found,
71        # ad naseum.  This is a failsafe mechanism to guarantee that at least
72        # one token/tree node is consumed for two errors.
73        self.lastErrorIndex = -1
74
75        # If 0, no backtracking is going on.  Safe to exec actions etc...
76        # If >0 then it's the level of backtracking.
77        self.backtracking = 0
78
79        # A dict, keyed by rule index, of memoization tables that track the
80        # stop token index for each rule.  ruleMemo[ruleIndex] is the
81        # memoization table for ruleIndex.  For key ruleStartIndex, you get
82        # back the stop token index of the associated rule, or MEMO_RULE_FAILED.
83        #
84        # This is only used if rule memoization is on (which it is by default).
85        self.ruleMemo = None
86
87        ## Did the recognizer encounter a syntax error?  Track how many.
88        self.syntaxErrors = 0
89
90
91        # LEXER FIELDS (must be in same state object to avoid casting
92        # constantly in generated code and Lexer object) :(
93
94
95        ## The goal of all lexer rules/methods is to create a token object.
96        # This is an instance variable as multiple rules may collaborate to
97        # create a single token.  nextToken will return this object after
98        # matching lexer rule(s).  If you subclass to allow multiple token
99        # emissions, then set this to the last token to be matched or
100        # something non-None so that the auto token emit mechanism will not
101        # emit another token.
102        self.token = None
103
104        ## What character index in the stream did the current token start at?
105        # Needed, for example, to get the text for current token.  Set at
106        # the start of nextToken.
107        self.tokenStartCharIndex = -1
108
109        ## The line on which the first character of the token resides
110        self.tokenStartLine = None
111
112        ## The character position of first character within the line
113        self.tokenStartCharPositionInLine = None
114
115        ## The channel number for the current token
116        self.channel = None
117
118        ## The token type for the current token
119        self.type = None
120
121        ## You can set the text for the current token to override what is in
122        # the input char buffer.  Use setText() or set this instance var directly.
123        self.text = None
124       
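# Illustrative sketch (not part of the original runtime): because the state
# object above is passed into the recognizer constructors below rather than
# created internally, a composite grammar (one grammar importing another) can
# hand the same RecognizerSharedState to the root recognizer and its
# delegates.  The generated class names here are hypothetical.
#
#   shared = RecognizerSharedState()
#   root = MyParser(token_stream, state=shared)                # hypothetical
#   delegate = MyParser_MyImport(token_stream, state=shared)   # hypothetical
#
# Both recognizers then share errorRecovery, backtracking depth, ruleMemo, etc.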
125
126class BaseRecognizer(object):
127    """
128    @brief Common recognizer functionality.
129   
130    A generic recognizer that can handle recognizers generated from
131    lexer, parser, and tree grammars.  This is all the parsing
132    support code essentially; most of it is error recovery stuff and
133    backtracking.
134    """
135
136    MEMO_RULE_FAILED = -2
137    MEMO_RULE_UNKNOWN = -1
138
139    # copies from Token object for convenience in actions
140    DEFAULT_TOKEN_CHANNEL = DEFAULT_CHANNEL
141
142    # for convenience in actions
143    HIDDEN = HIDDEN_CHANNEL
144
145    # overridden by generated subclasses
146    tokenNames = None
147
148    # The antlr_version attribute was introduced in 3.1. If it is not
149    # overridden in the generated recognizer, we assume a default of 3.0.1.
150    antlr_version = (3, 0, 1, 0)
151    antlr_version_str = "3.0.1"
152
153    def __init__(self, state=None):
154        # Input stream of the recognizer. Must be initialized by a subclass.
155        self.input = None
156
157        ## State of a lexer, parser, or tree parser are collected into a state
158        # object so the state can be shared.  This sharing is needed to
159        # have one grammar import others and share same error variables
160        # and other state variables.  It's a kind of explicit multiple
161        # inheritance via delegation of methods and shared state.
162        if state is None:
163            state = RecognizerSharedState()
164        self._state = state
165
166        if self.antlr_version > runtime_version:
167            raise RuntimeError(
168                "ANTLR version mismatch: "
169                "The recognizer has been generated by V%s, but this runtime "
170                "is V%s. Please use the V%s runtime or higher."
171                % (self.antlr_version_str,
172                   runtime_version_str,
173                   self.antlr_version_str))
174        elif (self.antlr_version < (3, 1, 0, 0) and
175              self.antlr_version != runtime_version):
176            # FIXME: make the runtime compatible with 3.0.1 codegen
177            # and remove this block.
178            raise RuntimeError(
179                "ANTLR version mismatch: "
180                "The recognizer has been generated by V%s, but this runtime "
181                "is V%s. Please use the V%s runtime."
182                % (self.antlr_version_str,
183                   runtime_version_str,
184                   self.antlr_version_str))
185
186    # this one only exists to shut up pylint :(
187    def setInput(self, input):
188        self.input = input
189
190       
191    def reset(self):
192        """
193        Reset the parser's state; subclasses must rewind the input stream.
194        """
195       
196        # wack everything related to error recovery
197        if self._state is None:
198            # no shared state work to do
199            return
200       
201        self._state.following = []
202        self._state.errorRecovery = False
203        self._state.lastErrorIndex = -1
204        self._state.syntaxErrors = 0
205        # wack everything related to backtracking and memoization
206        self._state.backtracking = 0
207        if self._state.ruleMemo is not None:
208            self._state.ruleMemo = {}
209
210
211    def match(self, input, ttype, follow):
212        """
213        Match current input symbol against ttype.  Attempt
214        single token insertion or deletion error recovery.  If
215        that fails, throw MismatchedTokenException.
216
217        To turn off single token insertion or deletion error
218        recovery, override recoverFromMismatchedToken() and have it
219        throw an exception. See TreeParser.recoverFromMismatchedToken().
220        This way any error in a rule will cause an exception and
221        immediate exit from rule.  Rule would recover by resynchronizing
222        to the set of symbols that can follow rule ref.
223        """
224       
225        matchedSymbol = self.getCurrentInputSymbol(input)
226        if self.input.LA(1) == ttype:
227            self.input.consume()
228            self._state.errorRecovery = False
229            return matchedSymbol
230
231        if self._state.backtracking > 0:
232            # FIXME: need to return matchedSymbol here as well. damn!!
233            raise BacktrackingFailed
234
235        matchedSymbol = self.recoverFromMismatchedToken(input, ttype, follow)
236        return matchedSymbol
237
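    # Illustrative sketch (not part of the original runtime): generated rule
    # methods call match() with the expected token type and a precomputed
    # FOLLOW set.  The names ID and FOLLOW_ID_in_decl mirror the usual
    # code-generation pattern but are hypothetical here.
    #
    #   def decl(self):
    #       id_token = self.match(self.input, ID, FOLLOW_ID_in_decl)
    #       # on success the matched token is returned; on a recoverable
    #       # mismatch a deleted-next or conjured token is returned instead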
238
239    def matchAny(self, input):
240        """Match the wildcard (match any input symbol)."""
241
242        self._state.errorRecovery = False
243        self.input.consume()
244
245
246    def mismatchIsUnwantedToken(self, input, ttype):
247        return input.LA(2) == ttype
248
249
250    def mismatchIsMissingToken(self, input, follow):
251        if follow is None:
252            # we have no information about the follow; we can only consume
253            # a single token and hope for the best
254            return False
255       
256        # compute what can follow this grammar element reference
257        if EOR_TOKEN_TYPE in follow:
258            viableTokensFollowingThisRule = self.computeContextSensitiveRuleFOLLOW()
259            follow = follow | viableTokensFollowingThisRule
260
261            if len(self._state.following) > 0:
262                # remove EOR if we're not the start symbol
263                follow = follow - set([EOR_TOKEN_TYPE])
264
265        # if current token is consistent with what could come after set
266        # then we know we're missing a token; error recovery is free to
267        # "insert" the missing token
268        if input.LA(1) in follow or EOR_TOKEN_TYPE in follow:
269            return True
270
271        return False
272
273
274    def reportError(self, e):
275        """Report a recognition problem.
276           
277        This method sets errorRecovery to indicate the parser is recovering,
278        not parsing.  Once in recovery mode, no errors are generated.
279        To get out of recovery mode, the parser must successfully match
280        a token (after a resync).  So it will go:
281
282        1. error occurs
283        2. enter recovery mode, report error
284        3. consume until a token is found in the resync set
285        4. try to resume parsing
286        5. next match() will reset errorRecovery mode
287
288        If you override, make sure to update syntaxErrors if you care about
289        that.
290       
291        """
292       
293        # if we've already reported an error and have not matched a token
294        # yet successfully, don't report any errors.
295        if self._state.errorRecovery:
296            return
297
298        self._state.syntaxErrors += 1 # don't count spurious
299        self._state.errorRecovery = True
300
301        self.displayRecognitionError(self.tokenNames, e)
302
303
304    def displayRecognitionError(self, tokenNames, e):
305        hdr = self.getErrorHeader(e)
306        msg = self.getErrorMessage(e, tokenNames)
307        self.emitErrorMessage(hdr+" "+msg)
308
309
310    def getErrorMessage(self, e, tokenNames):
311        """
312        What error message should be generated for the various
313        exception types?
314       
315        Not very object-oriented code, but I like having all error message
316        generation within one method rather than spread among all of the
317        exception classes. This also makes it much easier for the exception
318        handling because the exception classes do not have to have pointers back
319        to this object to access utility routines and so on. Also, changing
320        the message for an exception type would be difficult because you
321        would have to subclass the exception class, but then somehow get ANTLR
322        to make those kinds of exception objects instead of the default.
323        This looks weird, but trust me--it makes the most sense in terms
324        of flexibility.
325
326        For grammar debugging, you will want to override this to add
327        more information such as the stack frame with
328        getRuleInvocationStack(e, this.getClass().getName()) and,
329        for no viable alts, the decision description and state etc...
330
331        Override this to change the message generated for one or more
332        exception types.
333        """
334
335        if isinstance(e, UnwantedTokenException):
336            tokenName = "<unknown>"
337            if e.expecting == EOF:
338                tokenName = "EOF"
339
340            else:
341                tokenName = self.tokenNames[e.expecting]
342
343            msg = "extraneous input %s expecting %s" % (
344                self.getTokenErrorDisplay(e.getUnexpectedToken()),
345                tokenName
346                )
347
348        elif isinstance(e, MissingTokenException):
349            tokenName = "<unknown>"
350            if e.expecting == EOF:
351                tokenName = "EOF"
352
353            else:
354                tokenName = self.tokenNames[e.expecting]
355
356            msg = "missing %s at %s" % (
357                tokenName, self.getTokenErrorDisplay(e.token)
358                )
359
360        elif isinstance(e, MismatchedTokenException):
361            tokenName = "<unknown>"
362            if e.expecting == EOF:
363                tokenName = "EOF"
364            else:
365                tokenName = self.tokenNames[e.expecting]
366
367            msg = "mismatched input " \
368                  + self.getTokenErrorDisplay(e.token) \
369                  + " expecting " \
370                  + tokenName
371
372        elif isinstance(e, MismatchedTreeNodeException):
373            tokenName = "<unknown>"
374            if e.expecting == EOF:
375                tokenName = "EOF"
376            else:
377                tokenName = self.tokenNames[e.expecting]
378
379            msg = "mismatched tree node: %s expecting %s" \
380                  % (e.node, tokenName)
381
382        elif isinstance(e, NoViableAltException):
383            msg = "no viable alternative at input " \
384                  + self.getTokenErrorDisplay(e.token)
385
386        elif isinstance(e, EarlyExitException):
387            msg = "required (...)+ loop did not match anything at input " \
388                  + self.getTokenErrorDisplay(e.token)
389
390        elif isinstance(e, MismatchedSetException):
391            msg = "mismatched input " \
392                  + self.getTokenErrorDisplay(e.token) \
393                  + " expecting set " \
394                  + repr(e.expecting)
395
396        elif isinstance(e, MismatchedNotSetException):
397            msg = "mismatched input " \
398                  + self.getTokenErrorDisplay(e.token) \
399                  + " expecting set " \
400                  + repr(e.expecting)
401
402        elif isinstance(e, FailedPredicateException):
403            msg = "rule " \
404                  + e.ruleName \
405                  + " failed predicate: {" \
406                  + e.predicateText \
407                  + "}?"
408
409        else:
410            msg = str(e)
411
412        return msg
413   
414
415    def getNumberOfSyntaxErrors(self):
416        """
417        Get the number of recognition errors (lexer, parser, tree parser).  Each
418        recognizer tracks its own number, so the parser and lexer each have a
419        separate count.  Does not count the spurious errors found between
420        an error and the next valid token match.
421
422        See also reportError()
423        """
424        return self._state.syntaxErrors
425
426
427    def getErrorHeader(self, e):
428        """
429        What is the error header, normally line/character position information?
430        """
431       
432        return "line %d:%d" % (e.line, e.charPositionInLine)
433
434
435    def getTokenErrorDisplay(self, t):
436        """
437        How should a token be displayed in an error message? The default
438        is to display just the text, but during development you might
439        want to have a lot of information spit out.  Override in that case
440        to use t.toString() (which, for CommonToken, dumps everything about
441        the token). This is better than forcing you to override a method in
442        your token objects because you don't have to go modify your lexer
443        so that it creates a new token class.
444        """
445       
446        s = t.text
447        if s is None:
448            if t.type == EOF:
449                s = "<EOF>"
450            else:
451                s = "<" + str(t.type) + ">"
452
453        return repr(s)
454   
455
456    def emitErrorMessage(self, msg):
457        """Override this method to change where error messages go"""
458        sys.stderr.write(msg + '\n')
459
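    # Illustrative sketch (not part of the original runtime): error output can
    # be redirected by overriding emitErrorMessage() in a subclass of a
    # generated recognizer.  "MyParser" is a hypothetical generated class name.
    #
    #   class CollectingParser(MyParser):
    #       def __init__(self, *args, **kwargs):
    #           MyParser.__init__(self, *args, **kwargs)
    #           self.messages = []
    #
    #       def emitErrorMessage(self, msg):
    #           self.messages.append(msg)   # collect instead of writing to stderr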
460
461    def recover(self, input, re):
462        """
463        Recover from an error found on the input stream.  This is
464        for NoViableAlt and mismatched symbol exceptions.  If you enable
465        single token insertion and deletion, this will usually not
466        handle mismatched symbol exceptions but there could be a mismatched
467        token that the match() routine could not recover from.
468        """
469       
470        # PROBLEM? what if input stream is not the same as last time
471        # perhaps make lastErrorIndex a member of input
472        if self._state.lastErrorIndex == input.index():
473            # uh oh, another error at same token index; must be a case
474            # where LT(1) is in the recovery token set so nothing is
475            # consumed; consume a single token so at least to prevent
476            # an infinite loop; this is a failsafe.
477            input.consume()
478
479        self._state.lastErrorIndex = input.index()
480        followSet = self.computeErrorRecoverySet()
481       
482        self.beginResync()
483        self.consumeUntil(input, followSet)
484        self.endResync()
485
486
487    def beginResync(self):
488        """
489        A hook to listen in on the token consumption during error recovery.
490        The DebugParser subclasses this to fire events to the listener.
491        """
492
493        pass
494
495
496    def endResync(self):
497        """
498        A hook to listen in on the token consumption during error recovery.
499        The DebugParser subclasses this to fire events to the listener.
500        """
501
502        pass
503
504
505    def computeErrorRecoverySet(self):
506        """
507        Compute the error recovery set for the current rule.  During
508        rule invocation, the parser pushes the set of tokens that can
509        follow that rule reference on the stack; this amounts to
510        computing FIRST of what follows the rule reference in the
511        enclosing rule. This local follow set only includes tokens
512        from within the rule; i.e., the FIRST computation done by
513        ANTLR stops at the end of a rule.
514
515        EXAMPLE
516
517        When you find a "no viable alt exception", the input is not
518        consistent with any of the alternatives for rule r.  The best
519        thing to do is to consume tokens until you see something that
520        can legally follow a call to r *or* any rule that called r.
521        You don't want the exact set of viable next tokens because the
522        input might just be missing a token--you might consume the
523        rest of the input looking for one of the missing tokens.
524
525        Consider grammar:
526
527        a : '[' b ']'
528          | '(' b ')'
529          ;
530        b : c '^' INT ;
531        c : ID
532          | INT
533          ;
534
535        At each rule invocation, the set of tokens that could follow
536        that rule is pushed on a stack.  Here are the various "local"
537        follow sets:
538
539        FOLLOW(b1_in_a) = FIRST(']') = ']'
540        FOLLOW(b2_in_a) = FIRST(')') = ')'
541        FOLLOW(c_in_b) = FIRST('^') = '^'
542
543        Upon erroneous input "[]", the call chain is
544
545        a -> b -> c
546
547        and, hence, the follow context stack is:
548
549        depth  local follow set     after call to rule
550          0          <EOF>                   a (from main())
551          1          ']'                     b
552          2          '^'                     c
553
554        Notice that ')' is not included, because b would have to have
555        been called from a different context in rule a for ')' to be
556        included.
557
558        For error recovery, we cannot consider FOLLOW(c)
559        (context-sensitive or otherwise).  We need the combined set of
560        all context-sensitive FOLLOW sets--the set of all tokens that
561        could follow any reference in the call chain.  We need to
562        resync to one of those tokens.  Note that FOLLOW(c)='^' and if
563        we resync'd to that token, we'd consume until EOF.  We need to
564        sync to context-sensitive FOLLOWs for a, b, and c: {']','^'}.
565        In this case, for input "[]", LA(1) is in this set so we would
566        not consume anything and after printing an error rule c would
567        return normally.  It would not find the required '^' though.
568        At this point, it gets a mismatched token error and throws an
569        exception (since LA(1) is not in the viable following token
570        set).  The rule exception handler tries to recover, but finds
571        the same recovery set and doesn't consume anything.  Rule b
572        exits normally returning to rule a.  Now it finds the ']' (and
573        with the successful match exits errorRecovery mode).
574
575        So, you can see that the parser walks up the call chain looking
576        for the token that was a member of the recovery set.
577
578        Errors are not generated in errorRecovery mode.
579
580        ANTLR's error recovery mechanism is based upon original ideas:
581
582        "Algorithms + Data Structures = Programs" by Niklaus Wirth
583
584        and
585
586        "A note on error recovery in recursive descent parsers":
587        http://portal.acm.org/citation.cfm?id=947902.947905
588
589        Later, Josef Grosch had some good ideas:
590
591        "Efficient and Comfortable Error Recovery in Recursive Descent
592        Parsers":
593        ftp://www.cocolab.com/products/cocktail/doca4.ps/ell.ps.zip
594
595        Like Grosch I implemented local FOLLOW sets that are combined
596        at run-time upon error to avoid overhead during parsing.
597        """
598       
599        return self.combineFollows(False)
600
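    # Illustrative sketch (not part of the original runtime): for the grammar
    # in the docstring above and the erroneous input "[]", the follow stack
    # would roughly contain frozensets of token types (RBRACK and CARET are
    # hypothetical symbolic names for ']' and '^'):
    #
    #   self._state.following == [
    #       frozenset([EOR_TOKEN_TYPE]),   # pushed when rule a is invoked
    #       frozenset([RBRACK]),           # ']' can follow b within a
    #       frozenset([CARET]),            # '^' can follow c within b
    #   ]
    #
    # combineFollows(False) unions these into the resync set {']', '^', EOR}.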
601       
602    def computeContextSensitiveRuleFOLLOW(self):
603        """
604        Compute the context-sensitive FOLLOW set for current rule.
605        This is set of token types that can follow a specific rule
606        reference given a specific call chain.  You get the set of
607        viable tokens that can possibly come next (lookahead depth 1)
608        given the current call chain.  Contrast this with the
609        definition of plain FOLLOW for rule r:
610
611         FOLLOW(r)={x | S=>*alpha r beta in G and x in FIRST(beta)}
612
613        where x in T* and alpha, beta in V*; T is set of terminals and
614        V is the set of terminals and nonterminals.  In other words,
615        FOLLOW(r) is the set of all tokens that can possibly follow
616        references to r in *any* sentential form (context).  At
617        runtime, however, we know precisely which context applies as
618        we have the call chain.  We may compute the exact (rather
619        than covering superset) set of following tokens.
620
621        For example, consider grammar:
622
623        stat : ID '=' expr ';'      // FOLLOW(stat)=={EOF}
624             | "return" expr '.'
625             ;
626        expr : atom ('+' atom)* ;   // FOLLOW(expr)=={';','.',')'}
627        atom : INT                  // FOLLOW(atom)=={'+',')',';','.'}
628             | '(' expr ')'
629             ;
630
631        The FOLLOW sets are all inclusive whereas context-sensitive
632        FOLLOW sets are precisely what could follow a rule reference.
633        For the input "i=(3);", here is the derivation:
634
635        stat => ID '=' expr ';'
636             => ID '=' atom ('+' atom)* ';'
637             => ID '=' '(' expr ')' ('+' atom)* ';'
638             => ID '=' '(' atom ')' ('+' atom)* ';'
639             => ID '=' '(' INT ')' ('+' atom)* ';'
640             => ID '=' '(' INT ')' ';'
641
642        At the "3" token, you'd have a call chain of
643
644          stat -> expr -> atom -> expr -> atom
645
646        What can follow that specific nested ref to atom?  Exactly ')'
647        as you can see by looking at the derivation of this specific
648        input.  Contrast this with the FOLLOW(atom)={'+',')',';','.'}.
649
650        You want the exact viable token set when recovering from a
651        token mismatch.  Upon token mismatch, if LA(1) is member of
652        the viable next token set, then you know there is most likely
653        a missing token in the input stream.  "Insert" one by just not
654        throwing an exception.
655        """
656
657        return self.combineFollows(True)
658
659
660    def combineFollows(self, exact):
661        followSet = set()
662        for idx, localFollowSet in reversed(list(enumerate(self._state.following))):
663            followSet |= localFollowSet
664            if exact:
665                # can we see end of rule?
666                if EOR_TOKEN_TYPE in localFollowSet:
667                    # Only leave EOR in set if at top (start rule); this lets
668                    # us know if have to include follow(start rule); i.e., EOF
669                    if idx > 0:
670                        followSet.remove(EOR_TOKEN_TYPE)
671                       
672                else:
673                    # can't see end of rule, quit
674                    break
675
676        return followSet
677
678
679    def recoverFromMismatchedToken(self, input, ttype, follow):
680        """Attempt to recover from a single missing or extra token.
681
682        EXTRA TOKEN
683
684        LA(1) is not what we are looking for.  If LA(2) has the right token,
685        however, then assume LA(1) is some extra spurious token.  Delete it
686        and consume LA(2) as if we were doing a normal match(), which advances
687        the input.
688
689        MISSING TOKEN
690
691        If current token is consistent with what could come after
692        ttype, then it is ok to 'insert' the missing token, else throw an
693        exception.  For example, the input 'i=(3;' is clearly missing the
694        ')'.  When the parser returns from the nested call to expr, it
695        will have call chain:
696
697          stat -> expr -> atom
698
699        and it will be trying to match the ')' at this point in the
700        derivation:
701
702             => ID '=' '(' INT ')' ('+' atom)* ';'
703                                ^
704        match() will see that ';' doesn't match ')' and report a
705        mismatched token error.  To recover, it sees that LA(1)==';'
706        is in the set of tokens that can follow the ')' token
707        reference in rule atom.  It can assume that you forgot the ')'.
708        """
709
710        e = None
711
712        # if next token is what we are looking for then "delete" this token
713        if self.mismatchIsUnwantedToken(input, ttype):
714            e = UnwantedTokenException(ttype, input)
715
716            self.beginResync()
717            input.consume() # simply delete extra token
718            self.endResync()
719
720            # report after consuming so AW sees the token in the exception
721            self.reportError(e)
722
723            # we want to return the token we're actually matching
724            matchedSymbol = self.getCurrentInputSymbol(input)
725
726            # move past ttype token as if all were ok
727            input.consume()
728            return matchedSymbol
729
730        # can't recover with single token deletion, try insertion
731        if self.mismatchIsMissingToken(input, follow):
732            inserted = self.getMissingSymbol(input, e, ttype, follow)
733            e = MissingTokenException(ttype, input, inserted)
734
735            # report after inserting so AW sees the token in the exception
736            self.reportError(e)
737            return inserted
738
739        # even that didn't work; must throw the exception
740        e = MismatchedTokenException(ttype, input)
741        raise e
742
743
744    def recoverFromMismatchedSet(self, input, e, follow):
745        """Not currently used"""
746
747        if self.mismatchIsMissingToken(input, follow):
748            self.reportError(e)
749            # we don't know how to conjure up a token for sets yet
750            return self.getMissingSymbol(input, e, INVALID_TOKEN_TYPE, follow)
751
752        # TODO do single token deletion like above for Token mismatch
753        raise e
754
755
756    def getCurrentInputSymbol(self, input):
757        """
758        Match needs to return the current input symbol, which gets put
759        into the label for the associated token ref; e.g., x=ID.  Token
760        and tree parsers need to return different objects. Rather than test
761        for input stream type or change the IntStream interface, I use
762        a simple method to ask the recognizer to tell me what the current
763        input symbol is.
764
765        This is ignored for lexers.
766        """
767       
768        return None
769
770
771    def getMissingSymbol(self, input, e, expectedTokenType, follow):
772        """Conjure up a missing token during error recovery.
773
774        The recognizer attempts to recover from single missing
775        symbols. But, actions might refer to that missing symbol.
776        For example, x=ID {f($x);}. The action clearly assumes
777        that there has been an identifier matched previously and that
778        $x points at that token. If that token is missing, but
779        the next token in the stream is what we want we assume that
780        this token is missing and we keep going. Because we
781        have to return some token to replace the missing token,
782        we have to conjure one up. This method gives the user control
783        over the tokens returned for missing tokens. Mostly,
784        you will want to create something special for identifier
785        tokens. For literals such as '{' and ',', the default
786        action in the parser or tree parser works. It simply creates
787        a CommonToken of the appropriate type. The text will be the token.
788        If you change what tokens must be created by the lexer,
789        override this method to create the appropriate tokens.
790        """
791
792        return None
793
794
795##     def recoverFromMissingElement(self, input, e, follow):
796##         """
797##         This code is factored out from mismatched token and mismatched set
798##         recovery.  It handles "single token insertion" error recovery for
799##         both.  No tokens are consumed to recover from insertions.  Return
800##         true if recovery was possible else return false.
801##         """
802       
803##         if self.mismatchIsMissingToken(input, follow):
804##             self.reportError(e)
805##             return True
806
807##         # nothing to do; throw exception
808##         return False
809
810
811    def consumeUntil(self, input, tokenTypes):
812        """
813        Consume tokens until one matches the given token or token set
814
815        tokenTypes can be a single token type or a set of token types
816       
817        """
818       
819        if not isinstance(tokenTypes, (set, frozenset)):
820            tokenTypes = frozenset([tokenTypes])
821
822        ttype = input.LA(1)
823        while ttype != EOF and ttype not in tokenTypes:
824            input.consume()
825            ttype = input.LA(1)
826
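    # Illustrative sketch (not part of the original runtime): panic-mode
    # resync to a statement terminator.  SEMI is a hypothetical token type
    # constant from a generated parser.
    #
    #   self.beginResync()
    #   self.consumeUntil(self.input, SEMI)               # single token type
    #   self.consumeUntil(self.input, set([SEMI, EOF]))   # or a set of types
    #   self.endResync()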
827
828    def getRuleInvocationStack(self):
829        """
830        Return List<String> of the rules in your parser instance
831        leading up to a call to this method.  You could override if
832        you want more details such as the file/line info of where
833        in the parser Python code a rule is invoked.
834
835        This is very useful for error messages and for context-sensitive
836        error recovery.
837
838        You must be careful if you subclass a generated recognizer.
839        The default implementation will only search the module of self
840        for rules, but the subclass will not contain any rules.
841        You probably want to override this method to look like
842
843        def getRuleInvocationStack(self):
844            return self._getRuleInvocationStack(<class>.__module__)
845
846        where <class> is the class of the generated recognizer, e.g.
847        the superclass of self.
848        """
849
850        return self._getRuleInvocationStack(self.__module__)
851
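    # Illustrative sketch (not part of the original runtime): the override
    # suggested in the docstring above, for a subclass of a hypothetical
    # generated recognizer "MyParser":
    #
    #   class MySubParser(MyParser):
    #       def getRuleInvocationStack(self):
    #           # search the generated module, not the subclass's module
    #           return self._getRuleInvocationStack(MyParser.__module__)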
852
853    def _getRuleInvocationStack(cls, module):
854        """
855        A more general version of getRuleInvocationStack where you can
856        pass in, for example, a RecognitionException to get its rule
857        stack trace.  This routine is shared by all recognizers, hence,
858        static.
859
860        TODO: move to a utility class or something; weird having lexer call
861        this
862        """
863
864        # mmmhhh,... perhaps look at the first argument
865        # (f_locals[co_varnames[0]]?) and test if it's a (sub)class of
866        # requested recognizer...
867       
868        rules = []
869        for frame in reversed(inspect.stack()):
870            code = frame[0].f_code
871            codeMod = inspect.getmodule(code)
872            if codeMod is None:
873                continue
874
875            # skip frames not in requested module
876            if codeMod.__name__ != module:
877                continue
878
879            # skip some unwanted names
880            if code.co_name in ('nextToken', '<module>'):
881                continue
882
883            rules.append(code.co_name)
884
885        return rules
886       
887    _getRuleInvocationStack = classmethod(_getRuleInvocationStack)
888   
889
890    def getBacktrackingLevel(self):
891        return self._state.backtracking
892
893    def setBacktrackingLevel(self, n):
894        self._state.backtracking = n
895
896
897    def failed(self):
898        """Return whether or not a backtracking attempt failed."""
899
900        return self._state.failed
901
902
903    def getGrammarFileName(self):
904        """For debugging and other purposes, you might want the grammar name.
905       
906        Have ANTLR generate an implementation for this method.
907        """
908
909        return self.grammarFileName
910
911
912    def getSourceName(self):
913        raise NotImplementedError
914
915   
916    def toStrings(self, tokens):
917        """A convenience method for use most often with template rewrites.
918
919        Convert a List<Token> to List<String>
920        """
921
922        if tokens is None:
923            return None
924
925        return [token.text for token in tokens]
926
927
928    def getRuleMemoization(self, ruleIndex, ruleStartIndex):
929        """
930        Given a rule number and a start token index number, return
931        MEMO_RULE_UNKNOWN if the rule has not parsed input starting from
932        start index.  If this rule has parsed input starting from the
933        start index before, then return where the rule stopped parsing.
934        It returns the index of the last token matched by the rule.
935        """
936       
937        if ruleIndex not in self._state.ruleMemo:
938            self._state.ruleMemo[ruleIndex] = {}
939
940        return self._state.ruleMemo[ruleIndex].get(
941            ruleStartIndex, self.MEMO_RULE_UNKNOWN
942            )
943
944
945    def alreadyParsedRule(self, input, ruleIndex):
946        """
947        Has this rule already parsed input at the current index in the
948        input stream?  Return the stop token index or MEMO_RULE_UNKNOWN.
949        If we attempted but failed to parse properly before, return
950        MEMO_RULE_FAILED.
951
952        This method has a side-effect: if we have seen this input for
953        this rule and successfully parsed before, then seek ahead to
954        1 past the stop token matched for this rule last time.
955        """
956
957        stopIndex = self.getRuleMemoization(ruleIndex, input.index())
958        if stopIndex == self.MEMO_RULE_UNKNOWN:
959            return False
960
961        if stopIndex == self.MEMO_RULE_FAILED:
962            raise BacktrackingFailed
963
964        else:
965            input.seek(stopIndex + 1)
966
967        return True
968
969
970    def memoize(self, input, ruleIndex, ruleStartIndex, success):
971        """
972        Record whether or not this rule parsed the input at this position
973        successfully.
974        """
975
976        if success:
977            stopTokenIndex = input.index() - 1
978        else:
979            stopTokenIndex = self.MEMO_RULE_FAILED
980       
981        if ruleIndex in self._state.ruleMemo:
982            self._state.ruleMemo[ruleIndex][ruleStartIndex] = stopTokenIndex
983
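    # Illustrative sketch (not part of the original runtime): while
    # backtracking, a generated rule method typically brackets its body with
    # the two memoization calls above (the rule index 7 is arbitrary):
    #
    #   if self._state.backtracking > 0 and self.alreadyParsedRule(self.input, 7):
    #       return
    #   start = self.input.index()
    #   try:
    #       ...                                   # match the rule body
    #   finally:
    #       if self._state.backtracking > 0:
    #           self.memoize(self.input, 7, start, success)   # success: did the body match?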
984
985    def traceIn(self, ruleName, ruleIndex, inputSymbol):
986        sys.stdout.write("enter %s %s" % (ruleName, inputSymbol))
987       
988        if self._state.backtracking > 0:
989            sys.stdout.write(" backtracking=%s" % self._state.backtracking)
990
991        sys.stdout.write('\n')
992
993
994    def traceOut(self, ruleName, ruleIndex, inputSymbol):
995        sys.stdout.write("exit %s %s" % (ruleName, inputSymbol))
996       
997        if self._state.backtracking > 0:
998            sys.stdout.write(" backtracking=%s" % self._state.backtracking)
999
1000        if self._state.failed:
1001            sys.stdout.write(" failed")
1002        else:
1003            sys.stdout.write(" succeeded")
1004
1005        sys.stdout.write('\n')
1006
1007
1008class TokenSource(object):
1009    """
1010    @brief Abstract baseclass for token producers.
1011   
1012    A source of tokens must provide a sequence of tokens via nextToken()
1013    and also must reveal its source of characters; CommonToken's text is
1014    computed from a CharStream; it only stores indices into the char stream.
1015
1016    Errors from the lexer are never passed to the parser.  Either you want
1017    to keep going or you do not upon token recognition error.  If you do not
1018    want to continue lexing then you do not want to continue parsing.  Just
1019    throw an exception not derived from RecognitionException and it will
1020    naturally propagate all the way out of the recognizers.  If you want to continue
1021    lexing then you should not throw an exception to the parser--it has already
1022    requested a token.  Keep lexing until you get a valid one.  Just report
1023    errors and keep going, looking for a valid token.
1024    """
1025   
1026    def nextToken(self):
1027        """Return a Token object from your input stream (usually a CharStream).
1028       
1029        Do not fail/return upon lexing error; keep chewing on the characters
1030        until you get a good one; errors are not passed through to the parser.
1031        """
1032
1033        raise NotImplementedError
1034   
1035
1036    def __iter__(self):
1037        """The TokenSource is an iterator.
1038
1039        The iteration will not include the final EOF token, see also the note
1040        for the next() method.
1041
1042        """
1043       
1044        return self
1045
1046   
1047    def next(self):
1048        """Return next token or raise StopIteration.
1049
1050        Note that this will raise StopIteration when hitting the EOF token,
1051        so EOF will not be part of the iteration.
1052       
1053        """
1054
1055        token = self.nextToken()
1056        if token is None or token.type == EOF:
1057            raise StopIteration
1058        return token
1059
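    # Illustrative sketch (not part of the original runtime): because a
    # TokenSource is iterable (see next() above), a generated lexer can be
    # consumed directly.  "MyLexer" is a hypothetical generated class and
    # ANTLRStringStream comes from the wider runtime, assumed here.
    #
    #   lexer = MyLexer(ANTLRStringStream("a = 1;"))
    #   for token in lexer:            # iteration stops before the EOF token
    #       print token.type, token.text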
1060   
1061class Lexer(BaseRecognizer, TokenSource):
1062    """
1063    @brief Baseclass for generated lexer classes.
1064   
1065    A lexer is a recognizer that draws input symbols from a character stream.
1066    Lexer grammars result in a subclass of this object. A Lexer object
1067    uses simplified match() and error recovery mechanisms in the interest
1068    of speed.
1069    """
1070
1071    def __init__(self, input, state=None):
1072        BaseRecognizer.__init__(self, state)
1073        TokenSource.__init__(self)
1074       
1075        # Where is the lexer drawing characters from?
1076        self.input = input
1077
1078
1079    def reset(self):
1080        BaseRecognizer.reset(self) # reset all recognizer state variables
1081
1082        if self.input is not None:
1083            # rewind the input
1084            self.input.seek(0)
1085
1086        if self._state is None:
1087            # no shared state work to do
1088            return
1089       
1090        # wack Lexer state variables
1091        self._state.token = None
1092        self._state.type = INVALID_TOKEN_TYPE
1093        self._state.channel = DEFAULT_CHANNEL
1094        self._state.tokenStartCharIndex = -1
1095        self._state.tokenStartLine = -1
1096        self._state.tokenStartCharPositionInLine = -1
1097        self._state.text = None
1098
1099
1100    def nextToken(self):
1101        """
1102        Return a token from this source; i.e., match a token on the char
1103        stream.
1104        """
1105       
1106        while 1:
1107            self._state.token = None
1108            self._state.channel = DEFAULT_CHANNEL
1109            self._state.tokenStartCharIndex = self.input.index()
1110            self._state.tokenStartCharPositionInLine = self.input.charPositionInLine
1111            self._state.tokenStartLine = self.input.line
1112            self._state.text = None
1113            if self.input.LA(1) == EOF:
1114                return EOF_TOKEN
1115
1116            try:
1117                self.mTokens()
1118               
1119                if self._state.token is None:
1120                    self.emit()
1121                   
1122                elif self._state.token == SKIP_TOKEN:
1123                    continue
1124
1125                return self._state.token
1126
1127            except NoViableAltException, re:
1128                self.reportError(re)
1129                self.recover(re) # throw out current char and try again
1130
1131            except RecognitionException, re:
1132                self.reportError(re)
1133                # match() routine has already called recover()
1134
1135
1136    def skip(self):
1137        """
1138        Instruct the lexer to skip creating a token for current lexer rule
1139        and look for another token.  nextToken() knows to keep looking when
1140        a lexer rule finishes with token set to SKIP_TOKEN.  Recall that
1141        if token is None at the end of any token rule, it creates one for you
1142        and emits it.
1143        """
1144       
1145        self._state.token = SKIP_TOKEN
1146
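    # Illustrative sketch (not part of the original runtime): a typical
    # whitespace rule calls skip() in its action so that nextToken() keeps
    # scanning instead of emitting a token.  mWS is a hypothetical generated
    # lexer rule; note that LA(1) returns a character ordinal in the lexer.
    #
    #   def mWS(self):
    #       while self.input.LA(1) in (ord(' '), ord('\t'), ord('\n')):
    #           self.input.consume()
    #       self.skip()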
1147
1148    def mTokens(self):
1149        """This is the lexer entry point that sets instance var 'token'"""
1150
1151        # abstract method
1152        raise NotImplementedError
1153   
1154
1155    def setCharStream(self, input):
1156        """Set the char stream and reset the lexer"""
1157        self.input = None
1158        self.reset()
1159        self.input = input
1160
1161
1162    def getSourceName(self):
1163        return self.input.getSourceName()
1164
1165
1166    def emit(self, token=None):
1167        """
1168        The standard method called to automatically emit a token at the
1169        outermost lexical rule.  The token object should point into the
1170        char buffer start..stop.  If there is a text override in 'text',
1171        use that to set the token's text.  Override this method to emit
1172        custom Token objects.
1173
1174        If you are building trees, then you should also override
1175        Parser or TreeParser.getMissingSymbol().
1176        """
1177
1178        if token is None:
1179            token = CommonToken(
1180                input=self.input,
1181                type=self._state.type,
1182                channel=self._state.channel,
1183                start=self._state.tokenStartCharIndex,
1184                stop=self.getCharIndex()-1
1185                )
1186            token.line = self._state.tokenStartLine
1187            token.text = self._state.text
1188            token.charPositionInLine = self._state.tokenStartCharPositionInLine
1189
1190        self._state.token = token
1191       
1192        return token
1193
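    # Illustrative sketch (not part of the original runtime): emitting a
    # custom token class by overriding emit() in a generated lexer subclass,
    # modeled on the default implementation above.  "GeneratedLexer" and
    # "MyToken" (a hypothetical CommonToken subclass) are assumed names.
    #
    #   class MyLexer(GeneratedLexer):
    #       def emit(self, token=None):
    #           if token is None:
    #               token = MyToken(
    #                   input=self.input,
    #                   type=self._state.type,
    #                   channel=self._state.channel,
    #                   start=self._state.tokenStartCharIndex,
    #                   stop=self.getCharIndex() - 1)
    #               token.line = self._state.tokenStartLine
    #               token.text = self._state.text
    #               token.charPositionInLine = self._state.tokenStartCharPositionInLine
    #           self._state.token = token
    #           return token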
1194
1195    def match(self, s):
1196        if isinstance(s, basestring):
1197            for c in s:
1198                if self.input.LA(1) != ord(c):
1199                    if self._state.backtracking > 0:
1200                        raise BacktrackingFailed
1201
1202                    mte = MismatchedTokenException(c, self.input)
1203                    self.recover(mte)
1204                    raise mte
1205
1206                self.input.consume()
1207
1208        else:
1209            if self.input.LA(1) != s:
1210                if self._state.backtracking > 0:
1211                    raise BacktrackingFailed
1212
1213                mte = MismatchedTokenException(unichr(s), self.input)
1214                self.recover(mte) # don't really recover; just consume in lexer
1215                raise mte
1216       
1217            self.input.consume()
1218           
1219
1220    def matchAny(self):
1221        self.input.consume()
1222
1223
1224    def matchRange(self, a, b):
1225        if self.input.LA(1) < a or self.input.LA(1) > b:
1226            if self._state.backtracking > 0:
1227                raise BacktrackingFailed
1228
1229            mre = MismatchedRangeException(unichr(a), unichr(b), self.input)
1230            self.recover(mre)
1231            raise mre
1232
1233        self.input.consume()
1234
1235
1236    def getLine(self):
1237        return self.input.line
1238
1239
1240    def getCharPositionInLine(self):
1241        return self.input.charPositionInLine
1242
1243
1244    def getCharIndex(self):
1245        """What is the index of the current character of lookahead?"""
1246       
1247        return self.input.index()
1248
1249
1250    def getText(self):
1251        """
1252        Return the text matched so far for the current token or any
1253        text override.
1254        """
1255        if self._state.text is not None:
1256            return self._state.text
1257       
1258        return self.input.substring(
1259            self._state.tokenStartCharIndex,
1260            self.getCharIndex()-1
1261            )
1262
1263
1264    def setText(self, text):
1265        """
1266        Set the complete text of this token; it wipes any previous
1267        changes to the text.
1268        """
1269        self._state.text = text
1270
1271
1272    text = property(getText, setText)
1273
1274
1275    def reportError(self, e):
1276        ## TODO: not thought about recovery in lexer yet.
1277
1278        ## # if we've already reported an error and have not matched a token
1279        ## # yet successfully, don't report any errors.
1280        ## if self.errorRecovery:
1281        ##     #System.err.print("[SPURIOUS] ");
1282        ##     return;
1283        ##
1284        ## self.errorRecovery = True
1285
1286        self.displayRecognitionError(self.tokenNames, e)
1287
1288
1289    def getErrorMessage(self, e, tokenNames):
1290        msg = None
1291       
1292        if isinstance(e, MismatchedTokenException):
1293            msg = "mismatched character " \
1294                  + self.getCharErrorDisplay(e.c) \
1295                  + " expecting " \
1296                  + self.getCharErrorDisplay(e.expecting)
1297
1298        elif isinstance(e, NoViableAltException):
1299            msg = "no viable alternative at character " \
1300                  + self.getCharErrorDisplay(e.c)
1301
1302        elif isinstance(e, EarlyExitException):
1303            msg = "required (...)+ loop did not match anything at character " \
1304                  + self.getCharErrorDisplay(e.c)
1305           
1306        elif isinstance(e, MismatchedNotSetException):
1307            msg = "mismatched character " \
1308                  + self.getCharErrorDisplay(e.c) \
1309                  + " expecting set " \
1310                  + repr(e.expecting)
1311
1312        elif isinstance(e, MismatchedSetException):
1313            msg = "mismatched character " \
1314                  + self.getCharErrorDisplay(e.c) \
1315                  + " expecting set " \
1316                  + repr(e.expecting)
1317
1318        elif isinstance(e, MismatchedRangeException):
1319            msg = "mismatched character " \
1320                  + self.getCharErrorDisplay(e.c) \
1321                  + " expecting set " \
1322                  + self.getCharErrorDisplay(e.a) \
1323                  + ".." \
1324                  + self.getCharErrorDisplay(e.b)
1325
1326        else:
1327            msg = BaseRecognizer.getErrorMessage(self, e, tokenNames)
1328
1329        return msg
1330
1331
1332    def getCharErrorDisplay(self, c):
1333        if c == EOF:
1334            c = '<EOF>'
1335        return repr(c)
1336
1337
1338    def recover(self, re):
1339        """
1340        Lexers can normally match any char in their vocabulary after matching
1341        a token, so do the easy thing and just kill a character and hope
1342        it all works out.  You can instead use the rule invocation stack
1343        to do sophisticated error recovery if you are in a fragment rule.
1344        """
1345
1346        self.input.consume()
1347
1348
1349    def traceIn(self, ruleName, ruleIndex):
1350        inputSymbol = "%s line=%d:%s" % (self.input.LT(1),
1351                                         self.getLine(),
1352                                         self.getCharPositionInLine()
1353                                         )
1354       
1355        BaseRecognizer.traceIn(self, ruleName, ruleIndex, inputSymbol)
1356
1357
1358    def traceOut(self, ruleName, ruleIndex):
1359        inputSymbol = "%s line=%d:%s" % (self.input.LT(1),
1360                                         self.getLine(),
1361                                         self.getCharPositionInLine()
1362                                         )
1363
1364        BaseRecognizer.traceOut(self, ruleName, ruleIndex, inputSymbol)
1365
1366
1367
1368class Parser(BaseRecognizer):
1369    """
1370    @brief Baseclass for generated parser classes.
1371    """
1372   
1373    def __init__(self, lexer, state=None):
1374        BaseRecognizer.__init__(self, state)
1375
1376        self.setTokenStream(lexer)
1377
1378
1379    def reset(self):
1380        BaseRecognizer.reset(self) # reset all recognizer state variables
1381        if self.input is not None:
1382            self.input.seek(0) # rewind the input
1383
1384
1385    def getCurrentInputSymbol(self, input):
1386        return input.LT(1)
1387
1388
1389    def getMissingSymbol(self, input, e, expectedTokenType, follow):
1390        if expectedTokenType == EOF:
1391            tokenText = "<missing EOF>"
1392        else:
1393            tokenText = "<missing " + self.tokenNames[expectedTokenType] + ">"
1394        t = CommonToken(type=expectedTokenType, text=tokenText)
1395        current = input.LT(1)
1396        if current.type == EOF:
1397            current = input.LT(-1)
1398
1399        if current is not None:
1400            t.line = current.line
1401            t.charPositionInLine = current.charPositionInLine
1402        t.channel = DEFAULT_CHANNEL
1403        return t
1404
1405
1406    def setTokenStream(self, input):
1407        """Set the token stream and reset the parser"""
1408       
1409        self.input = None
1410        self.reset()
1411        self.input = input
1412
1413
1414    def getTokenStream(self):
1415        return self.input
1416
1417
1418    def getSourceName(self):
1419        return self.input.getSourceName()
1420
1421
1422    def traceIn(self, ruleName, ruleIndex):
1423        BaseRecognizer.traceIn(self, ruleName, ruleIndex, self.input.LT(1))
1424
1425
1426    def traceOut(self, ruleName, ruleIndex):
1427        BaseRecognizer.traceOut(self, ruleName, ruleIndex, self.input.LT(1))
1428
1429
1430class RuleReturnScope(object):
1431    """
1432    Rules can return start/stop info as well as possible trees and templates.
1433    """
1434
1435    def getStart(self):
1436        """Return the start token or tree."""
1437        return None
1438   
1439
1440    def getStop(self):
1441        """Return the stop token or tree."""
1442        return None
1443
1444   
1445    def getTree(self):
1446        """Has a value potentially if output=AST."""
1447        return None
1448
1449
1450    def getTemplate(self):
1451        """Has a value potentially if output=template."""
1452        return None
1453
1454
1455class ParserRuleReturnScope(RuleReturnScope):
1456    """
1457    Rules that return more than a single value must return an object
1458    containing all the values.  Besides the properties defined in
1459    RuleLabelScope.predefinedRulePropertiesScope there may be user-defined
1460    return values.  This class simply defines the minimum properties that
1461    are always defined and methods to access the others that might be
1462    available depending on output option such as template and tree.
1463
1464    Note text is not an actual property of the return value; it is computed
1465    from start and stop using the input stream's toString() method.  I
1466    could add a ctor to this so that we can pass in and store the input
1467    stream, but I'm not sure we want to do that.  It would seem to be undefined
1468    to get the .text property anyway if the rule matches tokens from multiple
1469    input streams.
1470
1471    I do not use getters for fields of objects that are used simply to
1472    group values such as this aggregate.  The getters/setters are there to
1473    satisfy the superclass interface.
1474    """
1475
1476    def __init__(self):
1477        self.start = None
1478        self.stop = None
1479
1480   
1481    def getStart(self):
1482        return self.start
1483
1484
1485    def getStop(self):
1486        return self.stop
1487