source: proto/matchparens/pdfparenmatch2.py @ 5539

Last change on this file since 5539 was 3064, checked in by cameron, 6 years ago

Simplify parenthesis matching; fix paren match with comments.

File size: 5.0 KB
Line 
#
# Recursive Parenthesis Matching - PDF style with EOL Comments
#
# Robert D. Cameron
# April 17, 2013
#
# This version ignores parentheses within EOL comments that
# start with a % character.  However, % characters within
# strings are ordinary data characters.
#
#
12import sys
13import pablo
14
# Holder for the eight basis bit streams of the input, one stream per bit
# position of a byte.  Each stream is kept as an integer and starts out
# empty (0).  The __main__ driver hands an instance to
# pablo.transpose_streams to be filled in, and Classify_bytes reads the
# streams back.  The bit tests in Classify_bytes match the ASCII codes of
# "(", ")", "%" and line feed when bit_0 is taken as the most significant
# bit of each byte and bit_7 as the least significant.
class Basis_bits():
    bit_0 = 0
    bit_1 = 0
    bit_2 = 0
    bit_3 = 0
    bit_4 = 0
    bit_5 = 0
    bit_6 = 0
    bit_7 = 0
24
# Lexical-item bit streams written by Classify_bytes.  Each attribute is
# an integer bit stream, initially empty (0), marking the positions of
# one character class in the input.
class Lex():
    LParen = 0     # positions of "(" characters
    RParen = 0     # positions of ")" characters
    Percent = 0    # positions of "%" characters (candidate comment openers)
    EOL = 0        # positions of line-feed characters
30       
# Result bit streams written by Match_Parens_With_Comments.  Each
# attribute is an integer bit stream, initially empty (0).
class Matches():
    closed = 0         # ")" positions that closed a pending "("
    instring = 0       # positions found between a pending "(" and its scan stop
    error = 0          # scans that hit end of file, and unused ")" outside comments
    comment_start = 0  # "%" positions finally accepted as comment openers
    comment_end = 0    # line-end positions reached by scanning from comment_start
37
38
# Compute the lexical-item streams from the eight basis bit streams.
#
#   basis_bits - object with integer attributes bit_0 .. bit_7 (read only)
#   lex        - object whose LParen, RParen, Percent and EOL attributes
#                are overwritten with the computed streams
#
# Taking bit_0 as the most significant bit of each byte, the patterns
# selected below are 0x28 "(", 0x29 ")", 0x25 "%" and 0x0A line feed.
# Nothing is returned; the results are stored on lex.
def Classify_bytes(basis_bits, lex):
    temp1 = (basis_bits.bit_0 | basis_bits.bit_1)
    temp2 = (basis_bits.bit_2 &~ basis_bits.bit_3)
    # temp3: high nibble is 0010 (bytes 0x20 - 0x2F).
    temp3 = (temp2 &~ temp1)
    temp4 = (basis_bits.bit_4 &~ basis_bits.bit_5)
    temp5 = (basis_bits.bit_6 | basis_bits.bit_7)
    # temp6: low nibble is 1000.
    temp6 = (temp4 &~ temp5)
    lex.LParen = (temp3 & temp6)
    temp7 = (basis_bits.bit_7 &~ basis_bits.bit_6)
    # temp8: low nibble is 1001.
    temp8 = (temp4 & temp7)
    lex.RParen = (temp3 & temp8)
    temp9 = (basis_bits.bit_5 &~ basis_bits.bit_4)
    # temp10: low nibble is 0101.
    temp10 = (temp9 & temp7)
    lex.Percent = (temp3 & temp10)
    temp11 = (basis_bits.bit_2 | basis_bits.bit_3)
    # temp12: some bit of the high nibble is set.
    temp12 = (temp1 | temp11)
    temp13 = (basis_bits.bit_6 &~ basis_bits.bit_7)
    # temp14: low nibble is 1010.
    temp14 = (temp4 & temp13)
    lex.EOL = (temp14 &~ temp12)
58       
#
# Modified version with comment processing
#
# Let pending_LParen be the left parentheses for which we have a pending
#   obligation to find the corresponding right parentheses.
#
# Let pending_Pct be the Percent marks that have not yet been
#   ruled out as comment-opening delimiters.
#
#
# Number of stream positions shown by the debug dumps below.
_TRACE_WIDTH = 110


# Print one labelled intermediate bit stream for debugging.  The label is
# padded to 16 columns so that the dumps line up with the report printed
# by the __main__ driver.
def _trace_stream(label, stream):
    print(label.ljust(16) + pablo.bitstream2string(stream, _TRACE_WIDTH))


# Match parentheses while honouring "%" end-of-line comments.
#
#   lex     - object providing the LParen, RParen, Percent and EOL bit
#             streams (as produced by Classify_bytes); read only
#   matches - object whose instring, closed, error, comment_start and
#             comment_end attributes are overwritten with the results
#
# Nothing is returned.  Intermediate streams are printed to standard
# output as a debugging aid.  The precise meaning of the pablo primitives
# (Advance, ScanTo, SpanUpTo, AdvanceThenScanTo, ExclusiveSpan, atEOF) is
# defined by the pablo module; the comments below describe the intent of
# each step as stated by the original author.
def Match_Parens_With_Comments(lex, matches):
    matches.instring = 0
    matches.closed = 0
    matches.error = 0
    # Presumably position 0 plus every position that follows an EOL.
    line_starts = ~pablo.Advance(~lex.EOL)
    # First EOL or "%" on each line: where the line, or its
    # not-yet-ruled-out comment, begins to end.
    line_ends1 = pablo.ScanTo(line_starts, lex.EOL | lex.Percent)
    pending_Pct = line_ends1 & lex.Percent
    known_outside_Ct = pablo.SpanUpTo(line_starts, line_ends1)
    # Only "(" known to be outside any potential comment start a scan.
    pending_LParen = known_outside_Ct & lex.LParen
    _trace_stream("pending_LParen0", pending_LParen)
    unmatched_RParen = lex.RParen

    inPlay = pending_LParen | unmatched_RParen | pending_Pct

    while pending_LParen:

        # Scan from pending ( marks to next [()%].   Everything we find
        # must be within a string.
        pscan = pablo.AdvanceThenScanTo(pending_LParen, inPlay)
        matches.instring |= (pablo.ExclusiveSpan(pending_LParen, pscan)
                             | (pscan &~ lex.RParen))
        matches.closed |= pscan & lex.RParen
        _trace_stream("matches.closed", matches.closed)
        # A scan that ran to end of file never found its closer.
        matches.error |= pablo.atEOF(pscan)
        # Scans that stopped on a nested "(" continue from there.
        pending_LParen = pscan & lex.LParen
        _trace_stream("pending_LParen1", pending_LParen)
        # Did we scan into a pending comment region?
        pct_found = pscan & pending_Pct

        if pct_found:
            # The scan from the "(" was terminated prematurely by the "%" mark.
            # We include this position in pending_LParen so that the scan can
            # continue next time around.
            pending_LParen |= pct_found
            _trace_stream("pct_found:", pct_found)
            _trace_stream("pending_LParen2", pending_LParen)
            # Clear this % position as a line terminator, and find the next.
            # Determine the region that was previously identified as potentially
            # inside a comment and mark it as outside.
            line_ends1 &= ~pct_found
            line_ends1 |= pablo.AdvanceThenScanTo(pct_found, lex.EOL | lex.Percent)
            # Add any new potential comment delimiter to the pending ones.
            pending_Pct |= line_ends1 & lex.Percent
            newly_outside = pablo.SpanUpTo(pct_found, line_ends1)
            known_outside_Ct |= newly_outside
            # If any LParens have been revealed, add scan obligations for them
            pending_LParen |= newly_outside & lex.LParen
            _trace_stream("pending_LParen3", pending_LParen)

        unmatched_RParen = lex.RParen &~ matches.closed
        inPlay = pending_LParen | unmatched_RParen | pending_Pct
    #
    # No more scans to do.  Any pending pct marks are now known
    # as definite comment delimiters.
    matches.comment_start = line_ends1 & lex.Percent
    matches.comment_end = pablo.ScanTo(matches.comment_start, lex.EOL)
    #
    # Any closing paren that was not actually used to close
    # an opener is in error.
    matches.error |= (lex.RParen &~ matches.closed
                      &~ pablo.SpanUpTo(matches.comment_start, matches.comment_end))
128
# Module-level stream containers shared by the processing steps below.
basis_bits = Basis_bits()
lex = Lex()
matches = Matches()

# Command-line driver: read the file named by the first argument, run the
# classification and matching passes, then print the input followed by the
# resulting streams, each label padded to 16 columns.
if __name__ == "__main__":
    if len(sys.argv) > 1:
        u8data = pablo.readfile(sys.argv[1])
        pablo.EOF_mask = pablo.transpose_streams(u8data, basis_bits)
        Classify_bytes(basis_bits, lex)
        Match_Parens_With_Comments(lex, matches)
        lgth = len(u8data)
        print("data:" + " "*(16-5) + u8data)
        # One extra position is shown for errors so that an error flagged
        # at the end-of-file position is visible.
        print("errors:" + " "*(16-7) + pablo.bitstream2string(matches.error, lgth+1))
        print("instring:" + " "*(16-9) + pablo.bitstream2string(matches.instring, lgth))
        print("comment_start:" + " "*(16-14) + pablo.bitstream2string(matches.comment_start, lgth))
        print("comment_end:" + " "*(16-12) + pablo.bitstream2string(matches.comment_end, lgth))

    else:
        # Report the name this script was actually invoked under.
        print("Usage: python %s <file>" % sys.argv[0])
149       
150
151
Note: See TracBrowser for help on using the repository browser.