source: proto/matchparens/pdfparenmatch2.py @ 3061

Last change on this file since 3061 was 3061, checked in by cameron, 6 years ago

PDF Parenthesis matching prototype with comments

File size: 3.9 KB
Line 
#
# Recursive Parenthesis Matching - PDF style with EOL Comments
#
# Robert D. Cameron
# April 17, 2013
#
# This version ignores parentheses within EOL Comments that
# start with a % character.   However, % characters within
# strings are ordinary data characters.
#
#
12import sys
13import pablo
14
class Basis_bits():
        # Container for the eight basis bit streams, one attribute per bit
        # position of the input bytes.  Every stream starts out as 0; the
        # driver at the bottom of this file passes an instance to
        # pablo.transpose_streams, which presumably fills the attributes in.
        bit_0 = 0
        bit_1 = 0
        bit_2 = 0
        bit_3 = 0
        bit_4 = 0
        bit_5 = 0
        bit_6 = 0
        bit_7 = 0
24
class Lex ():
        # Lexical marker streams, one per character class of interest.
        # Classify_bytes assigns LParen, RParen, Percent and EOL, and
        # Match_Parens_With_Comments reads those same four names, so they
        # are the ones declared here.
        LParen = 0
        RParen = 0
        Percent = 0
        EOL = 0
        # Earlier spellings of Percent and EOL.  Nothing in this file reads
        # or writes them; they are retained only so that any outside
        # reference to them keeps resolving.
        Pct = 0
        LF = 0
30       
class Matches() :
        # Result streams written by Match_Parens_With_Comments.
        #
        # closed:        right parentheses that closed an opener
        # instring:      positions found while scanning inside a string
        # error:         scans that ran to end of file, plus right
        #                parentheses that closed nothing
        # comment_start: % marks finally accepted as comment openers
        # comment_end:   where the scan from each comment_start to EOL stops
        #
        # All start at 0 so the matcher can accumulate into them with |=.
        closed = 0
        instring = 0
        error = 0
        comment_start = 0
        comment_end = 0
37
38
def Classify_bytes(basis_bits, lex):
        # Derive the four lexical marker streams from the basis bit streams.
        #
        # basis_bits: object exposing bit_0 .. bit_7.  The tests below treat
        #             bit_0 as the most significant bit of each byte.
        # lex:        object that receives LParen, RParen, Percent and EOL.
        #
        # Byte values selected:
        #   LParen  0x28  0010 1000
        #   RParen  0x29  0010 1001
        #   Percent 0x25  0010 0101
        #   EOL     0x0A  0000 1010
        # Nothing is returned; the results are stored on lex.

        # High nibble is exactly 0010 (shared by the three 0x2_ characters).
        hi_nibble_2 = (basis_bits.bit_2 & ~basis_bits.bit_3) & ~(basis_bits.bit_0 | basis_bits.bit_1)
        # High nibble has at least one bit set (used to rule out 0x0A).
        hi_nibble_nonzero = (basis_bits.bit_0 | basis_bits.bit_1) | (basis_bits.bit_2 | basis_bits.bit_3)

        # Upper half of the low nibble (bit_4, bit_5).
        lo_10xx = basis_bits.bit_4 & ~basis_bits.bit_5
        lo_01xx = basis_bits.bit_5 & ~basis_bits.bit_4
        # Lower half of the low nibble (bit_6, bit_7).
        lo_xx01 = basis_bits.bit_7 & ~basis_bits.bit_6
        lo_xx10 = basis_bits.bit_6 & ~basis_bits.bit_7
        lo_xx_any = basis_bits.bit_6 | basis_bits.bit_7

        lex.LParen = hi_nibble_2 & (lo_10xx & ~lo_xx_any)
        lex.RParen = hi_nibble_2 & (lo_10xx & lo_xx01)
        lex.Percent = hi_nibble_2 & (lo_01xx & lo_xx01)
        lex.EOL = (lo_10xx & lo_xx10) & ~hi_nibble_nonzero
58       
#
# Modified version with comment processing
#
# Let pending_LParen be left parentheses for which we have a pending
#   obligation to find the corresponding right parentheses.
#
# Let pending_Pct be Percent marks that have not yet been
#   ruled out as comment opening delimiters
#
# Inputs:  lex.LParen, lex.RParen, lex.Percent and lex.EOL marker streams.
# Outputs: matches.instring, matches.closed and matches.error are
#   accumulated with |=; matches.comment_start and matches.comment_end
#   are assigned once the scanning loop has finished.  Nothing is returned.
#
# NOTE(review): the remarks about what each pablo.* call produces are
#   inferred from the call names and from how the results are used here;
#   confirm them against pablo.py.
def Match_Parens_With_Comments(lex, matches):
        # Presumably the position after each EOL, plus the stream start.
        line_starts = ~pablo.Advance(~lex.EOL)
        # Provisional end of each line: the first EOL or % met from its start.
        line_ends1 = pablo.ScanTo(line_starts, lex.EOL | lex.Percent)
        # A line that "ended" on a % has a candidate comment opener.
        pending_Pct = line_ends1 & lex.Percent
        # Text currently assumed to lie outside every comment.
        outside_Ct = pablo.SpanUpTo(line_starts, line_ends1)
        pending_LParen = outside_Ct & lex.LParen
        RParen_unmatched = lex.RParen

        # Every position at which a scan from a pending ( is allowed to stop.
        inPlay = pending_LParen | RParen_unmatched | pending_Pct

        while pending_LParen:

                # Scan from pending ( marks to next [()%].   Everything we find
                # must be within a string.
                pscan = pablo.AdvanceThenScanTo(pending_LParen, inPlay)
                matches.instring |= pablo.ExclusiveSpan(pending_LParen, pscan) | (pscan &~ lex.RParen)
                matches.closed |= pscan & lex.RParen
                matches.error |= pablo.atEOF(pscan)
                # A scan that stopped on anything other than ) still owes a
                # closer, so its stopping point becomes the next start point.
                pending_LParen = pscan & ~lex.RParen
                # Did we scan into a pending comment region?
                pct_found = pscan & pending_Pct

                if pct_found:
                        # Those % marks sit inside a string, so they are data:
                        # drop them as line ends and extend each affected line
                        # to its next EOL or %.
                        line_ends1 &= ~pct_found
                        line_ends1 |= pablo.AdvanceThenScanTo(pct_found, lex.EOL | lex.Percent)
                        pending_Pct = line_ends1 & lex.Percent
                        # The newly uncovered text may contain further ( marks.
                        new_outside = pablo.SpanUpTo(pct_found, line_ends1)
                        pending_LParen |= new_outside & lex.LParen
                        # Not read again in this function; kept up to date so
                        # the stream still describes all non-comment text.
                        outside_Ct |= new_outside

                RParen_unmatched = lex.RParen &~ matches.closed
                inPlay = pending_LParen | RParen_unmatched | pending_Pct
        #
        # No more scans to do.  Any pending pct marks are now known
        # as definite comment delimiters.
        matches.comment_start = pending_Pct
        matches.comment_end = pablo.ScanTo(pending_Pct, lex.EOL)
        #
        # Any closing paren that was not actually used to close
        # an opener is in error.  Right parentheses between a comment
        # start and its end are excluded from this check.
        matches.error |= lex.RParen &~ matches.closed &~ pablo.SpanUpTo(matches.comment_start, matches.comment_end)
108
109
110
# Module-level stream containers used by the command-line driver below.
basis_bits = Basis_bits()
lex = Lex()
matches = Matches()

if __name__ == "__main__":
        # Command-line driver: read the file named by the first argument,
        # run classification and matching, and print each result stream
        # underneath the input data with labels padded to 16 columns.
        if len(sys.argv) > 1:
                u8data = pablo.readfile(sys.argv[1])
                pablo.EOF_mask = pablo.transpose_streams(u8data, basis_bits)
                Classify_bytes(basis_bits, lex)
                Match_Parens_With_Comments(lex, matches)
                lgth = len(u8data)
                # Single-argument print(...) behaves the same under the
                # Python 2 print statement and the Python 3 print function.
                print("data:".ljust(16) + u8data)
                # The error stream is shown one position longer than the
                # data: matches.error includes pablo.atEOF markers, which
                # presumably sit just past the last byte (confirm in pablo.py).
                print("errors:".ljust(16) + pablo.bitstream2string(matches.error, lgth+1))
                print("instring:".ljust(16) + pablo.bitstream2string(matches.instring, lgth))
                print("comment_start:".ljust(16) + pablo.bitstream2string(matches.comment_start, lgth))
                print("comment_end:".ljust(16) + pablo.bitstream2string(matches.comment_end, lgth))

        else:
                print("Usage: python pdfparenmatch2.py <file>")
131       
132
133
Note: See TracBrowser for help on using the repository browser.