#
# Recursive Parenthesis Matching - PDF style with EOL Comments
#
# Robert D. Cameron
# April 17, 2013
#
# This version ignores parentheses within EOL Comments that
# start with a % character.  However, % characters within
# strings are ordinary data characters.
#
#

import sys
import pablo


class Basis_bits():
    # One field per bit of the input bytes; all default to 0 until the
    # caller fills them in.
    bit_0 = 0
    bit_1 = 0
    bit_2 = 0
    bit_3 = 0
    bit_4 = 0
    bit_5 = 0
    bit_6 = 0
    bit_7 = 0


class Lex ():
    # Character-class fields written by Classify_bytes.
    LParen = 0
    RParen = 0
    Percent = 0
    EOL = 0


class Matches() :
    # Result fields written by the parenthesis matcher.
    closed = 0
    instring = 0
    error = 0
    comment_start = 0
    comment_end = 0


def Classify_bytes(basis_bits, lex):
    """Compute the lexical character classes from the eight basis bits.

    Reads basis_bits.bit_0 .. basis_bits.bit_7 and stores four results on
    lex: LParen, RParen, Percent and EOL.  The bit patterns tested below
    correspond to the ASCII codes 0x28 "(", 0x29 ")", 0x25 "%" and 0x0A
    (line feed) when bit_0 is taken as the most significant bit of a byte.
    Nothing is returned; lex is updated in place.
    """
    b0 = basis_bits.bit_0
    b1 = basis_bits.bit_1
    b2 = basis_bits.bit_2
    b3 = basis_bits.bit_3
    b4 = basis_bits.bit_4
    b5 = basis_bits.bit_5
    b6 = basis_bits.bit_6
    b7 = basis_bits.bit_7

    # Tests on bits 0-3.
    top_pair = b0 | b1
    hi_is_0010 = (b2 & ~b3) & ~top_pair

    # Partial tests on bits 4-7 that are shared by several classes.
    lo_starts_10 = b4 & ~b5
    lo_ends_01 = b7 & ~b6

    # 0010 1000
    lex.LParen = hi_is_0010 & (lo_starts_10 & ~(b6 | b7))
    # 0010 1001
    lex.RParen = hi_is_0010 & (lo_starts_10 & lo_ends_01)
    # 0010 0101
    lex.Percent = hi_is_0010 & ((b5 & ~b4) & lo_ends_01)

    # 0000 1010: bits 4-7 are 1010 and none of bits 0-3 is set.
    hi_nonzero = top_pair | (b2 | b3)
    lex.EOL = (lo_starts_10 & (b6 & ~b7)) & ~hi_nonzero


#
# Modified version with comment processing
#
# Let pending_LParen be left parentheses for which we have a pending
# obligation to find the corresponding right parentheses.
#
# Let pending_Pct be Percent marks that have not yet been
# ruled out as comment opening delimiters
#
#
def Match_Parens_With_Comments(lex, matches):
    """Match parentheses while ignoring those inside % end-of-line comments.

    Reads lex.LParen, lex.RParen, lex.Percent and lex.EOL and writes the
    following fields of matches (nothing is returned):

      instring      -- accumulated from each scan: the exclusive span
                       between a pending mark and where its scan stopped,
                       plus the stopping position itself unless it is a ")".
      closed        -- ")" positions at which a scan stopped.
      error         -- whatever pablo.atEOF reports for a scan, plus every
                       ")" that was never closed and is not within a
                       comment_start..comment_end span.
      comment_start -- "%" positions still recorded as line terminators
                       when no scans remain.
      comment_end   -- result of scanning from each comment_start to EOL.

    Side effect: trace lines are printed to standard output on every call.
    NOTE(review): the trace uses a fixed width of 110 positions, unlike the
    report under __main__, which uses the input length -- confirm whether
    the trace is still wanted.
    """
    matches.instring = 0
    matches.closed = 0
    matches.error = 0
    # presumably position 0 plus every position that follows an EOL --
    # depends on pablo.Advance semantics; TODO confirm
    line_starts = ~pablo.Advance(~lex.EOL)
    # Provisional end of each line: the first EOL or "%" reached from its start.
    line_ends1 = pablo.ScanTo(line_starts, lex.EOL | lex.Percent)
    pending_Pct = line_ends1 & lex.Percent
    known_outside_Ct = pablo.SpanUpTo(line_starts, line_ends1)
    pending_LParen = known_outside_Ct & lex.LParen
    print("pending_LParen0" + " "*(16-15) + pablo.bitstream2string(pending_LParen,110))
    unmatched_RParen = lex.RParen
    inPlay = pending_LParen | unmatched_RParen | pending_Pct
    while pending_LParen:
        # Scan from pending ( marks to next [()%].  Everything we find
        # must be within a string.
        pscan = pablo.AdvanceThenScanTo(pending_LParen, inPlay)
        matches.instring |= pablo.ExclusiveSpan(pending_LParen, pscan) | pscan &~ lex.RParen
        matches.closed |= pscan & lex.RParen
        print("matches.closed" + " "*(16-14) + pablo.bitstream2string(matches.closed,110))
        matches.error |= pablo.atEOF(pscan)
        pending_LParen = pscan & lex.LParen
        print("pending_LParen1" + " "*(16-15) + pablo.bitstream2string(pending_LParen,110))
        # Did we scan into a pending comment region?
        pct_found = pscan & pending_Pct
        if pct_found:
            # The scan from the "(" was terminated prematurely by the "%" mark.
            # We include this position in pending_LParen so that the scan can
            # continue next time around.
            pending_LParen |= pct_found
            print("pct_found:" + " "*(16-10) + pablo.bitstream2string(pct_found,110))
            print("pending_LParen2" + " "*(16-15) + pablo.bitstream2string(pending_LParen,110))
            # Clear this % position as a line terminator, and find the next.
            # Determine the region that was previously identified as potentially
            # inside a comment and mark it as outside.
            line_ends1 &= ~pct_found
            line_ends1 |= pablo.AdvanceThenScanTo(pct_found, lex.EOL | lex.Percent)
            # Add any new potential comment delimiter to the pending ones.
            pending_Pct |= line_ends1 & lex.Percent
            newly_outside = pablo.SpanUpTo(pct_found, line_ends1)
            known_outside_Ct |= newly_outside
            # If any LParens have been revealed, add scan obligations for them
            pending_LParen |= newly_outside & lex.LParen
            # NOTE(review): nesting of this trace line under the "if" was
            # reconstructed from context -- confirm against the original layout.
            print("pending_LParen3" + " "*(16-15) + pablo.bitstream2string(pending_LParen,110))
        # Refresh the scan targets for the next pass: closed ")" marks are
        # no longer in play.
        unmatched_RParen = lex.RParen &~ matches.closed
        inPlay = pending_LParen | unmatched_RParen | pending_Pct
    #
    # No more scans to do.  Any pending pct marks are now known
    # as definite comment delimiters.
    matches.comment_start = line_ends1 & lex.Percent
    matches.comment_end = pablo.ScanTo(matches.comment_start, lex.EOL)
    #
    # Any closing paren that was not actually used to close
    # an opener is in error.
    matches.error |= lex.RParen &~ matches.closed &~ pablo.SpanUpTo(matches.comment_start, matches.comment_end)


# Module-level working objects shared by the driver below.
basis_bits = Basis_bits()
lex = Lex()
matches = Matches()

if __name__ == "__main__":
    if len(sys.argv) > 1:
        u8data = pablo.readfile(sys.argv[1])
        pablo.EOF_mask = pablo.transpose_streams(u8data, basis_bits)
        Classify_bytes(basis_bits, lex)
        Match_Parens_With_Comments(lex, matches)
        lgth = len(u8data)
        # Each label is padded to 16 columns so the streams line up
        # under the data.
        print("data:" + " "*(16-5) + u8data)
        # One extra position so an error reported past the last byte is shown.
        print("errors:" + " "*(16-7) + pablo.bitstream2string(matches.error, lgth+1))
        print("instring:" + " "*(16-9) + pablo.bitstream2string(matches.instring, lgth))
        print("comment_start:" + " "*(16-14) + pablo.bitstream2string(matches.comment_start, lgth))
        print("comment_end:" + " "*(16-12) + pablo.bitstream2string(matches.comment_end, lgth))
    else:
        # The input file is a required argument; name it in the usage line.
        print("Usage: python parenmatch.py <filename>")