Ignore:
Timestamp:
Jun 12, 2010, 3:21:44 PM (9 years ago)
Author:
eamiri
Message:

backup to r407

File:
1 edited

Legend:

Unmodified
Added
Removed
  • proto/parabix2/parabix2.py

    r409 r411  
    1 # -*- coding: utf-8 -*-
    21#
    32# parabix2.py
     
    1918
    2019
    21 #import bitutil
     20import bitutil
    2221
    2322import byteclass
     
    2524import u8u16
    2625
    27 #import sys
     26import sys
     27
    2828
    2929def validate_xmlchar(u8, control, lex, EOF_mask):
     
    3838"""
    3939        EF_BF_pending = bitutil.Advance(bitutil.Advance(u8.xEF) & u8.xBF)
    40         ret = (EF_BF_pending & (u8.xBE | u8.xBF)) | (control.x00_x1F &~ lex.WS & EOF_mask)
    41         return ret
     40        return (EF_BF_pending & (u8.xBE | u8.xBF)) | (control.x00_x1F &~ lex.WS & EOF_mask)
    4241
    4342
     
    539538        illegal ]]>: _______________________________________1______________________1_______________
    540539"""
    541         ret = lex.CD_end & ~(markup1.CtCDPI_mask | tags.Tags)
    542         return ret
     540        return lex.CD_end & ~(markup1.CtCDPI_mask | tags.Tags)
    543541
    544542def demo_validate_no_CD_end(u8data):
     
    557555
    558556
    559 def main(u8data):
     557def parabix_parse(u8data):
    560558        # Transpose to parallel bit streams and prepare an EOF mask.
    561559        (bit, EOF_mask) = bitutil.transpose_streams(u8data)
     
    576574        # Compute XML multiliterals such as <?, </, --, ]]>.
    577575        lex = add_multiliterals(lex)
    578        
    579        
    580         # THE FOLLOWING FUNCTION CALL IS MANUALLY INLINED
     576
    581577        # Parse all comments, CDATA sections and processing instructions.
    582         #markup1 = parse_CtCDPI(lex, EOF_mask)
    583         CT_callouts = CtCDPI_callouts()
    584         PI_starts = 0
    585         PI_ends = 0
    586         Ct_starts = 0
    587         Ct_ends = 0
    588         CD_starts = 0
    589         CD_ends = 0
    590         CtCDPI_starts = 0
    591         # Scanning streams
    592         CtCDPI_scan = ~(lex.CtCD_start | lex.PI_start) & EOF_mask
    593         Ct_end_scan = ~lex.DoubleHyphen & EOF_mask
    594         CD_end_scan = ~lex.CD_end & EOF_mask
    595         PI_end_scan = ~lex.PI_end & EOF_mask
    596         #
    597         # Initiate the scan
    598         CtCDPI_Cursor = 1
    599         CtCDPI_Cursor = bitutil.ScanThru(CtCDPI_Cursor, CtCDPI_scan)
    600         CtCDPI_Cursor &= EOF_mask
    601         while CtCDPI_Cursor:
    602                 CtCDPI_starts |= CtCDPI_Cursor
    603                 PI_Cursor = CtCDPI_Cursor & lex.PI_start
    604                 CD_Ct_Cursor = bitutil.Advance(CtCDPI_Cursor & ~PI_Cursor)
    605                 CD_Cursor = CD_Ct_Cursor & lex.LBracket
    606                 Ct_Cursor = bitutil.Advance(CD_Ct_Cursor & lex.Hyphen) 
    607                 PI_starts |= PI_Cursor
    608                 CD_starts |= CD_Cursor
    609                 Ct_starts |= Ct_Cursor
    610                 Ct_Cursor = bitutil.Advance(Ct_Cursor)
    611                 Ct_end_scan |= Ct_Cursor
    612                 PI_Cursor = bitutil.ScanThru(PI_Cursor, PI_end_scan)
    613                 CD_Cursor = bitutil.ScanThru(CD_Cursor, CD_end_scan)
    614                 Ct_Cursor = bitutil.Advance(bitutil.ScanThru(Ct_Cursor, Ct_end_scan))
    615                 PI_ends |= PI_Cursor
    616                 CD_ends |= CD_Cursor
    617                 Ct_ends |= Ct_Cursor
    618                 CtCDPI_Cursor = PI_Cursor | CD_Cursor | Ct_Cursor
    619                 CtCDPI_Cursor = bitutil.ScanThru(CtCDPI_Cursor, CtCDPI_scan)
    620                 CtCDPI_Cursor &= EOF_mask
    621 
    622         # End of loop: no remaining CtCDPI_Cursor
    623         CT_callouts.CD_span = CD_ends - CD_starts
    624         CT_callouts.Ct_span = Ct_ends - Ct_starts
    625         CT_callouts.PI_span = PI_ends - PI_starts
    626        
    627         CT_callouts.CtCDPI_mask |= bitutil.Advance(CD_ends | Ct_ends | PI_ends) - CtCDPI_starts
    628         CT_callouts.error = Ct_ends & ~lex.RAngle | Ct_starts & ~ lex.Hyphen
    629        
    630         # If any of the Comment, CDATA or PI markups are unterminated, it is an error.
    631         CT_callouts.error |= CT_callouts.CtCDPI_mask &~ EOF_mask
    632         ########## END OF MANUAL INLINING
    633        
    634         # THE FOLLOWING FUNCTION CALL IS MANUALLY INLINED
     578        markup1 = parse_CtCDPI(lex, EOF_mask)
     579
    635580        # All remaining "<" must be tag start characters; parse tags.
    636         #tags = parse_tags(lex, CT_callouts.CtCDPI_mask, EOF_mask)
    637 
    638         #callouts = tag_callouts()
    639        
    640         # Delimiters for scans.
    641         DQuoteScan = ~(lex.DQuote | lex.LAngle) & EOF_mask
    642         SQuoteScan = ~(lex.SQuote | lex.LAngle) & EOF_mask
    643         AttListDelim = lex.Slash | lex.RAngle
    644        
    645         # Start the parallel parsing by inspecting the character
    646         # after the opening "<" of a tag.
    647         LAngleFollow = bitutil.Advance(lex.LAngle) &~ CtCDPI_mask
    648         ElemNamePositions = LAngleFollow & ~lex.Slash
    649         EndTagSeconds = LAngleFollow & lex.Slash
    650        
    651         # Start Tag/Empty Element Tag Parsing
    652 
    653         # Advance all cursors by scanning through the tag name.
    654         ElemNameFollows = bitutil.ScanThru(ElemNamePositions, lex.NameScan)
    655         # Must have at least one name character for a legal start tag.
    656         # Mark any occurrences of null names as errors.
    657         ParseError = ElemNamePositions & ElemNameFollows
    658         callouts.ElemNames = ElemNameFollows - ElemNamePositions
    659        
    660         # Initialize the accumulators for attribute name and value positions.
    661         AttNameStarts = 0 
    662         AttNameFollows = 0
    663         EqToCheck = 0
    664         AttValStarts = 0
    665         AttValEnds = 0
    666         AttValFollows = 0
    667 
    668         # After the element name, there may or may not be an attlist.
    669         AfterWS = bitutil.ScanThru(ElemNameFollows, lex.WS)
    670         AttListEnd = AfterWS & AttListDelim
    671         AttNameStart = AfterWS & ~AttListDelim
    672         # At least one WS character is required between ElemNames and AttNames.
    673         ParseError |= ElemNameFollows & AttNameStart
    674 
    675         #
    676         # The following loop iterates through attributes within a start tag.
    677         # Because all start tags are processed in parallel, the number of
    678         # iterations is the maximum number of attributes found in any one
    679         # start tag, plus one.
    680         while AttNameStart:
    681                 AttNameStarts |= AttNameStart
    682                 AttNameFollow = bitutil.ScanThru(AttNameStart, lex.NameScan)
    683                 AttNameFollows |= AttNameFollow
    684                 # Scan through WS to the expected '=' delimiter.
    685                 EqExpected = bitutil.ScanThru(AttNameFollow, lex.WS)
    686                 EqToCheck |= EqExpected
    687                 AttValPos = bitutil.ScanThru(bitutil.Advance(EqExpected), lex.WS)
    688                 AttValStarts |= AttValPos
    689                 DQuoteAttVal = AttValPos & lex.DQuote
    690                 SQuoteAttVal = AttValPos & lex.SQuote
    691                 DQuoteAttEnd = bitutil.ScanThru(bitutil.Advance(DQuoteAttVal), DQuoteScan)
    692                 SQuoteAttEnd = bitutil.ScanThru(bitutil.Advance(SQuoteAttVal), SQuoteScan)
    693                 AttValEnd = DQuoteAttEnd | SQuoteAttEnd
    694                 AttValEnds |= AttValEnd
    695                 AttValFollow = bitutil.Advance(AttValEnd)
    696                 AttValFollows |= AttValFollow
    697                 AfterWS = bitutil.ScanThru(AttValFollow, lex.WS)
    698                 AttListEnd |= AfterWS & AttListDelim
    699                 AttNameStart = AfterWS & ~AttListDelim
    700 
    701         # No more attribute values to process when AttNameStart == 0.
    702 
    703         callouts.AttNames = AttNameFollows - AttNameStarts
    704         callouts.AttVals = AttValFollows - AttValStarts
    705         STagEnds = AttListEnd & lex.RAngle
    706         # Mark any "/" characters found as the ends of empty element tags.
    707         callouts.EmptyTagMarks = bitutil.Advance(AttListEnd & lex.Slash)
    708         callouts.Tags = (STagEnds | callouts.EmptyTagMarks) - ElemNamePositions
    709 
    710         # Check for errors.
    711         ParseError |= AttValFollows & AttNameStarts # No intervening WS.
    712         ParseError |= AttNameStarts & AttNameFollows # Null AttName
    713         ParseError |= EqToCheck & ~lex.Equals # = not found where expected.
    714         ParseError |= AttValStarts & ~ (lex.DQuote | lex.SQuote)
    715         ParseError |= AttValEnds & ~ (lex.DQuote | lex.SQuote)
    716         ParseError |= callouts.EmptyTagMarks & ~lex.RAngle
    717 
    718         # End Tag Parsing
    719         EndTagEnds = bitutil.ScanThru(bitutil.ScanThru(bitutil.Advance(EndTagSeconds), lex.NameScan), lex.WS)
    720         ParseError |= EndTagEnds & ~lex.RAngle
    721         callouts.EndTags = EndTagEnds - EndTagSeconds
    722         callouts.error = ParseError
    723         ########## END OF MANUAL INLINING
    724 
    725 
    726 
     581        tags = parse_tags(lex, markup1.CtCDPI_mask, EOF_mask)
    727582
    728583        # All remaining "&" must be reference start characters; parse them.
    729         refs = parse_refs(lex, CT_callouts.CtCDPI_mask)
     584        refs = parse_refs(lex, markup1.CtCDPI_mask)
    730585
    731586        # Ensure that no occurrence of ]]> occurs outside of markup.
    732         CD_end_error = validate_no_CD_end(lex, CT_callouts, callouts)
     587        CD_end_error = validate_no_CD_end(lex, markup1, tags)
    733588
    734589        # Convert to UTF-16 bit streams.
     
    736591
    737592        # Consolidate and check for errors
    738         error = u8.error | xmlchar_error | CT_callouts.error | callouts.error | CD_end_error | refs.error
     593        error = u8.error | xmlchar_error | markup1.error | tags.error | CD_end_error | refs.error
    739594
    740595        # Consolidate the deletion_masks
    741         delmask = control.CRLF | refs.delmask | u16delmask # | CT_callouts.CDATA_delimiters
    742 
    743         return (CT_callouts, callouts, refs, u16hi, u16lo, delmask, error, lex, u16delmask, EOF_mask)
     596        delmask = control.CRLF | refs.delmask | u16delmask # | markup1.CDATA_delimiters
     597
     598        return (markup1, tags, refs, u16hi, u16lo, delmask, error, lex, u16delmask, EOF_mask)
    744599
    745600
Note: See TracChangeset for help on using the changeset viewer.