Changeset 409 for proto


Ignore:
Timestamp:
Jun 12, 2010, 3:09:17 PM (9 years ago)
Author:
eamiri
Message:

Compilable version of parabix2

Location:
proto/parabix2
Files:
1 added
1 edited

Legend:

Unmodified
Added
Removed
  • proto/parabix2/parabix2.py

    r407 r409  
     1# -*- coding: utf-8 -*-
    12#
    23# parabix2.py
     
    1819
    1920
    20 import bitutil
     21#import bitutil
    2122
    2223import byteclass
     
    2425import u8u16
    2526
    26 import sys
     27#import sys
    2728
    2829def validate_xmlchar(u8, control, lex, EOF_mask):
     
    3738"""
    3839        EF_BF_pending = bitutil.Advance(bitutil.Advance(u8.xEF) & u8.xBF)
    39         return (EF_BF_pending & (u8.xBE | u8.xBF)) | (control.x00_x1F &~ lex.WS & EOF_mask)
     40        ret = (EF_BF_pending & (u8.xBE | u8.xBF)) | (control.x00_x1F &~ lex.WS & EOF_mask)
     41        return ret
    4042
    4143
     
    537539        illegal ]]>: _______________________________________1______________________1_______________
    538540"""
    539         return lex.CD_end & ~(markup1.CtCDPI_mask | tags.Tags)
     541        ret = lex.CD_end & ~(markup1.CtCDPI_mask | tags.Tags)
     542        return ret
    540543
    541544def demo_validate_no_CD_end(u8data):
     
    554557
    555558
    556 def parabix_parse(u8data):
     559def main(u8data):
    557560        # Transpose to parallel bit streams and prepare an EOF mask.
    558561        (bit, EOF_mask) = bitutil.transpose_streams(u8data)
     
    573576        # Compute XML multiliterals such as <?, </, --, ]]>.
    574577        lex = add_multiliterals(lex)
    575 
     578       
     579       
     580        # THE FOLLOWING FUNCTION CALL IS MANUALLY INLINED
    576581        # Parse all comments, CDATA sections and processing instructions.
    577         markup1 = parse_CtCDPI(lex, EOF_mask)
    578 
     582        #markup1 = parse_CtCDPI(lex, EOF_mask)
     583        CT_callouts = CtCDPI_callouts()
     584        PI_starts = 0
     585        PI_ends = 0
     586        Ct_starts = 0
     587        Ct_ends = 0
     588        CD_starts = 0
     589        CD_ends = 0
     590        CtCDPI_starts = 0
     591        # Scanning streams
     592        CtCDPI_scan = ~(lex.CtCD_start | lex.PI_start) & EOF_mask
     593        Ct_end_scan = ~lex.DoubleHyphen & EOF_mask
     594        CD_end_scan = ~lex.CD_end & EOF_mask
     595        PI_end_scan = ~lex.PI_end & EOF_mask
     596        #
     597        # Initiate the scan
     598        CtCDPI_Cursor = 1
     599        CtCDPI_Cursor = bitutil.ScanThru(CtCDPI_Cursor, CtCDPI_scan)
     600        CtCDPI_Cursor &= EOF_mask
     601        while CtCDPI_Cursor:
     602                CtCDPI_starts |= CtCDPI_Cursor
     603                PI_Cursor = CtCDPI_Cursor & lex.PI_start
     604                CD_Ct_Cursor = bitutil.Advance(CtCDPI_Cursor & ~PI_Cursor)
     605                CD_Cursor = CD_Ct_Cursor & lex.LBracket
     606                Ct_Cursor = bitutil.Advance(CD_Ct_Cursor & lex.Hyphen) 
     607                PI_starts |= PI_Cursor
     608                CD_starts |= CD_Cursor
     609                Ct_starts |= Ct_Cursor
     610                Ct_Cursor = bitutil.Advance(Ct_Cursor)
     611                Ct_end_scan |= Ct_Cursor
     612                PI_Cursor = bitutil.ScanThru(PI_Cursor, PI_end_scan)
     613                CD_Cursor = bitutil.ScanThru(CD_Cursor, CD_end_scan)
     614                Ct_Cursor = bitutil.Advance(bitutil.ScanThru(Ct_Cursor, Ct_end_scan))
     615                PI_ends |= PI_Cursor
     616                CD_ends |= CD_Cursor
     617                Ct_ends |= Ct_Cursor
     618                CtCDPI_Cursor = PI_Cursor | CD_Cursor | Ct_Cursor
     619                CtCDPI_Cursor = bitutil.ScanThru(CtCDPI_Cursor, CtCDPI_scan)
     620                CtCDPI_Cursor &= EOF_mask
     621
     622        # End of loop: no remaining CtCDPI_Cursor
     623        CT_callouts.CD_span = CD_ends - CD_starts
     624        CT_callouts.Ct_span = Ct_ends - Ct_starts
     625        CT_callouts.PI_span = PI_ends - PI_starts
     626       
     627        CT_callouts.CtCDPI_mask |= bitutil.Advance(CD_ends | Ct_ends | PI_ends) - CtCDPI_starts
     628        CT_callouts.error = Ct_ends & ~lex.RAngle | Ct_starts & ~ lex.Hyphen
     629       
     630        # If any of the Comment, CDATA or PI markups are unterminated, it is an error.
     631        CT_callouts.error |= CT_callouts.CtCDPI_mask &~ EOF_mask
     632        ########## END OF MANUAL INLINING
     633       
     634        # THE FOLLOWING FUNCTION CALL IS MANUALLY INLINED
    579635        # All remaining "<" must be tag start characters; parse tags.
    580         tags = parse_tags(lex, markup1.CtCDPI_mask, EOF_mask)
     636        #tags = parse_tags(lex, CT_callouts.CtCDPI_mask, EOF_mask)
     637
     638        #callouts = tag_callouts()
     639       
     640        # Delimiters for scans.
     641        DQuoteScan = ~(lex.DQuote | lex.LAngle) & EOF_mask
     642        SQuoteScan = ~(lex.SQuote | lex.LAngle) & EOF_mask
     643        AttListDelim = lex.Slash | lex.RAngle
     644       
     645        # Start the parallel parsing by inspecting the character
     646        # after the opening "<" of a tag.
     647        LAngleFollow = bitutil.Advance(lex.LAngle) &~ CtCDPI_mask
     648        ElemNamePositions = LAngleFollow & ~lex.Slash
     649        EndTagSeconds = LAngleFollow & lex.Slash
     650       
     651        # Start Tag/Empty Element Tag Parsing
     652
     653        # Advance all cursors by scanning through the tag name.
     654        ElemNameFollows = bitutil.ScanThru(ElemNamePositions, lex.NameScan)
     655        # Must have at least one name character for a legal start tag.
     656        # Mark any occurrences of null names as errors.
     657        ParseError = ElemNamePositions & ElemNameFollows
     658        callouts.ElemNames = ElemNameFollows - ElemNamePositions
     659       
     660        # Initialize the accumulators for attribute name and value positions.
     661        AttNameStarts = 0 
     662        AttNameFollows = 0
     663        EqToCheck = 0
     664        AttValStarts = 0
     665        AttValEnds = 0
     666        AttValFollows = 0
     667
     668        # After the element name, there may or may not be an attlist.
     669        AfterWS = bitutil.ScanThru(ElemNameFollows, lex.WS)
     670        AttListEnd = AfterWS & AttListDelim
     671        AttNameStart = AfterWS & ~AttListDelim
     672        # At least one WS character is required between ElemNames and AttNames.
     673        ParseError |= ElemNameFollows & AttNameStart
     674
     675        #
     676        # The following loop iterates through attributes within a start tag.
     677        # Because all start tags are processed in parallel, the number of
     678        # iterations is the maximum number of attributes found in any one
     679        # start tag, plus one.
     680        while AttNameStart:
     681                AttNameStarts |= AttNameStart
     682                AttNameFollow = bitutil.ScanThru(AttNameStart, lex.NameScan)
     683                AttNameFollows |= AttNameFollow
     684                # Scan through WS to the expected '=' delimiter.
     685                EqExpected = bitutil.ScanThru(AttNameFollow, lex.WS)
     686                EqToCheck |= EqExpected
     687                AttValPos = bitutil.ScanThru(bitutil.Advance(EqExpected), lex.WS)
     688                AttValStarts |= AttValPos
     689                DQuoteAttVal = AttValPos & lex.DQuote
     690                SQuoteAttVal = AttValPos & lex.SQuote
     691                DQuoteAttEnd = bitutil.ScanThru(bitutil.Advance(DQuoteAttVal), DQuoteScan)
     692                SQuoteAttEnd = bitutil.ScanThru(bitutil.Advance(SQuoteAttVal), SQuoteScan)
     693                AttValEnd = DQuoteAttEnd | SQuoteAttEnd
     694                AttValEnds |= AttValEnd
     695                AttValFollow = bitutil.Advance(AttValEnd)
     696                AttValFollows |= AttValFollow
     697                AfterWS = bitutil.ScanThru(AttValFollow, lex.WS)
     698                AttListEnd |= AfterWS & AttListDelim
     699                AttNameStart = AfterWS & ~AttListDelim
     700
     701        # No more attribute values to process when AttNameStart == 0.
     702
     703        callouts.AttNames = AttNameFollows - AttNameStarts
     704        callouts.AttVals = AttValFollows - AttValStarts
     705        STagEnds = AttListEnd & lex.RAngle
     706        # Mark any "/" characters found as the ends of empty element tags.
     707        callouts.EmptyTagMarks = bitutil.Advance(AttListEnd & lex.Slash)
     708        callouts.Tags = (STagEnds | callouts.EmptyTagMarks) - ElemNamePositions
     709
     710        # Check for errors.
     711        ParseError |= AttValFollows & AttNameStarts # No intervening WS.
     712        ParseError |= AttNameStarts & AttNameFollows # Null AttName
     713        ParseError |= EqToCheck & ~lex.Equals # = not found where expected.
     714        ParseError |= AttValStarts & ~ (lex.DQuote | lex.SQuote)
     715        ParseError |= AttValEnds & ~ (lex.DQuote | lex.SQuote)
     716        ParseError |= callouts.EmptyTagMarks & ~lex.RAngle
     717
     718        # End Tag Parsing
     719        EndTagEnds = bitutil.ScanThru(bitutil.ScanThru(bitutil.Advance(EndTagSeconds), lex.NameScan), lex.WS)
     720        ParseError |= EndTagEnds & ~lex.RAngle
     721        callouts.EndTags = EndTagEnds - EndTagSeconds
     722        callouts.error = ParseError
     723        ########## END OF MANUAL INLINING
     724
     725
     726
    581727
    582728        # All remaining "&" must be reference start characters; parse them.
    583         refs = parse_refs(lex, markup1.CtCDPI_mask)
     729        refs = parse_refs(lex, CT_callouts.CtCDPI_mask)
    584730
    585731        # Ensure that no occurrence of ]]> occurs outside of markup.
    586         CD_end_error = validate_no_CD_end(lex, markup1, tags)
     732        CD_end_error = validate_no_CD_end(lex, CT_callouts, callouts)
    587733
    588734        # Convert to UTF-16 bit streams.
     
    590736
    591737        # Consolidate and check for errors
    592         error = u8.error | xmlchar_error | markup1.error | tags.error | CD_end_error | refs.error
     738        error = u8.error | xmlchar_error | CT_callouts.error | callouts.error | CD_end_error | refs.error
    593739
    594740        # Consolidate the deletion_masks
    595         delmask = control.CRLF | refs.delmask | u16delmask # | markup1.CDATA_delimiters
    596 
    597         return (markup1, tags, refs, u16hi, u16lo, delmask, error, lex, u16delmask, EOF_mask)
     741        delmask = control.CRLF | refs.delmask | u16delmask # | CT_callouts.CDATA_delimiters
     742
     743        return (CT_callouts, callouts, refs, u16hi, u16lo, delmask, error, lex, u16delmask, EOF_mask)
    598744
    599745
Note: See TracChangeset for help on using the changeset viewer.