Changeset 551 for proto


Ignore:
Timestamp:
Jul 29, 2010, 7:35:40 AM (9 years ago)
Author:
cameron
Message:

Conditional RBracket, cleanup

File:
1 edited

Legend:

Unmodified
Added
Removed
  • proto/parabix2/parabix2_compilable.py

    r550 r551  
    11# -*- coding: utf-8 -*-
    22#
    3 # parabix2.py
     3# parabix2_compilable.py
    44#
    55# Parallel XML Parsing with Bitstream Addition
    66# - Complete prototype for all bitstream computations in Parabix2
     7# - optimized for compilation
    78#
    89# Robert D. Cameron
    9 # August 20, 2009
     10# July 29, 2010
    1011#
    11 #----------------------------------------------------------------------------
    12 #
    13 # We use python's unlimited precision integers for unbounded bit streams.
    14 # This permits simple logical operations on the entire stream.
    15 # Assumption: bitstreams are little-endian (e.g., as on x86).
    16 #
    17 #----------------------------------------------------------------------------
    18 #
    19 
    2012
    2113#import bitutil
    22 
    23 #import byteclass
    24 
    25 #import u8u16
    2614
    2715#import sys
     
    196184
    197185
    198 def validate_xmlchar(u8, control, lex, EOF_mask):
    199         r"""Compute an error stream marking characters illegal in XML:
    200         (1) Control characters in the range 0x00-0x1F except HT, LF, CR
    201         (2) 0xFFFF and 0xFFFE, having UTF-8 encodings 0xEF 0xBF 0xBF and 0xEF 0xBF 0xBE.
    202 
    203         >>> demo_validate_xmlchar('plaintext (good: \x09) (bad: \x03) (bad \xEF\xBF\xBF) (good \xEF\xBF\xBC)')
    204         input high nybbles: 7666676772266663202226663202226662ebb22266662ebb2
    205         input low nybbles : 0c19e4584087ff4a09908214a039082140fff9087ff40ffc9
    206         illegal XML chars : __________________________1_________1_____________
    207 """
    208         EF_BF_pending = bitutil.Advance(u8.xEF_scope & u8.xBF)
    209         ret = (EF_BF_pending & (u8.xBE | u8.xBF)) | (control.x00_x1F &~ lex.WS & EOF_mask)
    210         return ret
    211 
    212 
    213 def demo_validate_xmlchar(u8data):
    214         lgth = len(u8data)
    215         (bit, EOF_mask) = bitutil.transpose_streams(u8data)
    216         (u8, control, lex) = byteclass.classify_bytes(bit)
    217         bitutil.print_aligned_streams([('input high nybbles', bitutil.high_nybble_stream(u8data)),
    218                             ('input low nybbles', bitutil.low_nybble_stream(u8data)),
    219                             ('illegal XML chars', bitutil.bitstream2string(validate_xmlchar(u8, control, lex, EOF_mask), lgth+1))])
    220 
    221 def normalize_line_breaks(control, bit):
    222         r"""Convert CRs to LFs and mark CRLF occurrences for deletion.
    223 
    224         >>> demo_line_breaks('ab \r\n  cd \r  ef \r ')
    225         input high nybbles: 662002266202266202
    226         input low nybbles : 120da00340d00560d0
    227         CR                : ___1______1_____1_
    228         LF                : ____1_____________
    229         CRLF              : ____1_____________
    230 """
    231         control.CRLF = control.CR_scope & control.LF
    232         # Convert CRs to LFs (flip bits 5, 6 and 7 with xor).
    233         bit[5] ^= control.CR
    234         bit[6] ^= control.CR
    235         bit[7] ^= control.CR
    236         return (control, bit)
    237 
    238 def demo_line_breaks(u8data):
    239         lgth = len(u8data)
    240         (bit, EOF_mask) = bitutil.transpose_streams(u8data)
    241         (u8, control, lex) = byteclass.classify_bytes(bit)
    242         (control, bit) = normalize_line_breaks(control, bit)
    243         bitutil.print_aligned_u8_byte_streams([('input high nybbles', bitutil.high_nybble_stream(u8data)),
    244                             ('input low nybbles', bitutil.low_nybble_stream(u8data)),
    245                             ('CR', bitutil.bitstream2string(control.CR, lgth)),
    246                             ('LF', bitutil.bitstream2string(control.LF, lgth)),
    247                             ('CRLF', bitutil.bitstream2string(control.CRLF, lgth))])
    248 
    249 
    250 
    251 
    252 
    253 def add_multiliterals(lex):
    254         """Extend the byte-based lexical item streams for some important
    255         multibyte literals.
    256        
    257         >>> demo_multiliterals("  <?php?>  <!--  -->  <![CDATA[  ]]> ")
    258         input data  :   <?php?>  <!--  -->  <![CDATA[  ]]>
    259         PI_start    : ___1_________________________________
    260         CtCD_start  : ____________1__________1_____________
    261         EndTag_start: _____________________________________
    262         CD_end      : ___________________________________1_
    263         DoubleHyphen: ______________1___1__________________
    264         PI_end      : ________1____________________________
    265         """
    266 
    267         lex.PI_start = lex.LAngle_scope & lex.QMark
    268         lex.CtCD_start = lex.LAngle_scope & lex.Exclam
    269         lex.CtCDPI_start = lex.PI_start | lex.CtCD_start
    270         lex.EndTag_start = lex.LAngle_scope & lex.Slash
    271         lex.CD_end = bitutil.Advance(lex.RBracket_scope & lex.RBracket) & lex.RAngle
    272         lex.DoubleHyphen = lex.Hyphen_scope & lex.Hyphen
    273         lex.PI_end = lex.QMark_scope & lex.RAngle
    274         return lex
    275 
    276 def demo_multiliterals(u8data):
    277         lgth = len(u8data)
    278         (bit, EOF_mask) = bitutil.transpose_streams(u8data)
    279         (u8, control, lex) = byteclass.classify_bytes(bit)
    280         lex = add_multiliterals(lex)
    281         bitutil.print_aligned_u8_byte_streams([('input data', u8data),
    282                             ('PI_start', bitutil.bitstream2string(lex.PI_start, lgth)),
    283                             ('CtCD_start', bitutil.bitstream2string(lex.CtCD_start, lgth)),
    284                             ('EndTag_start', bitutil.bitstream2string(lex.EndTag_start, lgth)),
    285                             ('CD_end', bitutil.bitstream2string(lex.CD_end, lgth)),
    286                             ('DoubleHyphen', bitutil.bitstream2string(lex.DoubleHyphen, lgth)),
    287                             ('PI_end', bitutil.bitstream2string(lex.PI_end, lgth))])
    288 
    289186class CtCDPI_callouts:
    290187        CD_span = 0
     
    294191        error = 0
    295192       
    296 def parse_CtCDPI(lex, EOF_mask):
    297         """Parse all comments, CDATA sections and processing instructions.
    298        
    299         Return bitstreams marking the extent of these markup items,
    300         excluding initial and final bracketting.
    301        
    302         >>> demo_CtCDPI(' <?php?>  <!-- example -->  <![CDATA[  shift: a<<1 ]]> ')
    303         input data :  <?php?>  <!-- example -->  <![CDATA[  shift: a<<1 ]]>
    304         CD_span    : ______________________________11111111111111111111111__
    305         Ct_span    : _____________111111111111______________________________
    306         PI_span    : __11111________________________________________________
    307         CtCDPI_mask: __111111___111111111111111___1111111111111111111111111_
    308         error      : ________________________________________________________
    309        
    310         Comments are terminated by a double hyphen, which must be immediately followed by the closing ">".
    311        
    312         >>> demo_CtCDPI(' <!--  <?php?>  --   <!-- -->')
    313         input data :  <!--  <?php?>  --   <!-- -->
    314         CD_span    : _____________________________
    315         Ct_span    : ____11111111111111______1111_
    316         PI_span    : _____________________________
    317         CtCDPI_mask: __11111111111111111___1111111
    318         error      : __________________1___________
    319 
    320 
    321 
    322 """
    323         callouts = CtCDPI_callouts()
    324         PI_starts = 0
    325         PI_ends = 0
    326         Ct_starts = 0
    327         Ct_ends = 0
    328         CD_starts = 0
    329         CD_ends = 0
    330         CtCDPI_starts = 0
    331         # Scanning streams
    332         CtCDPI_scan = ~lex.CtCDPI_start & EOF_mask
    333         Ct_end_scan = ~lex.DoubleHyphen & EOF_mask
    334         CD_end_scan = ~lex.CD_end & EOF_mask
    335         PI_end_scan = ~lex.PI_end & EOF_mask
    336         #
    337         # Initiate the scan
    338         CtCDPI_Cursor = 1
    339         CtCDPI_Cursor = bitutil.ScanThru(CtCDPI_Cursor, CtCDPI_scan)
    340         CtCDPI_Cursor &= EOF_mask
    341         while CtCDPI_Cursor:
    342                 CtCDPI_starts |= CtCDPI_Cursor
    343                 PI_Cursor = CtCDPI_Cursor & lex.PI_start
    344                 CD_Ct_Cursor = bitutil.Advance(CtCDPI_Cursor & ~PI_Cursor)
    345                 CD_Cursor = CD_Ct_Cursor & lex.LBracket
    346                 Ct_Cursor = bitutil.Advance(CD_Ct_Cursor & lex.Hyphen) 
    347                 PI_starts |= PI_Cursor
    348                 CD_starts |= CD_Cursor
    349                 Ct_starts |= Ct_Cursor
    350                 Ct_Cursor = bitutil.Advance(Ct_Cursor)
    351                 Ct_end_scan |= Ct_Cursor
    352                 PI_Cursor = bitutil.ScanThru(PI_Cursor, PI_end_scan)
    353                 CD_Cursor = bitutil.ScanThru(CD_Cursor, CD_end_scan)
    354                 Ct_Cursor = bitutil.Advance(bitutil.ScanThru(Ct_Cursor, Ct_end_scan))
    355                 PI_ends |= PI_Cursor
    356                 CD_ends |= CD_Cursor
    357                 Ct_ends |= Ct_Cursor
    358                 CtCDPI_Cursor = PI_Cursor | CD_Cursor | Ct_Cursor
    359                 CtCDPI_Cursor = bitutil.ScanThru(CtCDPI_Cursor, CtCDPI_scan)
    360                 CtCDPI_Cursor &= EOF_mask
    361         # End of loop: no remaining CtCDPI_Cursor
    362         callouts.CD_span = CD_ends - CD_starts
    363         callouts.Ct_span = Ct_ends - Ct_starts
    364         callouts.PI_span = PI_ends - PI_starts
    365        
    366         callouts.CtCDPI_mask |= bitutil.Advance(CD_ends | Ct_ends | PI_ends) - CtCDPI_starts
    367         callouts.error = Ct_ends & ~lex.RAngle | Ct_starts & ~ lex.Hyphen
    368         # If any of the Comment, CDATA or PI markups are unterminated, it is an error.
    369         callouts.error |= callouts.CtCDPI_mask &~ EOF_mask
    370         return callouts
    371 
    372 def demo_CtCDPI(u8data):
    373         lgth = len(u8data)
    374         (bit, EOF_mask) = bitutil.transpose_streams(u8data)
    375         (u8, control, lex) = byteclass.classify_bytes(bit)
    376         lex = add_multiliterals(lex)
    377         markup = parse_CtCDPI(lex, EOF_mask)
    378         bitutil.print_aligned_u8_byte_streams([('input data', u8data),
    379                             ('CD_span', bitutil.bitstream2string(markup.CD_span, lgth)),
    380                             ('Ct_span', bitutil.bitstream2string(markup.Ct_span, lgth)),
    381                             ('PI_span', bitutil.bitstream2string(markup.PI_span, lgth)),
    382                             ('CtCDPI_mask', bitutil.bitstream2string(markup.CtCDPI_mask, lgth)),
    383                             ('error', bitutil.bitstream2string(markup.error, lgth+1))])
    384193
    385194
     
    390199        delmask = 0
    391200        error = 0
    392 
    393 def parse_refs(lex, CtCDPI_mask):
    394         """Parse and call out all general and character references.
    395         Mark all but the closing semicolon for deletion.
    396        
    397         >>> demo_refs(" &gt;  &#13;  &#x0a;  ")
    398         input data       :  &gt;  &#13;  &#x0a; 
    399         entity refs      : __11__________________
    400         decimal char refs: _________11___________
    401         hex char refs    : _________________11___
    402         ref delmask      : _111___1111___11111___
    403         errors           : _______________________
    404 
    405         Empty numeric references are reported as errors.
    406         >>> demo_refs(" &#;       &#x; ")
    407         input data       :  &#;       &#x;
    408         entity refs      : ________________
    409         decimal char refs: ________________
    410         hex char refs    : ________________
    411         ref delmask      : _11________111__
    412         errors           : ___1__________1__
    413 
    414         Improperly terminated or unterminated references (lacking ";") are also errors.
    415         >>> demo_refs("  &gt:  &#456a;  &#xab:  &unterminated")
    416         input data       :   &gt:  &#456a;  &#xab:  &unterminated
    417         entity refs      : ___111____________________111111111111
    418         decimal char refs: __________111_________________________
    419         hex char refs    : ____________________11________________
    420         ref delmask      : __1111__11111____11111___1111111111111
    421         errors           : ______1______1________1_______________1
    422 """
    423         CallOuts = ref_callouts()
    424         Ref2 = lex.RefStart_scope &~ CtCDPI_mask
    425         NumRef2 = Ref2 & lex.Hash
    426         GenRef2 = Ref2 &~ lex.Hash
    427         NumRef3 = bitutil.Advance(NumRef2)
    428         HexRef3 = NumRef3 & lex.x
    429         DecRef3 = NumRef3 &~ lex.x
    430         HexRef4 = bitutil.Advance(HexRef3)
    431         GenRefEnds = bitutil.ScanThru(GenRef2, lex.NameScan)
    432         DecRefEnds = bitutil.ScanThru(DecRef3, lex.Digit)
    433         HexRefEnds = bitutil.ScanThru(HexRef4, lex.Hex)
    434         # Error checks
    435         # At least one digit required for DecRef, one hex digit for HexRef.
    436         error1 = DecRef3 &~ lex.Digit
    437         error2 = HexRef4 &~ lex.Hex
    438         # Semicolon terminator required (also covers unterminated at EOF).
    439         error3 = (GenRefEnds | DecRefEnds | HexRefEnds) &~ lex.Semicolon
    440         CallOuts.GenRefs = GenRefEnds - GenRef2
    441         CallOuts.DecRefs = DecRefEnds - DecRef3
    442         CallOuts.HexRefs = HexRefEnds - HexRef4
    443         # Mark references for deletion, but leave the trailing semicolon as
    444         # the point for insertion of the "expansion" text (most often a
    445         # single character).
    446         CallOuts.delmask = (GenRefEnds | DecRefEnds | HexRefEnds) - lex.RefStart
    447         CallOuts.error = error1 | error2 | error3
    448         return CallOuts
    449 
    450 def demo_refs(u8data):
    451         lgth = len(u8data)
    452         (bit, EOF_mask) = bitutil.transpose_streams(u8data)
    453         (u8, control, lex) = byteclass.classify_bytes(bit)
    454         callouts = parse_refs(lex, 0)
    455         bitutil.print_aligned_u8_byte_streams([('input data', u8data),
    456                             ('entity refs', bitutil.bitstream2string(callouts.GenRefs, lgth)),
    457                             ('decimal char refs', bitutil.bitstream2string(callouts.DecRefs, lgth)),
    458                             ('hex char refs', bitutil.bitstream2string(callouts.HexRefs, lgth)),
    459                             ('ref delmask', bitutil.bitstream2string(callouts.delmask, lgth)),
    460                             ('errors', bitutil.bitstream2string(callouts.error, lgth+1))])
    461201
    462202
     
    469209        EndTags = 0
    470210        error = 0
    471        
    472         # POTENTIAL ADDITIONAL FIELDS
    473         # StartTagEnds = 0
    474         # EmptyTagEnds = 0     
    475         # EndTagEnds = 0
    476 
    477 def parse_tags(lex, CtCDPI_mask, EOF_mask):
    478         """Parse start, empty and end tags, calling out element names, attribute
    479         names and values, empty tag positions, and tag extents.
    480 
    481         >>> demo_tags("<root><t1>text</t1><t2 a1='foo' a2 = 'fie'>more</t2><tag3 att3='b'/></root>")
    482         input data      : <root><t1>text</t1><t2 a1='foo' a2 = 'fie'>more</t2><tag3 att3='b'/></root>
    483         element names   : _1111__11___________11_______________________________1111__________________
    484         attribute names : _______________________11_______11________________________1111_____________
    485         attribute values: __________________________11111______11111_____________________111_________
    486         empty tag marks : ___________________________________________________________________1_______
    487         end tags        : _______________111______________________________111__________________11111_
    488         start/empty tags: _1111__11___________1111111111111111111111___________11111111111111________
    489         errors          : ____________________________________________________________________________
    490 
    491         Attributes can use double quotes.
    492 
    493         >>> demo_tags('<dquote_atts a1="1234" attribute2="4321"/>')
    494         input data      : <dquote_atts a1="1234" attribute2="4321"/>
    495         element names   : _11111111111______________________________
    496         attribute names : _____________11________1111111111_________
    497         attribute values: ________________111111____________111111__
    498         empty tag marks : _________________________________________1
    499         end tags        : __________________________________________
    500         start/empty tags: _1111111111111111111111111111111111111111_
    501         errors          : ___________________________________________
    502 
    503         Syntax errors of various types are identified with the error stream.
    504 
    505         1. Element name missing errors.
    506 
    507         >>> demo_tags("< noname='flawed'/> ")
    508         input data      : < noname='flawed'/>
    509         element names   : ____________________
    510         attribute names : __111111____________
    511         attribute values: _________11111111___
    512         empty tag marks : __________________1_
    513         end tags        : ____________________
    514         start/empty tags: _11111111111111111__
    515         errors          : _1___________________
    516 
    517         2. Missing attribute names.
    518 
    519         >>> demo_tags("<noatt ='flawed'/>  <one_att a1='good' = 'bad'> oops </one_att>")
    520         input data      : <noatt ='flawed'/>  <one_att a1='good' = 'bad'> oops </one_att>
    521         element names   : _11111_______________1111111___________________________________
    522         attribute names : _____________________________11________________________________
    523         attribute values: ________11111111________________111111___11111_________________
    524         empty tag marks : _________________1_____________________________________________
    525         end tags        : ______________________________________________________11111111_
    526         start/empty tags: _1111111111111111____1111111111111111111111111_________________
    527         errors          : _______1_______________________________1________________________
    528 
    529         3. Missing or incorrect = sign.
    530 
    531         >>> demo_tags('<errata plusforeq+"5678" noequals"90" />')
    532         input data      : <errata plusforeq+"5678" noequals"90" />
    533         element names   : _111111_________________________________
    534         attribute names : ________111111111________11111111_______
    535         attribute values: __________________111111__________111111
    536         empty tag marks : ________________________________________
    537         end tags        : ________________________________________
    538         start/empty tags: _111111111111111111111111111111111111111
    539         errors          : _________________1_______________11______
    540 
    541         4.  Missing whitespace
    542 
    543         >>> demo_tags("<jammed att='value'att2='v2' />")
    544         input data      : <jammed att='value'att2='v2' />
    545         element names   : _111111________________________
    546         attribute names : ________111________1111________
    547         attribute values: ____________1111111_____1111___
    548         empty tag marks : ______________________________1
    549         end tags        : _______________________________
    550         start/empty tags: _11111111111111111111111111111_
    551         errors          : ___________________1____________
    552 
    553         5.  Extra whitespace in an empty tag.
    554 
    555         >>> demo_tags("<extrawhite / >")
    556         input data      : <extrawhite / >
    557         element names   : _1111111111____
    558         attribute names : _______________
    559         attribute values: _______________
    560         empty tag marks : _____________1_
    561         end tags        : _______________
    562         start/empty tags: _111111111111__
    563         errors          : _____________1__
    564 
    565         6.  Unterminated or incorrectly terminated attribute values
    566 
    567         >>> demo_tags("<badattvalues a='blud<   b='455>   ")
    568         input data      : <badattvalues a='blud<   b='455>   
    569         element names   : _111111111111______________________
    570         attribute names : ______________1__________1_________
    571         attribute values: ________________111111_____11111111
    572         empty tag marks : ___________________________________
    573         end tags        : ___________________________________
    574         start/empty tags: _111111111111111111111_111111111111
    575         errors          : _____________________11____________1
    576 
    577         7.  Unterminated tags
    578 
    579         >>> demo_tags("<unterminated a='245'  ")
    580         input data      : <unterminated a='245' 
    581         element names   : _111111111111__________
    582         attribute names : ______________1________
    583         attribute values: ________________11111__
    584         empty tag marks : _______________________
    585         end tags        : _______________________
    586         start/empty tags: _1111111111111111111111
    587         errors          : _______________________1
    588 
    589 """
    590         callouts = tag_callouts()
    591        
    592         # Delimiters for scans.
    593         DQuoteScan = ~(lex.DQuote | lex.LAngle) & EOF_mask
    594         SQuoteScan = ~(lex.SQuote | lex.LAngle) & EOF_mask
    595         AttListDelim = lex.Slash | lex.RAngle
    596        
    597         # Start the parallel parsing by inspecting the character
    598         # after the opening "<" of a tag.
    599         LAngleFollow = lex.LAngle_scope &~ CtCDPI_mask
    600         ElemNamePositions = LAngleFollow & ~lex.Slash
    601         EndTagSeconds = LAngleFollow & lex.Slash
    602        
    603         # Start Tag/Empty Element Tag Parsing
    604 
    605         # Advance all cursors by scanning through the tag name.
    606         ElemNameFollows = bitutil.ScanThru(ElemNamePositions, lex.NameScan)
    607         # Must have at least one name character for a legal start tag.
    608         # Mark any occurrences of null names as errors.
    609         ParseError = ElemNamePositions & ElemNameFollows
    610         callouts.ElemNames = ElemNameFollows - ElemNamePositions
    611        
    612         # Initialize the accumulators for attribute name and value positions.
    613         AttNameStarts = 0 
    614         AttNameFollows = 0
    615         EqToCheck = 0
    616         AttValStarts = 0
    617         AttValEnds = 0
    618         AttValFollows = 0
    619 
    620         # After the element name, there may or may not be an attlist.
    621         AfterWS = bitutil.ScanThru(ElemNameFollows, lex.WS)
    622         AttListEnd = AfterWS & AttListDelim
    623         AttNameStart = AfterWS & ~AttListDelim
    624         # At least one WS character is required between ElemNames and AttNames.
    625         ParseError |= ElemNameFollows & AttNameStart
    626 
    627         #
    628         # The following loop iterates through attributes within a start tag.
    629         # Because all start tags are processed in parallel, the number of
    630         # iterations is the maximum number of attributes found in any one
    631         # start tag, plus one.
    632         while AttNameStart:
    633                 AttNameStarts |= AttNameStart
    634                 AttNameFollow = bitutil.ScanThru(AttNameStart, lex.NameScan)
    635                 AttNameFollows |= AttNameFollow
    636                 # Scan through WS to the expected '=' delimiter.
    637                 EqExpected = bitutil.ScanThru(AttNameFollow, lex.WS)
    638                 EqToCheck |= EqExpected
    639                 AttValPos = bitutil.ScanThru(bitutil.Advance(EqExpected), lex.WS)
    640                 AttValStarts |= AttValPos
    641                 DQuoteAttVal = AttValPos & lex.DQuote
    642                 SQuoteAttVal = AttValPos & lex.SQuote
    643                 DQuoteAttEnd = bitutil.ScanThru(bitutil.Advance(DQuoteAttVal), DQuoteScan)
    644                 SQuoteAttEnd = bitutil.ScanThru(bitutil.Advance(SQuoteAttVal), SQuoteScan)
    645                 AttValEnd = DQuoteAttEnd | SQuoteAttEnd
    646                 AttValEnds |= AttValEnd
    647                 AttValFollow = bitutil.Advance(AttValEnd)
    648                 AttValFollows |= AttValFollow
    649                 AfterWS = bitutil.ScanThru(AttValFollow, lex.WS)
    650                 AttListEnd |= AfterWS & AttListDelim
    651                 AttNameStart = AfterWS & ~AttListDelim
    652 
    653         # No more attribute values to process when AttNameStart == 0.
    654 
    655         callouts.AttNames = AttNameFollows - AttNameStarts
    656         callouts.AttVals = AttValFollows - AttValStarts
    657         STagEnds = AttListEnd & lex.RAngle
    658         # Mark any "/" characters found as the ends of empty element tags.
    659         callouts.EmptyTagMarks = bitutil.Advance(AttListEnd & lex.Slash)
    660         callouts.Tags = (STagEnds | callouts.EmptyTagMarks) - ElemNamePositions
    661 
    662         # Check for errors.
    663         ParseError |= AttValFollows & AttNameStarts # No intervening WS.
    664         ParseError |= AttNameStarts & AttNameFollows # Null AttName
    665         ParseError |= EqToCheck & ~lex.Equals # = not found where expected.
    666         ParseError |= AttValStarts & ~ (lex.DQuote | lex.SQuote)
    667         ParseError |= AttValEnds & ~ (lex.DQuote | lex.SQuote)
    668         ParseError |= callouts.EmptyTagMarks & ~lex.RAngle
    669 
    670         # End Tag Parsing
    671         EndTagEnds = bitutil.ScanThru(bitutil.ScanThru(bitutil.Advance(EndTagSeconds), lex.NameScan), lex.WS)
    672         ParseError |= EndTagEnds & ~lex.RAngle
    673         callouts.EndTags = EndTagEnds - EndTagSeconds
    674         callouts.error = ParseError
    675 
    676         # POTENTIAL ADDITIONAL FIELDS
    677         # callouts.StartTagEnds = STagEnds
    678         # callouts.EmptyTagEnds = bitutil.Advance(callouts.EmptyTagMarks)
    679         # callouts.EndTagEnds = EndTagEnds
    680        
    681         return callouts
    682 
    683 def demo_tags(u8data):
    684         lgth = len(u8data)
    685         (bit, EOF_mask) = bitutil.transpose_streams(u8data)
    686         (u8, control, lex) = byteclass.classify_bytes(bit)
    687         lex = add_multiliterals(lex)
    688         markup1 = parse_CtCDPI(lex, EOF_mask)
    689         callouts = parse_tags(lex, markup1.CtCDPI_mask, EOF_mask)
    690         bitutil.print_aligned_u8_byte_streams([('input data', u8data),
    691                             ('element names', bitutil.bitstream2string(callouts.ElemNames, lgth)),
    692                             ('attribute names', bitutil.bitstream2string(callouts.AttNames, lgth)),
    693                             ('attribute values', bitutil.bitstream2string(callouts.AttVals, lgth)),
    694                             ('empty tag marks', bitutil.bitstream2string(callouts.EmptyTagMarks, lgth)),
    695                             ('end tags', bitutil.bitstream2string(callouts.EndTags, lgth)),
    696                             ('start/empty tags', bitutil.bitstream2string(callouts.Tags, lgth)),
    697                             ('errors', bitutil.bitstream2string(callouts.error, lgth+1))])
    698 
    699 
    700 
    701 def validate_no_CD_end(lex, markup1, tags):
    702         """Find illegal occurrences of ]]> in text (outside of markup).
    703 
    704         >>> demo_validate_no_CD_end(' <!-- OK: ]]>  --> <![CDATA OK  ]]>  ]]> <tag att=" ]]> "/> ]]>  <?php ]]> ?> ')
    705         input data :  <!-- OK: ]]>  --> <![CDATA OK  ]]>  ]]> <tag att=" ]]> "/> ]]>  <?php ]]> ?>
    706         CtCDPI_mask: __1111111111111111__111111111111111_______________________________11111111111_
    707         tags       : __________________________________________1111111111111111____________________
    708         illegal ]]>: _______________________________________1______________________1_______________
    709 """
    710         ret = lex.CD_end & ~(markup1.CtCDPI_mask | tags.Tags)
    711         return ret
    712 
    713 def demo_validate_no_CD_end(u8data):
    714         lgth = len(u8data)
    715         (bit, EOF_mask) = bitutil.transpose_streams(u8data)
    716         (u8, control, lex) = byteclass.classify_bytes(bit)
    717         lex = add_multiliterals(lex)
    718         markup1 = parse_CtCDPI(lex, EOF_mask)
    719         tags = parse_tags(lex, markup1.CtCDPI_mask, EOF_mask)
    720         error = validate_no_CD_end(lex, markup1, tags)
    721         bitutil.print_aligned_u8_byte_streams([('input data', u8data),
    722                             ('CtCDPI_mask', bitutil.bitstream2string(markup1.CtCDPI_mask, lgth)),
    723                             ('tags', bitutil.bitstream2string(tags.Tags, lgth)),
    724                             ('illegal ]]>', bitutil.bitstream2string(error, lgth))])
    725 
    726211
    727212
     
    898383        lex.Hyphen_scope = bitutil.Advance(lex.Hyphen)
    899384        lex.QMark_scope = bitutil.Advance(lex.QMark)
    900         lex.RBracket_scope = bitutil.Advance(lex.RBracket)
     385
     386        if lex.RBracket:
     387                lex.RBracket_scope = bitutil.Advance(lex.RBracket)
     388                lex.CD_end = bitutil.Advance(lex.RBracket_scope & lex.RBracket) & lex.RAngle
    901389
    902390        # Compute XML multiliterals such as <?, </, --, ]]>.
     
    906394        lex.CtCDPI_start = lex.PI_start | lex.CtCD_start
    907395        lex.EndTag_start = lex.LAngle_scope & lex.Slash
    908         lex.CD_end = bitutil.Advance(lex.RBracket_scope & lex.RBracket) & lex.RAngle
     396
     397        #if lex.CtCDPI_start:
    909398        lex.DoubleHyphen = lex.Hyphen_scope & lex.Hyphen
    910399        lex.PI_end = lex.QMark_scope & lex.RAngle
     
    1044533        # Mark any "/" characters found as the ends of empty element tags.
    1045534        callouts.EmptyTagMarks = bitutil.Advance(AttListEnd & lex.Slash)
    1046         # Not needed for xmlwf
    1047         #callouts.Tags = (STagEnds | callouts.EmptyTagMarks) - ElemNamePositions
     535        callouts.Tags = (STagEnds | callouts.EmptyTagMarks) - ElemNamePositions
    1048536       
    1049537        # Check for errors.
     
    1098586
    1099587        # Ensure that no occurrence of ]]> occurs outside of markup.
    1100         CD_end_error = validate_no_CD_end(lex, CT_callouts, callouts)
    1101 
    1102         # Convert to UTF-16 bit streams.
    1103         #(u16hi, u16lo, u16delmask) = u8u16.u8u16(u8, bit)
     588        #CD_end_error = validate_no_CD_end(lex, CT_callouts, callouts)
     589        CD_end_error = lex.CD_end & ~(CT_callouts.CtCDPI_mask | callouts.Tags)
     590
    1104591
    1105592        # Consolidate and check for errors
    1106593        error_mask = u8.error | xmlchar_error | CT_callouts.error | callouts.error | CD_end_error | refs.error
    1107594
    1108         # Consolidate the deletion_masks
    1109         #delmask = control.CRLF | refs.delmask | u16delmask # | CT_callouts.CDATA_delimiters
    1110         #Not needed for xmlwf
    1111         #delmask = control.CRLF | refs.delmask  # | CT_callouts.CDATA_delimiters
    1112595       
    1113596        qname_stream =  callouts.ElemNames | callouts.AttNames
     
    1121604        return (CT_callouts, callouts, refs, error, lex, EOF_mask, name_check, name_start_check, control)
    1122605
    1123 def demo_parabix(u8data):
    1124 
    1125         lgth = len(u8data)
    1126        
    1127         (markup1, tags, refs, u16hi, u16lo, delmask, error, lex, u16delmask, EOF_mask) = parabix_parse(u8data)
    1128         bitutil.print_aligned_u8_byte_streams([('input data', u8data),
    1129                             ('input high nybbles', bitutil.high_nybble_stream(u8data)),
    1130                             ('input low nybbles', bitutil.low_nybble_stream(u8data)),
    1131                             ('CD_span', bitutil.bitstream2string(markup1.CD_span, lgth)),
    1132                             ('Ct_span', bitutil.bitstream2string(markup1.Ct_span, lgth)),
    1133                             ('PI_span', bitutil.bitstream2string(markup1.PI_span, lgth)),
    1134                             ('CtCDPI_mask', bitutil.bitstream2string(markup1.CtCDPI_mask, lgth)),
    1135                             ('entity refs', bitutil.bitstream2string(refs.GenRefs, lgth)),
    1136                             ('decimal char refs', bitutil.bitstream2string(refs.DecRefs, lgth)),
    1137                             ('hex char refs', bitutil.bitstream2string(refs.HexRefs, lgth)),
    1138                             ('element names', bitutil.bitstream2string(tags.ElemNames, lgth)),
    1139                             ('attribute names', bitutil.bitstream2string(tags.AttNames, lgth)),
    1140                             ('attribute values', bitutil.bitstream2string(tags.AttVals, lgth)),
    1141                             ('empty tag marks', bitutil.bitstream2string(tags.EmptyTagMarks, lgth)),
    1142                             ('end tags', bitutil.bitstream2string(tags.EndTags, lgth)),
    1143                             ('start/empty tags', bitutil.bitstream2string(tags.Tags, lgth)),
    1144                             ('delmask', bitutil.bitstream2string(delmask, lgth)),
    1145                             ('u16delmask', bitutil.bitstream2string(u16delmask, lgth)),
    1146                             ('errors', bitutil.bitstream2string(error, lgth+1))])
    1147 
    1148 def demo_u16delmask(u8data):
    1149 
    1150         u8len = len(u8data)
    1151        
    1152         # Transpose to parallel bit streams and prepare an EOF mask.
    1153         (bit, EOF_mask) = bitutil.transpose_streams(u8data)
    1154 
    1155         # Classify bytes for UTF-8 processing, whitespace and control
    1156         # processing and XML lexical analysis.
    1157         (u8, control, lex) = byteclass.classify_bytes(bit)
    1158 
    1159         # Validate UTF-8 multibyte sequences and determine the UTF-8 scope streams.
    1160         u8 = u8u16.validate_utf8(u8)   
    1161        
    1162         # Convert to UTF-16 bit streams.
    1163         (u16hi, u16lo, delmask) = u8u16.u8u16(u8, bit)
    1164        
    1165         # Inverse transpose
    1166         U16H = bitutil.filter_bytes(bitutil.inverse_transpose(u16hi, u8len), delmask)
    1167         U16L = bitutil.filter_bytes(bitutil.inverse_transpose(u16lo, u8len), delmask)
    1168        
    1169         # Construct UTF-16 data buffer
    1170         bytes = bitutil.merge_bytes(U16L, U16H)
    1171        
    1172         U16data = bytes.decode('utf16')
    1173        
    1174         bitutil.print_aligned_u8_byte_streams([('input data', u8data),
    1175                                 ('u16delmask', bitutil.bitstream2string(delmask, u8len)),               
    1176                                   ('errors', bitutil.bitstream2string(u8.error, u8len+1))])
    1177         return
    1178 
    1179 if __name__ == "__main__":
    1180         import doctest
    1181         doctest.testmod()
    1182        
    1183         if len(sys.argv) > 1:
    1184                 u8data = bitutil.readfile(sys.argv[1])
    1185 #               demo_validate_xmlchar(u8data)
    1186 #               demo_line_breaks(u8data)
    1187 #               demo_multiliterals(u8data)
    1188 #               demo_CtCDPI(u8data)
    1189 #               demo_refs(u8data)
    1190 #               demo_tags(u8data)
    1191 #               demo_validate_no_CD_end(u8data)         
    1192 #               demo_u16delmask(u8data)         
    1193                 demo_parabix(u8data)
    1194 #               demo_u16delmask(u8data)
    1195         else:
    1196                 print("Usage: python parabix2.py <file>")       
    1197                
    1198  
    1199        
    1200        
     606       
Note: See TracChangeset for help on using the changeset viewer.