Changeset 279 for proto


Ignore:
Timestamp:
Aug 20, 2009, 12:27:42 PM (10 years ago)
Author:
cameron
Message:

Add parabix_parse main program.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • proto/parabix2/parabix2.py

    r278 r279  
    2424import u8u16
    2525
    26 def validate_xmlchar(u8, control, lex):
     26def validate_xmlchar(u8, control, lex, EOF_mask):
    2727        r"""Compute an error stream marking characters illegal in XML:
    2828        (1) Control characters in the range 0x00-0x1F except HT, LF, CR
     
    3535"""
    3636        EF_BF_pending = bitutil.Advance(bitutil.Advance(u8.xEF) & u8.xBF)
    37         return (EF_BF_pending & (u8.xBE | u8.xBF)) | (control.x00_x1F &~ lex.WS)
     37        return (EF_BF_pending & (u8.xBE | u8.xBF)) | (control.x00_x1F &~ lex.WS & EOF_mask)
    3838
    3939
     
    4444        bitutil.print_aligned_streams([('input high nybbles', bitutil.high_nybble_stream(u8data)),
    4545                              ('input low nybbles', bitutil.low_nybble_stream(u8data)),
    46                               ('illegal XML chars', bitutil.bitstream2string(validate_xmlchar(u8, control, lex), lgth))])
     46                              ('illegal XML chars', bitutil.bitstream2string(validate_xmlchar(u8, control, lex, EOF_mask), lgth+1))])
    4747
    4848def normalize_line_breaks(control, bit):
     
    120120        PI_mask = 0
    121121        CtCDPI_mask = 0
    122         Error = 0
     122        error = 0
    123123       
    124124def parse_CtCDPI(lex, EOF_mask):
     
    134134        PI_span    : __11111________________________________________________
    135135        CtCDPI_mask: __111111___111111111111111___1111111111111111111111111_
    136         Error      : ________________________________________________________
     136        error      : ________________________________________________________
    137137       
    138138        Comments are terminated by double-hyphen; immediately require closing ">".
     
    144144        PI_span    : _____________________________
    145145        CtCDPI_mask: __11111111111111111___1111111
    146         Error      : __________________1___________
     146        error      : __________________1___________
    147147
    148148"""
     
    190190       
    191191        callouts.CtCDPI_mask |= bitutil.Advance(CD_ends | Ct_ends | PI_ends) - CtCDPI_starts
    192         callouts.Error = Ct_ends & ~lex.RAngle
     192        callouts.error = Ct_ends & ~lex.RAngle
    193193        # If any of the Comment, CDATA or PI markups are unterminated, it is an error.
    194         callouts.Error |= callouts.CtCDPI_mask &~ EOF_mask
     194        callouts.error |= callouts.CtCDPI_mask &~ EOF_mask
    195195        return callouts
    196196
     
    207207                              ('PI_span', bitutil.bitstream2string(markup.PI_span, lgth)),
    208208                              ('CtCDPI_mask', bitutil.bitstream2string(markup.CtCDPI_mask, lgth)),
    209                               ('Error', bitutil.bitstream2string(markup.Error, lgth+1))])
     209                              ('error', bitutil.bitstream2string(markup.error, lgth+1))])
    210210
    211211
     
    217217        error = 0
    218218
    219 def ref_pass(lex, CtCDPI_mask):
     219def parse_refs(lex, CtCDPI_mask):
    220220        """Parse and call out all general and character references.
    221221        Mark all but the closing semicolon for deletion.
     
    225225        entity refs      : __11__________________
    226226        decimal char refs: _________11___________
     227        hex char refs    : _________________11___
    227228        ref delmask      : _111___1111___11111___
    228229        errors           : _______________________
     
    233234        entity refs      : ________________
    234235        decimal char refs: ________________
     236        hex char refs    : ________________
    235237        ref delmask      : _11________111__
    236238        errors           : ___1__________1__
     
    241243        entity refs      : ___111____________________111111111111
    242244        decimal char refs: __________111_________________________
     245        hex char refs    : ____________________11________________
    243246        ref delmask      : __1111__11111____11111___1111111111111
    244247        errors           : ______1______1________1_______________1
     
    276279        (bit, EOF_mask) = bitutil.transpose_streams(u8data)
    277280        (u8, control, lex) = byteclass.classify_bytes(bit)
    278         callouts = ref_pass(lex, 0)
     281        callouts = parse_refs(lex, 0)
    279282        bitutil.print_aligned_streams([('input data', u8data),
    280283                              ('entity refs', bitutil.bitstream2string(callouts.GenRefs, lgth)),
    281284                              ('decimal char refs', bitutil.bitstream2string(callouts.DecRefs, lgth)),
     285                              ('hex char refs', bitutil.bitstream2string(callouts.HexRefs, lgth)),
    282286                              ('ref delmask', bitutil.bitstream2string(callouts.delmask, lgth)),
    283287                              ('errors', bitutil.bitstream2string(callouts.error, lgth+1))])
     
    540544
    541545
     546def parabix_parse(u8data):
     547
     548        # Transpose to parallel bit streams and prepare an EOF mask.
     549        (bit, EOF_mask) = bitutil.transpose_streams(u8data)
     550
     551        # Classify bytes for UTF-8 processing, whitespace and control
     552        # processing and XML lexical analysis.
     553        (u8, control, lex) = byteclass.classify_bytes(bit)
     554
     555        # Validate UTF-8 multibyte sequences and determine the UTF-8 scope streams.
     556        u8 = u8u16.validate_utf8(u8)
     557
     558        # Rule out the illegal characters for XML.
     559        xmlchar_error = validate_xmlchar(u8, control, lex, EOF_mask)
     560
     561        # Find and normalize bare CR or CRLF combinations.
     562        (control, bit) = normalize_line_breaks(control, bit)
     563
     564        # Compute XML multiliterals such as <?, </, --, ]]>.
     565        lex = add_multiliterals(lex)
     566
     567        # Parse all comments, CDATA sections and processing instructions.
     568        markup1 = parse_CtCDPI(lex, EOF_mask)
     569
     570        # All remaining "<" must be tag start characters; parse tags.
     571        tags = parse_tags(lex, markup1.CtCDPI_mask, EOF_mask)
     572
     573        # All remaining "&" must be reference start characters; parse them.
     574        refs = parse_refs(lex, markup1.CtCDPI_mask)
     575
     576        # Ensure that ]]> does not occur outside of markup.
     577        CD_end_error = validate_no_CD_end(lex, markup1, tags)
     578
     579        # Convert to UTF-16 bit streams.
     580        (u16hi, u16lo, u16delmask) = u8u16.u8u16(u8, bit)
     581
     582        # Consolidate and check for errors
     583        error = u8.error | xmlchar_error | markup1.error | tags.error | CD_end_error | refs.error
     584
     585        # Consolidate the deletion_masks
     586        delmask = control.CRLF | refs.delmask | u16delmask # | markup1.CDATA_delimiters
     587
     588        return (markup1, tags, refs, u16hi, u16lo, delmask, error)
     589
     590
     591def demo_parabix(u8data):
     592        lgth = len(u8data)
     593        (markup1, tags, refs, u16hi, u16lo, delmask, error) = parabix_parse(u8data)
     594        bitutil.print_aligned_streams([('input data', u8data),
     595                              ('input high nybbles', bitutil.high_nybble_stream(u8data)),
     596                              ('input low nybbles', bitutil.low_nybble_stream(u8data)),
     597                              ('CD_span', bitutil.bitstream2string(markup1.CD_span, lgth)),
     598                              ('Ct_span', bitutil.bitstream2string(markup1.Ct_span, lgth)),
     599                              ('PI_span', bitutil.bitstream2string(markup1.PI_span, lgth)),
     600                              ('CtCDPI_mask', bitutil.bitstream2string(markup1.CtCDPI_mask, lgth)),
     601                              ('entity refs', bitutil.bitstream2string(refs.GenRefs, lgth)),
     602                              ('decimal char refs', bitutil.bitstream2string(refs.DecRefs, lgth)),
     603                              ('hex char refs', bitutil.bitstream2string(refs.HexRefs, lgth)),
     604                              ('element names', bitutil.bitstream2string(tags.ElemNames, lgth)),
     605                              ('attribute names', bitutil.bitstream2string(tags.AttNames, lgth)),
     606                              ('attribute values', bitutil.bitstream2string(tags.AttVals, lgth)),
     607                              ('empty tag marks', bitutil.bitstream2string(tags.EmptyTagMarks, lgth)),
     608                              ('end tags', bitutil.bitstream2string(tags.EndTags, lgth)),
     609                              ('start/empty tags', bitutil.bitstream2string(tags.Tags, lgth)),
     610                              ('delmask', bitutil.bitstream2string(delmask, lgth)),
     611                              ('errors', bitutil.bitstream2string(error, lgth+1))])
     612       
     613
     614
     615
    542616if __name__ == "__main__":
    543617    import doctest
Note: See TracChangeset for help on using the changeset viewer.