Changeset 303


Ignore:
Timestamp:
Sep 16, 2009, 10:43:49 AM (10 years ago)
Author:
ksherdy
Message:

Updated all demo methods to explicitly encode u8data as utf-8.
Updated all demo methods to display utf-8 aligned byte streams.
Added UTF-16 deletion mask demo.
Added support to read from UTF-8 encoded source files in main.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • proto/parabix2/parabix2.py

    r292 r303  
    2424import u8u16
    2525
     26import sys
     27
    2628def validate_xmlchar(u8, control, lex, EOF_mask):
    2729        r"""Compute an error stream marking characters illegal in XML:
     
    3941
    4042def demo_validate_xmlchar(u8data):
    41         lgth = len(u8data)
    42         (bit, EOF_mask) = bitutil.transpose_streams(u8data)
    43         (u8, control, lex) = byteclass.classify_bytes(bit)
    44         bitutil.print_aligned_streams([('input high nybbles', bitutil.high_nybble_stream(u8data)),
     43        u8data = u8data.encode('utf-8')
     44        lgth = len(u8data)
     45        (bit, EOF_mask) = bitutil.transpose_u8_byte_streams(u8data)
     46        (u8, control, lex) = byteclass.classify_bytes(bit)
     47        bitutil.print_aligned_u8_byte_streams([('input high nybbles', bitutil.high_nybble_stream(u8data)),
    4548                              ('input low nybbles', bitutil.low_nybble_stream(u8data)),
    4649                              ('illegal XML chars', bitutil.bitstream2string(validate_xmlchar(u8, control, lex, EOF_mask), lgth+1))])
     
    6467
    6568def demo_line_breaks(u8data):
    66         lgth = len(u8data)
    67         (bit, EOF_mask) = bitutil.transpose_streams(u8data)
     69        u8data = u8data.encode('utf-8')
     70        lgth = len(u8data)
     71        (bit, EOF_mask) = bitutil.transpose_u8_byte_streams(u8data)
    6872        (u8, control, lex) = byteclass.classify_bytes(bit)
    6973        (control, bit) = normalize_line_breaks(control, bit)
    70         bitutil.print_aligned_streams([('input high nybbles', bitutil.high_nybble_stream(u8data)),
     74        bitutil.print_aligned_u8_byte_streams([('input high nybbles', bitutil.high_nybble_stream(u8data)),
    7175                              ('input low nybbles', bitutil.low_nybble_stream(u8data)),
    7276                              ('CR', bitutil.bitstream2string(control.CR, lgth)),
     
    101105        return lex
    102106
    103 
    104107def demo_multiliterals(u8data):
    105         lgth = len(u8data)
    106         (bit, EOF_mask) = bitutil.transpose_streams(u8data)
     108        u8data = u8data.encode('utf-8')
     109        lgth = len(u8data)
     110        (bit, EOF_mask) = bitutil.transpose_u8_byte_streams(u8data)
    107111        (u8, control, lex) = byteclass.classify_bytes(bit)
    108112        lex = add_multiliterals(lex)
    109         bitutil.print_aligned_streams([('input data', u8data),
     113        bitutil.print_aligned_u8_byte_streams([('input data', u8data),
    110114                              ('PI_start', bitutil.bitstream2string(lex.PI_start, lgth)),
    111115                              ('CtCD_start', bitutil.bitstream2string(lex.CtCD_start, lgth)),
     
    195199        return callouts
    196200
    197 
    198201def demo_CtCDPI(u8data):
    199         lgth = len(u8data)
    200         (bit, EOF_mask) = bitutil.transpose_streams(u8data)
     202        u8data = u8data.encode('utf-8')
     203        lgth = len(u8data)
     204        (bit, EOF_mask) = bitutil.transpose_u8_byte_streams(u8data)
    201205        (u8, control, lex) = byteclass.classify_bytes(bit)
    202206        lex = add_multiliterals(lex)
    203207        markup = parse_CtCDPI(lex, EOF_mask)
    204         bitutil.print_aligned_streams([('input data', u8data),
     208        bitutil.print_aligned_u8_byte_streams([('input data', u8data),
    205209                              ('CD_span', bitutil.bitstream2string(markup.CD_span, lgth)),
    206210                              ('Ct_span', bitutil.bitstream2string(markup.Ct_span, lgth)),
     
    274278        return CallOuts
    275279
    276 
    277280def demo_refs(u8data):
    278         lgth = len(u8data)
    279         (bit, EOF_mask) = bitutil.transpose_streams(u8data)
     281        u8data = u8data.encode('utf-8')
     282        lgth = len(u8data)
     283        (bit, EOF_mask) = bitutil.transpose_u8_byte_streams(u8data)
    280284        (u8, control, lex) = byteclass.classify_bytes(bit)
    281285        callouts = parse_refs(lex, 0)
    282         bitutil.print_aligned_streams([('input data', u8data),
     286        bitutil.print_aligned_u8_byte_streams([('input data', u8data),
    283287                              ('entity refs', bitutil.bitstream2string(callouts.GenRefs, lgth)),
    284288                              ('decimal char refs', bitutil.bitstream2string(callouts.DecRefs, lgth)),
     
    498502        return callouts
    499503
    500 
    501504def demo_tags(u8data):
    502         lgth = len(u8data)
    503         (bit, EOF_mask) = bitutil.transpose_streams(u8data)
     505        u8data = u8data.encode('utf-8')
     506        lgth = len(u8data)
     507        (bit, EOF_mask) = bitutil.transpose_u8_byte_streams(u8data)
    504508        (u8, control, lex) = byteclass.classify_bytes(bit)
    505509        lex = add_multiliterals(lex)
    506510        markup1 = parse_CtCDPI(lex, EOF_mask)
    507511        callouts = parse_tags(lex, markup1.CtCDPI_mask, EOF_mask)
    508         bitutil.print_aligned_streams([('input data', u8data),
     512        bitutil.print_aligned_u8_byte_streams([('input data', u8data),
    509513                              ('element names', bitutil.bitstream2string(callouts.ElemNames, lgth)),
    510514                              ('attribute names', bitutil.bitstream2string(callouts.AttNames, lgth)),
     
    528532        return lex.CD_end & ~(markup1.CtCDPI_mask | tags.Tags)
    529533
    530 
    531534def demo_validate_no_CD_end(u8data):
    532         lgth = len(u8data)
    533         (bit, EOF_mask) = bitutil.transpose_streams(u8data)
     535        u8data = u8data.encode('utf-8')
     536        lgth = len(u8data)
     537        (bit, EOF_mask) = bitutil.transpose_u8_byte_streams(u8data)
    534538        (u8, control, lex) = byteclass.classify_bytes(bit)
    535539        lex = add_multiliterals(lex)
     
    537541        tags = parse_tags(lex, markup1.CtCDPI_mask, EOF_mask)
    538542        error = validate_no_CD_end(lex, markup1, tags)
    539         bitutil.print_aligned_streams([('input data', u8data),
     543        bitutil.print_aligned_u8_byte_streams([('input data', u8data),
    540544                              ('CtCDPI_mask', bitutil.bitstream2string(markup1.CtCDPI_mask, lgth)),
    541545                              ('tags', bitutil.bitstream2string(tags.Tags, lgth)),
     
    545549
    546550def parabix_parse(u8data):
    547 
    548         # Transpose to parallel bit streams and prepare an EOF mask.
    549         (bit, EOF_mask) = bitutil.transpose_streams(u8data)
     551        # Transpose the UTF-8 byte stream to parallel bit streams and prepare an EOF mask.
     552        (bit, EOF_mask) = bitutil.transpose_u8_byte_streams(u8data)
    550553
    551554        # Classify bytes for UTF-8 processing, whitespace and control
     
    586589        delmask = control.CRLF | refs.delmask | u16delmask # | markup1.CDATA_delimiters
    587590
    588         return (markup1, tags, refs, u16hi, u16lo, delmask, error)
    589 
     591        return (markup1, tags, refs, u16hi, u16lo, delmask, error, lex, u16delmask, EOF_mask)
     592
     593def demo_u16delmask(u8data):   
     594        u8data = u8data.encode('utf-8')
     595        u8len = len(u8data)
     596       
     597        # Transpose the UTF-8 byte stream to parallel bit streams and prepare an EOF mask.
     598        (bit, EOF_mask) = bitutil.transpose_u8_byte_streams(u8data)
     599
     600        # Classify bytes for UTF-8 processing, whitespace and control
     601        # processing and XML lexical analysis.
     602        (u8, control, lex) = byteclass.classify_bytes(bit)
     603
     604        # Validate UTF-8 multibyte sequences and determine the UTF-8 scope streams.
     605        u8 = u8u16.validate_utf8(u8)   
     606       
     607        # Convert to UTF-16 bit streams.
     608        (u16hi, u16lo, delmask) = u8u16.u8u16(u8, bit)
     609       
     610        # Inverse transpose the UTF-16 bit streams to byte streams, then filter them with delmask
     611        u16H = bitutil.filter_bytes(bitutil.inverse_transpose(u16hi, u8len), delmask)
     612        u16L = bitutil.filter_bytes(bitutil.inverse_transpose(u16lo, u8len), delmask)
     613       
     614        # Construct UTF-16 data buffer
     615        u16bytes = bitutil.merge_bytes(u16L, u16H)
     616       
     617        # Decode the UTF-16 bytes to a Python Unicode string, then re-encode it as UTF-8 bytes for display
     618        bitutil.print_aligned_u8_byte_streams([('UTF-8 Data', u8data),
     619                                        ('UTF-16 Data', u16bytes.decode('utf16').encode('utf-8')),                                         
     620                                ('u16delmask', bitutil.bitstream2string(delmask, u8len)),               
     621                                    ('errors', bitutil.bitstream2string(u8.error, u8len+1))])
     622           
     623        return
    590624
    591625def demo_parabix(u8data):
    592         lgth = len(u8data)
    593         (markup1, tags, refs, u16hi, u16lo, delmask, error) = parabix_parse(u8data)
    594         bitutil.print_aligned_streams([('input data', u8data),
     626        u8data = u8data.encode('utf-8')
     627        lgth = len(u8data)
     628       
     629        (markup1, tags, refs, u16hi, u16lo, delmask, error, lex, u16delmask, EOF_mask) = parabix_parse(u8data)
     630        max_label = len('input high nybbles')
     631        bitutil.print_aligned_u8_byte_streams([('input data', u8data),
    595632                              ('input high nybbles', bitutil.high_nybble_stream(u8data)),
    596633                              ('input low nybbles', bitutil.low_nybble_stream(u8data)),
     
    609646                              ('start/empty tags', bitutil.bitstream2string(tags.Tags, lgth)),
    610647                              ('delmask', bitutil.bitstream2string(delmask, lgth)),
     648                              ('u16delmask', bitutil.bitstream2string(u16delmask, lgth)),
    611649                              ('errors', bitutil.bitstream2string(error, lgth+1))])
    612        
    613 
    614 
    615650
    616651if __name__ == "__main__":
    617     import doctest
    618     doctest.testmod()
     652        import doctest
     653        doctest.testmod()
     654       
     655        if len(sys.argv) > 1:
     656                # Read file as UTF-8 and convert to UTF-8 bytes
     657                u8data = bitutil.readfile(sys.argv[1], 'utf-8')
     658                #demo_validate_xmlchar(u8data)
     659                #demo_line_breaks(u8data)
     660                #demo_multiliterals(u8data)
     661                #demo_CtCDPI(u8data)
     662                #demo_refs(u8data)
     663                #demo_tags(u8data)
     664                #demo_validate_no_CD_end(u8data)               
     665                #demo_u16delmask(u8data)               
     666                demo_parabix(u8data)
     667               
     668        else:
     669                print("Usage: python parabix2.py <file>")       
     670               
     671 
     672       
     673       
Note: See TracChangeset for help on using the changeset viewer.