Changeset 315 for proto/parabix2


Ignore:
Timestamp: Sep 25, 2009, 6:05:17 PM (10 years ago)
Author: ksherdy
Message: Revert explicit conversion to UTF-8 on file read.

File:
1 edited

Legend: Unmodified / Added / Removed
  • proto/parabix2/parabix2.py

    r304 r315  
    4242def demo_validate_xmlchar(u8data):
    4343        lgth = len(u8data)
    44         (bit, EOF_mask) = bitutil.transpose_u8_byte_streams(u8data)
    45         (u8, control, lex) = byteclass.classify_bytes(bit)
    46         bitutil.print_aligned_u8_byte_streams([('input high nybbles', bitutil.high_nybble_stream(u8data)),
     44        (bit, EOF_mask) = bitutil.transpose_streams(u8data)
     45        (u8, control, lex) = byteclass.classify_bytes(bit)
     46        bitutil.print_aligned_streams([('input high nybbles', bitutil.high_nybble_stream(u8data)),
    4747                              ('input low nybbles', bitutil.low_nybble_stream(u8data)),
    4848                              ('illegal XML chars', bitutil.bitstream2string(validate_xmlchar(u8, control, lex, EOF_mask), lgth+1))])
     
    6666
    6767def demo_line_breaks(u8data):
    68         u8data = u8data.encode('utf-8')
    69         lgth = len(u8data)
    70         (bit, EOF_mask) = bitutil.transpose_u8_byte_streams(u8data)
     68        lgth = len(u8data)
     69        (bit, EOF_mask) = bitutil.transpose_streams(u8data)
    7170        (u8, control, lex) = byteclass.classify_bytes(bit)
    7271        (control, bit) = normalize_line_breaks(control, bit)
     
    105104
    106105def demo_multiliterals(u8data):
    107         u8data = u8data.encode('utf-8')
    108         lgth = len(u8data)
    109         (bit, EOF_mask) = bitutil.transpose_u8_byte_streams(u8data)
     106        lgth = len(u8data)
     107        (bit, EOF_mask) = bitutil.transpose_streams(u8data)
    110108        (u8, control, lex) = byteclass.classify_bytes(bit)
    111109        lex = add_multiliterals(lex)
     
    199197
    200198def demo_CtCDPI(u8data):
    201         u8data = u8data.encode('utf-8')
    202         lgth = len(u8data)
    203         (bit, EOF_mask) = bitutil.transpose_u8_byte_streams(u8data)
     199        lgth = len(u8data)
     200        (bit, EOF_mask) = bitutil.transpose_streams(u8data)
    204201        (u8, control, lex) = byteclass.classify_bytes(bit)
    205202        lex = add_multiliterals(lex)
     
    278275
    279276def demo_refs(u8data):
    280         u8data = u8data.encode('utf-8')
    281         lgth = len(u8data)
    282         (bit, EOF_mask) = bitutil.transpose_u8_byte_streams(u8data)
     277        lgth = len(u8data)
     278        (bit, EOF_mask) = bitutil.transpose_streams(u8data)
    283279        (u8, control, lex) = byteclass.classify_bytes(bit)
    284280        callouts = parse_refs(lex, 0)
     
    299295        EndTags = 0
    300296        error = 0
     297       
     298        # POTENTIAL ADDITIONAL FIELDS
     299        # StartTagEnds = 0
     300        # EmptyTagEnds = 0     
     301        # EndTagEnds = 0
    301302
    302303def parse_tags(lex, CtCDPI_mask, EOF_mask):
     
    498499        callouts.EndTags = EndTagEnds - EndTagSeconds
    499500        callouts.error = ParseError
     501
     502        # POTENTIAL ADDITIONAL FIELDS
     503        # callouts.StartTagEnds = STagEnds
     504        # callouts.EmptyTagEnds = bitutil.Advance(callouts.EmptyTagMarks)
     505        # callouts.EndTagEnds = EndTagEnds
    500506       
    501507        return callouts
    502508
    503509def demo_tags(u8data):
    504         u8data = u8data.encode('utf-8')
    505         lgth = len(u8data)
    506         (bit, EOF_mask) = bitutil.transpose_u8_byte_streams(u8data)
     510        lgth = len(u8data)
     511        (bit, EOF_mask) = bitutil.transpose_streams(u8data)
    507512        (u8, control, lex) = byteclass.classify_bytes(bit)
    508513        lex = add_multiliterals(lex)
     
    532537
    533538def demo_validate_no_CD_end(u8data):
    534         u8data = u8data.encode('utf-8')
    535         lgth = len(u8data)
    536         (bit, EOF_mask) = bitutil.transpose_u8_byte_streams(u8data)
     539        lgth = len(u8data)
     540        (bit, EOF_mask) = bitutil.transpose_streams(u8data)
    537541        (u8, control, lex) = byteclass.classify_bytes(bit)
    538542        lex = add_multiliterals(lex)
     
    548552
    549553def parabix_parse(u8data):
    550         # transpose_u8_byte_streams to parallel bit streams and prepare an EOF mask.
    551         (bit, EOF_mask) = bitutil.transpose_u8_byte_streams(u8data)
     554        # Transpose to parallel bit streams and prepare an EOF mask.
     555        (bit, EOF_mask) = bitutil.transpose_streams(u8data)
    552556
    553557        # Classify bytes for UTF-8 processing, whitespace and control
     
    590594        return (markup1, tags, refs, u16hi, u16lo, delmask, error, lex, u16delmask, EOF_mask)
    591595
    592 def demo_u16delmask(u8data):   
    593         u8data = u8data.encode('utf-8')
    594         u8len = len(u8data)
    595        
    596         # transpose_u8_byte_streams to parallel bit streams and prepare an EOF mask.
    597         (bit, EOF_mask) = bitutil.transpose_u8_byte_streams(u8data)
    598 
    599         # Classify bytes for UTF-8 processing, whitespace and control
    600         # processing and XML lexical analysis.
    601         (u8, control, lex) = byteclass.classify_bytes(bit)
    602 
    603         # Validate UTF-8 multibyte sequences and determine the UTF-8 scope streams.
    604         u8 = u8u16.validate_utf8(u8)   
    605        
    606         # Convert to UTF-16 bit streams.
    607         (u16hi, u16lo, delmask) = u8u16.u8u16(u8, bit)
    608        
    609         # Inverse transpose_u8_byte_streams
    610         u16H = bitutil.filter_bytes(bitutil.inverse_transpose(u16hi, u8len), delmask)
    611         u16L = bitutil.filter_bytes(bitutil.inverse_transpose(u16lo, u8len), delmask)
    612        
    613         # Construct UTF-16 data buffer
    614         u16bytes = bitutil.merge_bytes(u16L, u16H)
    615        
    616         # Convert UTF-16 bytes to Python Unicode string to UTF-8 bytes
    617         bitutil.print_aligned_u8_byte_streams([('UTF-8 Data', u8data),
    618                                         ('UTF-16 Data', u16bytes.decode('utf16').encode('utf-8')),                                         
    619                                 ('u16delmask', bitutil.bitstream2string(delmask, u8len)),               
    620                                     ('errors', bitutil.bitstream2string(u8.error, u8len+1))])
    621            
    622         return
    623596
    624597def demo_parabix(u8data):
    625         u8data = u8data.encode('utf-8')
     598
    626599        lgth = len(u8data)
    627600       
    628601        (markup1, tags, refs, u16hi, u16lo, delmask, error, lex, u16delmask, EOF_mask) = parabix_parse(u8data)
    629         max_label = len('input high nybbles')
    630602        bitutil.print_aligned_u8_byte_streams([('input data', u8data),
    631603                              ('input high nybbles', bitutil.high_nybble_stream(u8data)),
     
    648620                              ('errors', bitutil.bitstream2string(error, lgth+1))])
    649621
     622def demo_u16delmask(u8data):
     623
     624        u8len = len(u8data)
     625       
     626        # Transpose to parallel bit streams and prepare an EOF mask.
     627        (bit, EOF_mask) = bitutil.transpose_streams(u8data)
     628
     629        # Classify bytes for UTF-8 processing, whitespace and control
     630        # processing and XML lexical analysis.
     631        (u8, control, lex) = byteclass.classify_bytes(bit)
     632
     633        # Validate UTF-8 multibyte sequences and determine the UTF-8 scope streams.
     634        u8 = u8u16.validate_utf8(u8)   
     635       
     636        # Convert to UTF-16 bit streams.
     637        (u16hi, u16lo, delmask) = u8u16.u8u16(u8, bit)
     638       
     639        # Inverse transpose
     640        U16H = bitutil.filter_bytes(bitutil.inverse_transpose(u16hi, u8len), delmask)
     641        U16L = bitutil.filter_bytes(bitutil.inverse_transpose(u16lo, u8len), delmask)
     642       
     643        # Construct UTF-16 data buffer
     644        bytes = bitutil.merge_bytes(U16L, U16H)
     645       
     646        U16data = bytes.decode('utf16')
     647       
     648        bitutil.print_aligned_u8_byte_streams([('input data', u8data),
     649                                ('u16delmask', bitutil.bitstream2string(delmask, u8len)),               
     650                                    ('errors', bitutil.bitstream2string(u8.error, u8len+1))])
     651        return
     652
    650653if __name__ == "__main__":
    651654        import doctest
     
    653656       
    654657        if len(sys.argv) > 1:
    655                 # Read file as UTF-8 and convert to UTF-8 bytes
    656                 u8data = bitutil.readfile(sys.argv[1], 'utf-8')
    657                 #demo_validate_xmlchar(u8data)
    658                 #demo_line_breaks(u8data)
    659                 #demo_multiliterals(u8data)
    660                 #demo_CtCDPI(u8data)
    661                 #demo_refs(u8data)
    662                 #demo_tags(u8data)
    663                 #demo_validate_no_CD_end(u8data)               
    664                 #demo_u16delmask(u8data)               
     658                u8data = bitutil.readfile(sys.argv[1])
     659#               demo_validate_xmlchar(u8data)
     660#               demo_line_breaks(u8data)
     661#               demo_multiliterals(u8data)
     662#               demo_CtCDPI(u8data)
     663#               demo_refs(u8data)
     664#               demo_tags(u8data)
     665#               demo_validate_no_CD_end(u8data)         
     666#               demo_u16delmask(u8data)         
    665667                demo_parabix(u8data)
    666                
     668#               demo_u16delmask(u8data)
    667669        else:
    668670                print("Usage: python parabix2.py <file>")       
Note: See TracChangeset for help on using the changeset viewer.