source: proto/CSV/csv.py @ 491

Last change on this file since 491 was 372, checked in by lindanl, 10 years ago

CSV parsing prototype with backslash escape convention

File size: 2.4 KB
Line 
1#
2# csv.py
3#
4#
5# Dan Lin
6# March 4, 2010
7#
8#----------------------------------------------------------------------------
9#
10# We use python's unlimited precision integers for unbounded bit streams.
11# This permits simple logical operations on the entire stream.
12# Assumption: bitstreams are little-endian (e.g., as on x86).
13#
14#----------------------------------------------------------------------------
15#
16
17
18import bitutil
19
20import csvclass
21       
22import sys
23
def simd_const_4(hexdigit, EOF_mask):
    """Return a stream-wide constant made by repeating one hex digit.

    hexdigit -- one-character hexadecimal digit string (this file passes
                'a' and '5') that is repeated to form the pattern.
    EOF_mask -- integer mask; the repeated pattern is ANDed with it before
                being returned.

    The repeat count is derived from bitutil.count_leading_zeroes(~EOF_mask),
    which is presumably the stream length in bits (TODO confirm against
    bitutil); one extra digit is added so a partial final nibble is covered.
    """
    # Floor division keeps the repeat count an int. With true division
    # (Python 3 "/") the string repetition below would raise TypeError;
    # on Python 2 ints "//" gives the same value that "/" did.
    lgth = bitutil.count_leading_zeroes(~EOF_mask) // 4
    return int(hexdigit * (lgth + 1), 16) & EOF_mask
27
def parse_escape(lex, EOF_mask):
    """Return the stream of positions escaped by a preceding backslash run.

    lex      -- object exposing a BackSlash bit stream (an integer).
    EOF_mask -- integer mask passed on to simd_const_4 to bound the
                alternating-bit constants.

    A position is marked when it directly follows a run of backslashes of
    odd length. This reading assumes bitutil.Advance moves a stream forward
    by one position (TODO confirm against bitutil).
    """
    backslash = lex.BackSlash
    odd_positions = simd_const_4('a', EOF_mask)    # hex a = 1010: bits 1, 3, ...
    even_positions = simd_const_4('5', EOF_mask)   # hex 5 = 0101: bits 0, 2, ...

    # First backslash of every run: a backslash not preceded by a backslash.
    run_start = backslash & ~bitutil.Advance(backslash)

    # Adding a run's start bit to the run carries through it and leaves a
    # single bit just past the run; masking with ~backslash keeps only
    # those landing bits.
    after_even_runs = ((run_start & even_positions) + backslash) & ~backslash
    after_odd_runs = ((run_start & odd_positions) + backslash) & ~backslash

    # A run has odd length exactly when its start and the position after it
    # have opposite parity.
    return (after_even_runs & odd_positions) | (after_odd_runs & even_positions)
43       
def parse_quote(quote, EOF_mask):
    """Return quote_end - quote_start for alternating quote marks.

    quote    -- bit stream (integer) with a bit at each quote character.
    EOF_mask -- integer mask; the cursor is ANDed with it at the end of
                each round, which is what lets the loop terminate.

    A cursor starting at bit 0 is moved alternately to a "start" stop and an
    "end" stop using bitutil.ScanThru over the non-quote positions. If
    ScanThru stops at the first position outside the scanned class
    (presumably; TODO confirm against bitutil), the difference of the two
    accumulated streams sets every bit from each opening quote up to, but
    not including, its closing quote.
    """
    non_quote = (~quote) & EOF_mask   # loop-invariant scan class
    opens = 0
    closes = 0
    marker = 1
    while marker:
        # Move to the next stop and record it as an opening position.
        marker = bitutil.ScanThru(marker, non_quote)
        opens |= marker
        # Step past it, move to the following stop and record it as closing.
        marker = bitutil.ScanThru(bitutil.Advance(marker), non_quote)
        closes |= marker
        # Step past the closing stop; masking clears the cursor once it has
        # left the positions covered by EOF_mask.
        marker = bitutil.Advance(marker) & EOF_mask
    return closes - opens
57       
58       
59       
def csv_parse(u8data):
    """Compute the CSV marker streams for u8data.

    Returns the tuple (escape, quote, eol, delim):
      escape -- result of parse_escape on the classified input
      quote  -- DQuote positions that are not in the escape stream
      eol    -- CR or LF positions outside the quote mask
      delim  -- Comma positions outside the quote mask
    """
    # Transpose to parallel bit streams and prepare an EOF mask.
    bit, EOF_mask = bitutil.transpose_streams(u8data)
    lex = csvclass.classify_bytes(bit)

    escape = parse_escape(lex, EOF_mask)
    quote = lex.DQuote & ~escape

    outside_quotes = ~parse_quote(quote, EOF_mask)

    # NOTE(review): if CR and LF are per-byte class streams they can never
    # both be set at one position, making "& ~lex.CR" a no-op (a CRLF pair
    # would then yield two eol marks). Possibly an Advance of CR was
    # intended -- confirm before changing.
    eol = (lex.CR & outside_quotes) | (lex.LF & ~lex.CR & outside_quotes)
    delim = lex.Comma & outside_quotes
    return (escape, quote, eol, delim)
70
71
def demo_csv(u8data):
    """Parse u8data with csv_parse and print each marker stream under it.

    Every stream returned by csv_parse is rendered with
    bitutil.bitstream2string at the input's length and passed, together
    with the input itself, to bitutil.print_aligned_u8_byte_streams.
    """
    width = len(u8data)
    escape, quote, eol, delim = csv_parse(u8data)

    rows = [('input data', u8data)]
    labelled_streams = (('escape', escape),
                        ('quote', quote),
                        ('eol', eol),
                        ('delim', delim))
    for label, stream in labelled_streams:
        rows.append((label, bitutil.bitstream2string(stream, width)))

    bitutil.print_aligned_u8_byte_streams(rows)
82
83
if __name__ == "__main__":
    # Run any doctests in this module before doing real work.
    import doctest
    doctest.testmod()

    # With no file argument just show the usage line; otherwise read the
    # named file and print its marker streams.
    if len(sys.argv) <= 1:
        print("Usage: python csv.py <file>")
    else:
        u8data = bitutil.readfile(sys.argv[1])
        demo_csv(u8data)
94               
95 
96       
97       
Note: See TracBrowser for help on using the repository browser.