source: proto/JSON/u8u16.py @ 2236

Last change on this file since 2236 was 684, checked in by ksherdy, 9 years ago

General reorganization. Addition of JSON String processing logic.

File size: 7.4 KB
Line 
1#!/usr/bin/python
2# -*- coding: utf-8 -*-
3
4# u8u16.py
5#
6# Python prototype implementation
7# Robert D. Cameron
8# Revised October 29, 2009 - Make consistent with character compiler UTF8 definition variable names.
9#
10#----------------------------------------------------------------------------
11#
12# We use python's unlimited precision integers for unbounded bit streams.
13# This permits simple logical operations on the entire stream.
14# Assumption: bitstreams are little-endian (e.g., as on x86).
15#
16#----------------------------------------------------------------------------
17#
18
19import bitutil
20
21import byteclass
22
def ShiftBack(stream):
        """Return `stream` shifted right by one bit position.

        The bit stream is held in a Python integer; the lowest-order bit is
        discarded by the shift.
        """
        shifted = stream >> 1
        return shifted
25
def validate_utf8(u8):
        """Attach scope streams and an error stream to `u8`, then return it.

        Reads the streams prefix2, prefix3, prefix4, badprefix, suffix, xE0,
        xED, xF0, xF4, x80_x9F, xA0_xBF, x80_x8F and x90_xBF from `u8`.
        Sets u8.scope22, scope32, scope33, scope42, scope43, scope44 and
        u8.error.  The same object that was passed in is returned.
        """
        # NOTE(review): bitutil.Advance is presumably a one-position forward
        # shift of the stream - confirm against bitutil.
        advance = bitutil.Advance

        # Each scope stream is the matching prefix stream advanced once per
        # byte of distance from the prefix.
        second_of_2 = advance(u8.prefix2)
        second_of_3 = advance(u8.prefix3)
        third_of_3 = advance(second_of_3)
        second_of_4 = advance(u8.prefix4)
        third_of_4 = advance(second_of_4)
        fourth_of_4 = advance(third_of_4)

        u8.scope22 = second_of_2
        u8.scope32 = second_of_3
        u8.scope33 = third_of_3
        u8.scope42 = second_of_4
        u8.scope43 = third_of_4
        u8.scope44 = fourth_of_4

        any_scope = (second_of_2 | third_of_3 | fourth_of_4
                     | second_of_3 | second_of_4 | third_of_4)

        # (lead byte stream, byte class flagged as an error when it
        # immediately follows that lead byte)
        restricted_pairs = (
                (u8.xE0, u8.x80_x9F),
                (u8.xED, u8.xA0_xBF),
                (u8.xF0, u8.x80_x8F),
                (u8.xF4, u8.x90_xBF),
        )

        # C0-C1 and F5-FF are illegal
        errors = u8.badprefix
        for lead, bad_follower in restricted_pairs:
                errors |= advance(lead) & bad_follower

        # A suffix byte must appear exactly where some scope expects one.
        errors |= any_scope ^ u8.suffix

        u8.error = errors
        return u8
49
#
# The following calculation of UTF-16 bit streams is consistent
# with the original u8u16, calculating streams at the u8scope42 and
# u8scope44 positions.
#
55
def u8_u16_old(u8, u8bit):
        """Compute UTF-16 bit streams, surrogates at scope42 and scope44.

        Arguments:
          u8    -- object supplying the streams unibyte, prefix, scope22,
                   scope32, scope33, scope42, scope43 and scope44.
          u8bit -- indexable collection of the eight UTF-8 bit streams.

        Returns a tuple (u16hi, u16lo, delmask): two eight-element lists of
        bit streams for the high and low UTF-16 bytes, and the stream
        u8.prefix | u8.scope32 | u8.scope43.
        """
        # NOTE(review): bitutil.Advance is presumably a one-position forward
        # shift, so prev[k] lines bit k of the preceding byte up with the
        # current position and prev2[k] does the same for the byte before
        # that - confirm against bitutil.
        advance = bitutil.Advance
        prev = dict((k, advance(u8bit[k])) for k in range(2, 8))
        prev2 = dict((k, advance(prev[k])) for k in range(4, 8))

        hi = [0] * 8
        lo = [0] * 8

        last_scope = u8.scope22 | u8.scope33 | u8.scope44
        last_byte = u8.unibyte | last_scope
        third_of_3 = u8.scope33
        second_of_4 = u8.scope42

        # Low byte: bits 2-7 are taken from the final byte of each sequence;
        # bits 0-1 come from the preceding byte (bit 1 also from a unibyte).
        for k in range(2, 8):
                lo[k] = last_byte & u8bit[k]
        lo[1] = (u8.unibyte & u8bit[1]) | (last_scope & prev[7])
        lo[0] = last_scope & prev[6]

        # High byte: bits 5-7 from the preceding byte, bits 0-3 from two
        # bytes back (three-byte sequences only), bit 4 from the preceding
        # byte of a three-byte sequence.
        for k in (5, 6, 7):
                hi[k] = last_scope & prev[k - 2]
        for k in range(4):
                hi[k] = third_of_3 & prev2[k + 4]
        hi[4] = third_of_3 & prev[2]

        # Fixed bits set at both surrogate positions; bit 5 only at scope44.
        surrogate = second_of_4 | u8.scope44
        for k in (0, 1, 3, 4):
                hi[k] |= surrogate
        hi[5] |= u8.scope44

        # Ripple-borrow decrement by one across four bits, lowest first:
        # this byte's bit 3, this byte's bit 2, then prev[7] and prev[6].
        dec_bit3 = ~u8bit[3]                    # subtract 1
        dec_bit2 = u8bit[2] ^ dec_bit3          # borrow out of bit 3
        borrow_a = dec_bit3 & ~u8bit[2]
        dec_prev7 = prev[7] ^ borrow_a
        borrow_b = borrow_a & ~prev[7]
        dec_prev6 = prev[6] ^ borrow_b

        lo[1] |= second_of_4 & dec_bit3
        lo[0] |= second_of_4 & dec_bit2
        hi[7] |= second_of_4 & dec_prev7
        hi[6] |= second_of_4 & dec_prev6

        # Remaining low-byte bits at scope42: four from this byte, two from
        # the following byte (reached with a one-position right shift).
        for k in range(2, 6):
                lo[k] |= second_of_4 & u8bit[k + 2]
        lo[6] |= second_of_4 & (u8bit[2] >> 1)
        lo[7] |= second_of_4 & (u8bit[3] >> 1)

        delmask = u8.prefix | u8.scope32 | u8.scope43
        return (hi, lo, delmask)
107
108
109
110
#
# The following calculation of UTF-16 bit streams uses the
# u8scope43 position rather than the u8scope42 position for
# the bits of the first UTF-16 code unit of a surrogate pair.
# This requires more shifting than with the use of u8scope42,
# but has the advantage that all shifts are in the forward
# direction only and can hence be implemented using addition
# on a little-endian architecture.
#
120
def u8u16(u8, u8bit):
        """Compute UTF-16 bit streams, surrogates at scope43 and scope44.

        The first code unit of a surrogate pair is produced at the
        u8.scope43 position, so every shift used here is a forward
        bitutil.Advance.

        Arguments:
          u8    -- object supplying the streams unibyte, prefix, scope22,
                   scope32, scope33, scope42, scope43 and scope44.
          u8bit -- indexable collection of the eight UTF-8 bit streams.

        Returns a tuple (u16hi, u16lo, delmask): two eight-element lists of
        bit streams for the high and low UTF-16 bytes, and the stream
        u8.prefix | u8.scope32 | u8.scope42.
        """
        # NOTE(review): bitutil.Advance is presumably a one-position forward
        # shift, so prev[k] lines bit k of the preceding byte up with the
        # current position and prev2[k] does the same for the byte before
        # that - confirm against bitutil.
        advance = bitutil.Advance
        prev = dict((k, advance(u8bit[k])) for k in range(2, 8))
        prev2 = dict((k, advance(prev[k])) for k in range(4, 8))

        hi = [0] * 8
        lo = [0] * 8

        last_scope = u8.scope22 | u8.scope33 | u8.scope44
        last_byte = u8.unibyte | last_scope
        third_of_3 = u8.scope33
        third_of_4 = u8.scope43

        # Low byte: bits 2-7 are taken from the final byte of each sequence;
        # bits 0-1 come from the preceding byte (bit 1 also from a unibyte).
        for k in range(2, 8):
                lo[k] = last_byte & u8bit[k]
        lo[1] = (u8.unibyte & u8bit[1]) | (last_scope & prev[7])
        lo[0] = last_scope & prev[6]

        # High byte: bits 5-7 from the preceding byte, bits 0-3 from two
        # bytes back (three-byte sequences only), bit 4 from the preceding
        # byte of a three-byte sequence.
        for k in (5, 6, 7):
                hi[k] = last_scope & prev[k - 2]
        for k in range(4):
                hi[k] = third_of_3 & prev2[k + 4]
        hi[4] = third_of_3 & prev[2]

        # Fixed bits set at both surrogate positions; bit 5 only at scope44.
        surrogate = third_of_4 | u8.scope44
        for k in (0, 1, 3, 4):
                hi[k] |= surrogate
        hi[5] |= u8.scope44

        # Ripple-borrow decrement by one across four bits, lowest first:
        # bit 3, bit 2, then prev[7] and prev[6].  Each result is advanced
        # once more before being masked with scope43.
        dec_bit3 = ~u8bit[3]                    # subtract 1
        dec_bit2 = u8bit[2] ^ dec_bit3          # borrow out of bit 3
        borrow_a = dec_bit3 & ~u8bit[2]
        dec_prev7 = prev[7] ^ borrow_a
        borrow_b = borrow_a & ~prev[7]
        dec_prev6 = prev[6] ^ borrow_b

        lo[1] |= third_of_4 & advance(dec_bit3)
        lo[0] |= third_of_4 & advance(dec_bit2)
        hi[7] |= third_of_4 & advance(dec_prev7)
        hi[6] |= third_of_4 & advance(dec_prev6)

        # Remaining low-byte bits at scope43: four from the preceding byte,
        # two from this byte.
        for k in range(2, 6):
                lo[k] |= third_of_4 & prev[k + 2]
        lo[6] |= third_of_4 & u8bit[2]
        lo[7] |= third_of_4 & u8bit[3]

        delmask = u8.prefix | u8.scope32 | u8.scope42
        return (hi, lo, delmask)
174
175
176
#
# Messages to duplicate u8u16 error reporting.
#
def IllegalSequenceMessage(pos):
        """Return the diagnostic line for an illegal UTF-8 sequence at `pos`."""
        template = "Illegal UTF-8 sequence at position %i in source.\n"
        return template % pos
182
def IncompleteSequenceMessage(pos):
        """Return the diagnostic line for input ending inside a sequence at `pos`."""
        template = "EOF with incomplete UTF-8 sequence at position %i in source.\n"
        return template % pos
185
186
187import sys
def main():
        """Command-line driver: transcode a UTF-8 file to UTF-16.

        Usage: u8u16.py u8file [u16file]

        Reads the file named by sys.argv[1], runs validate_utf8 and u8u16
        over its bit streams, and writes the merged result to sys.argv[2]
        when exactly two arguments are given, otherwise to standard output.
        When the error stream is nonzero a diagnostic is written to
        standard error and the length passed to bitutil.inverse_transpose
        is cut back to the reported position.

        Exits with status 1, after printing the usage line, when no input
        file is named.
        """
        if len(sys.argv) < 2:
                sys.stderr.write("Usage: u8u16.py u8file [u16file]\n")
                # A bare `exit` is only a name lookup and does not stop the
                # program; actually terminate here.
                sys.exit(1)

        writes_to_file = len(sys.argv) == 3
        if writes_to_file:
                # NOTE(review): the payload looks like raw UTF-16 bytes, so
                # binary mode may be more appropriate - confirm what
                # bitutil.merge_bytes returns before changing the mode.
                outfile = open(sys.argv[2], "w")
        else:
                outfile = sys.stdout

        try:
                u8data = bitutil.readfile(sys.argv[1])
                u8len = len(u8data)
                (u8bit, EOF_mask) = bitutil.transpose_streams(u8data)
                (u8, control, lex) = byteclass.classify_bytes(u8bit)
                u8 = validate_utf8(u8)
                if u8.error != 0:
                        err_pos = bitutil.count_leading_zeroes(u8.error)
                        at_EOF = err_pos == len(u8data)
                        # Back the reported position up to the lead byte of
                        # the sequence that contains the error, if any.
                        if (err_pos >= 1) and ord(u8data[err_pos-1]) >= 0xC0: err_pos -= 1
                        elif err_pos >= 2 and ord(u8data[err_pos-2]) >= 0xE0: err_pos -= 2
                        elif err_pos >= 3 and ord(u8data[err_pos-3]) >= 0xF0: err_pos -= 3
                        if at_EOF:
                                sys.stderr.write(IncompleteSequenceMessage(err_pos))
                        else:
                                sys.stderr.write(IllegalSequenceMessage(err_pos))
                        u8len = err_pos
                (u16hi, u16lo, delmask) = u8u16(u8, u8bit)
                U16H = bitutil.filter_bytes(bitutil.inverse_transpose(u16hi, u8len), delmask)
                U16L = bitutil.filter_bytes(bitutil.inverse_transpose(u16lo, u8len), delmask)
                U16final = bitutil.merge_bytes(U16H, U16L)
                outfile.write(U16final)
        finally:
                # Close only a file opened here; standard output is left
                # open for the interpreter to flush at exit.
                if writes_to_file:
                        outfile.close()
218               
# Run the command-line driver only when executed as a script.
if __name__ == "__main__":
        main()
220
221
Note: See TracBrowser for help on using the repository browser.