source: proto/parabix2/u8u16.py @ 577

Last change on this file since 577 was 440, checked in by cameron, 9 years ago

Move u8.scopeXX variable inits into byteclass.py

  • Property svn:executable set to *
File size: 7.0 KB
Line 
1#!/usr/bin/python
2
3# u8u16.py
4#
5# Python prototype implementation
6# Robert D. Cameron
7# revised August 20, 2009 - make consistent with parabix2 prototype
8#
9#----------------------------------------------------------------------------
10#
11# We use python's unlimited precision integers for unbounded bit streams.
12# This permits simple logical operations on the entire stream.
13# Assumption: bitstreams are little-endian (e.g., as on x86).
14#
15#----------------------------------------------------------------------------
16#
17
18import bitutil
19
20import byteclass
21
def ShiftBack(stream):
        """Return `stream` moved back by one bit position.

        Bit streams in this file are plain Python integers, so moving back
        one position is a right shift by one.
        """
        shifted = stream >> 1
        return shifted
24
def validate_utf8(u8):
        """Compute the UTF-8 error stream and store it in `u8.error`.

        `u8` is an object carrying the byte-classification bit streams as
        attributes (scopeNN, suffix, badprefix, xE0_scope, ...).  The object
        is modified in place and also returned.
        """
        # All positions that fall in the scope of some multibyte prefix.
        expected_suffix = (u8.scope22 |
                           u8.scope32 | u8.scope33 |
                           u8.scope42 | u8.scope43 | u8.scope44)

        # (scope stream, byte-range stream) pairs whose intersection is an
        # error.  Judging by the attribute names, each pair says "the byte in
        # the scope of this prefix must not lie in this range".
        forbidden_pairs = (
                (u8.xE0_scope, u8.x80_x9F),
                (u8.xED_scope, u8.xA0_xBF),
                (u8.xF0_scope, u8.x80_x8F),
                (u8.xF4_scope, u8.x90_xBF),
        )

        # C0-C1 and F5-FF are illegal
        errors = u8.badprefix
        for scope, byte_range in forbidden_pairs:
                errors |= scope & byte_range

        # A mismatch in either direction is an error: a position in scope
        # that is not a suffix, or a suffix that is not in any scope.
        errors |= expected_suffix ^ u8.suffix

        u8.error = errors
        return u8
40
41
42#
43# The following calculation of UTF-16 bit streams is consistent
44# with the original u8u16, calculating streams at u8scope42 and
45# u8scope44 positions.
46#
47
def u8_u16_old(u8, u8bit):
        """Compute UTF-16 bit streams, placing the first surrogate at scope42.

        `u8` supplies the classification streams (unibyte, prefix, scopeNN)
        and `u8bit` the eight UTF-8 bit streams, indexed 0..7 (bit 0 appears
        to be the most significant bit of each byte).

        Returns a tuple (u16hi, u16lo, delmask): two 8-element lists holding
        the bit streams of the high and low byte of each UTF-16 code unit,
        and a mask of the prefix, scope32 and scope43 positions (presumably
        the positions later dropped by bitutil.filter_bytes -- confirm
        against the caller).
        """
        advance = bitutil.Advance
        # Single-position advances of the UTF-8 bit streams needed below.
        adv = dict((i, advance(u8bit[i])) for i in range(2, 8))

        scope33 = u8.scope33
        scope42 = u8.scope42
        scope44 = u8.scope44
        last_scope = u8.scope22 | scope33 | scope44
        last_byte = u8.unibyte | last_scope

        # Low byte: bits 2-7 are copied from the last byte of each sequence;
        # bits 0-1 come from the byte before it (or bit 1 of a single byte).
        u16lo = [
                last_scope & adv[6],
                (u8.unibyte & u8bit[1]) | (last_scope & adv[7]),
        ]
        u16lo.extend(last_byte & u8bit[i] for i in range(2, 8))

        # High byte: bits 5-7 from one byte back, bits 0-4 only for 3-byte
        # sequences (bits 0-3 from two bytes back, bit 4 from one byte back).
        u16hi = [
                scope33 & advance(adv[4]),
                scope33 & advance(adv[5]),
                scope33 & advance(adv[6]),
                scope33 & advance(adv[7]),
                scope33 & adv[2],
                last_scope & adv[3],
                last_scope & adv[4],
                last_scope & adv[5],
        ]

        # Fixed surrogate marker bits: bits 0, 1, 3, 4 at both surrogate
        # positions, plus bit 5 at the second one (scope44).
        surrogate = scope42 | scope44
        for i in (0, 1, 3, 4):
                u16hi[i] |= surrogate
        u16hi[5] |= scope44

        # First surrogate, computed at scope42: a four-bit value obtained by
        # subtracting 1, with the borrow rippling from the low bit upward.
        minus1_lo1 = ~u8bit[3]                  # subtract 1
        minus1_lo0 = u8bit[2] ^ minus1_lo1      # borrow *
        borrow1 = minus1_lo1 & ~u8bit[2]
        minus1_hi7 = adv[7] ^ borrow1
        borrow2 = borrow1 & ~adv[7]
        minus1_hi6 = adv[6] ^ borrow2

        u16lo[1] |= scope42 & minus1_lo1
        u16lo[0] |= scope42 & minus1_lo0
        u16hi[7] |= scope42 & minus1_hi7
        u16hi[6] |= scope42 & minus1_hi6

        # Remaining low-byte bits of the first surrogate: four bits from the
        # scope42 byte itself, two pulled back from the following byte.
        for i in range(2, 6):
                u16lo[i] |= scope42 & u8bit[i + 2]
        u16lo[6] |= scope42 & ShiftBack(u8bit[2])
        u16lo[7] |= scope42 & ShiftBack(u8bit[3])

        delmask = u8.prefix | u8.scope32 | u8.scope43
        return (u16hi, u16lo, delmask)
99
100
101
102
103#
104# The following calculation of UTF-16 bit streams uses the
105# u8scope43 position rather than the u8scope42 position for
106# the bits of the first UTF-16 code unit of a surrogate pair.
107# This requires more shifting than with the use of u8scope42,
108# but has the advantage that all shifts are in the forward
109# direction only and can hence be implemented using addition
110# on little-endian architecture.
111#
112
def u8u16(u8, u8bit):
        """Compute UTF-16 bit streams, placing the first surrogate at scope43.

        `u8` supplies the classification streams (unibyte, prefix, scopeNN)
        and `u8bit` the eight UTF-8 bit streams, indexed 0..7 (bit 0 appears
        to be the most significant bit of each byte).  Only forward shifts
        (bitutil.Advance) are used.

        Returns a tuple (u16hi, u16lo, delmask): two 8-element lists holding
        the bit streams of the high and low byte of each UTF-16 code unit,
        and a mask of the prefix, scope32 and scope42 positions (presumably
        the positions later dropped by bitutil.filter_bytes -- confirm
        against the caller).
        """
        advance = bitutil.Advance
        # Single-position advances of the UTF-8 bit streams needed below.
        adv = dict((i, advance(u8bit[i])) for i in range(2, 8))

        scope33 = u8.scope33
        scope43 = u8.scope43
        scope44 = u8.scope44
        last_scope = u8.scope22 | scope33 | scope44
        last_byte = u8.unibyte | last_scope

        # Low byte: bits 2-7 are copied from the last byte of each sequence;
        # bits 0-1 come from the byte before it (or bit 1 of a single byte).
        u16lo = [
                last_scope & adv[6],
                (u8.unibyte & u8bit[1]) | (last_scope & adv[7]),
        ]
        u16lo.extend(last_byte & u8bit[i] for i in range(2, 8))

        # High byte: bits 5-7 from one byte back, bits 0-4 only for 3-byte
        # sequences (bits 0-3 from two bytes back, bit 4 from one byte back).
        u16hi = [
                scope33 & advance(adv[4]),
                scope33 & advance(adv[5]),
                scope33 & advance(adv[6]),
                scope33 & advance(adv[7]),
                scope33 & adv[2],
                last_scope & adv[3],
                last_scope & adv[4],
                last_scope & adv[5],
        ]

        # Fixed surrogate marker bits: bits 0, 1, 3, 4 at both surrogate
        # positions, plus bit 5 at the second one (scope44).
        surrogate = scope43 | scope44
        for i in (0, 1, 3, 4):
                u16hi[i] |= surrogate
        u16hi[5] |= scope44

        # First surrogate: the subtract-1 is still evaluated at the scope42
        # position, then each result is advanced once to land on scope43.
        minus1_lo1 = ~u8bit[3]                  # subtract 1
        minus1_lo0 = u8bit[2] ^ minus1_lo1      # borrow *
        borrow1 = minus1_lo1 & ~u8bit[2]
        minus1_hi7 = adv[7] ^ borrow1
        borrow2 = borrow1 & ~adv[7]
        minus1_hi6 = adv[6] ^ borrow2

        u16lo[1] |= scope43 & advance(minus1_lo1)
        u16lo[0] |= scope43 & advance(minus1_lo0)
        u16hi[7] |= scope43 & advance(minus1_hi7)
        u16hi[6] |= scope43 & advance(minus1_hi6)

        # Remaining low-byte bits of the first surrogate: four bits advanced
        # from the previous byte, two taken from the scope43 byte itself.
        for i in range(2, 6):
                u16lo[i] |= scope43 & adv[i + 2]
        u16lo[6] |= scope43 & u8bit[2]
        u16lo[7] |= scope43 & u8bit[3]

        delmask = u8.prefix | u8.scope32 | u8.scope42
        return (u16hi, u16lo, delmask)
166
167
168
169#
170# Messages to duplicate u8u16 error reporting.
171#
def IllegalSequenceMessage(pos):
        """Return the error text for an illegal UTF-8 sequence at `pos`."""
        template = "Illegal UTF-8 sequence at position %i in source.\n"
        return template % pos
174
def IncompleteSequenceMessage(pos):
        """Return the error text for input ending inside a sequence at `pos`."""
        template = "EOF with incomplete UTF-8 sequence at position %i in source.\n"
        return template % pos
177
178
179import sys
def main():
        """Transcode the UTF-8 file named on the command line to UTF-16.

        Usage: u8u16.py u8file [u16file]

        The result is written to u16file when exactly two arguments are
        given, otherwise to standard output.  When validation reports an
        error, a message naming the start of the offending sequence is
        written to stderr and the length handed to the output stage is cut
        back to that position.

        Exits with status 2 if no input file is given.
        """
        if len(sys.argv) < 2:
                sys.stderr.write("Usage: u8u16.py u8file [u16file]\n")
                # A bare `exit` is only a name lookup, not a call; terminate
                # for real instead of falling through to sys.argv[1].
                sys.exit(2)

        owns_outfile = len(sys.argv) == 3
        if owns_outfile:
                # Binary mode: the payload is raw UTF-16 bytes, which text
                # mode may alter through newline translation.
                outfile = open(sys.argv[2], "wb")
        else:
                outfile = sys.stdout

        try:
                u8data = bitutil.readfile(sys.argv[1])
                u8len = len(u8data)
                (u8bit, EOF_mask) = bitutil.transpose_streams(u8data)
                (u8, control, lex) = byteclass.classify_bytes(u8bit)
                u8 = validate_utf8(u8)
                if u8.error != 0:
                        err_pos = bitutil.count_leading_zeroes(u8.error)
                        at_EOF = err_pos == u8len
                        # Report the position of the prefix byte that starts
                        # the bad sequence: look back up to three bytes for a
                        # 2-, 3- or 4-byte prefix.
                        if err_pos >= 1 and ord(u8data[err_pos-1]) >= 0xC0:
                                err_pos -= 1
                        elif err_pos >= 2 and ord(u8data[err_pos-2]) >= 0xE0:
                                err_pos -= 2
                        elif err_pos >= 3 and ord(u8data[err_pos-3]) >= 0xF0:
                                err_pos -= 3
                        if at_EOF:
                                sys.stderr.write(IncompleteSequenceMessage(err_pos))
                        else:
                                sys.stderr.write(IllegalSequenceMessage(err_pos))
                        # Only convert the input that precedes the error.
                        u8len = err_pos
                (u16hi, u16lo, delmask) = u8u16(u8, u8bit)
                U16H = bitutil.filter_bytes(bitutil.inverse_transpose(u16hi, u8len), delmask)
                U16L = bitutil.filter_bytes(bitutil.inverse_transpose(u16lo, u8len), delmask)
                U16final = bitutil.merge_bytes(U16H, U16L)
                outfile.write(U16final)
        finally:
                # Close only a file opened here; standard output is merely
                # flushed so that it stays usable by the caller.
                if owns_outfile:
                        outfile.close()
                else:
                        outfile.flush()
210               
# Run the command-line driver only when executed as a script.
if __name__ == "__main__":
        main()
212
213
Note: See TracBrowser for help on using the repository browser.