source: proto/parabix2/u8u16.py @ 310

Last change on this file since 310 was 310, checked in by cameron, 10 years ago

Use raw byte readfile; fix advance_bit7 temp.

  • Property svn:executable set to *
File size: 7.4 KB
Line 
1#!/usr/bin/python
2
3# u8u16.py
4#
5# Python prototype implementation
6# Robert D. Cameron
7# revised August 20, 2009 - make consistent with parabix2 prototype
8#
9#----------------------------------------------------------------------------
10#
11# We use python's unlimited precision integers for unbounded bit streams.
12# This permits simple logical operations on the entire stream.
13# Assumption: bitstreams are little-endian (e.g., as on x86).
14#
15#----------------------------------------------------------------------------
16#
17
18import bitutil
19
20import byteclass
21
def readfile(filename):
    """Return the complete contents of *filename* as raw bytes.

    The file is opened in binary mode so that the UTF-8 input reaches the
    transcoder exactly as stored on disk: no newline translation and no
    implicit text decoding is applied.  The ``with`` block guarantees the
    handle is closed even if the read fails.
    """
    with open(filename, 'rb') as f:
        return f.read()
27
28
def ShiftBack(stream):
    """Return *stream* moved one bit position backward (right shift by 1).

    With the little-endian bit stream convention stated in the file header,
    this lines the bit for each position up with the position before it.
    """
    shifted = stream >> 1
    return shifted
31
def validate_utf8(u8):
    """Compute the scope streams and the error stream for UTF-8 input.

    Reads the byte-class streams stored on *u8* (prefix2/3/4, badprefix,
    suffix, xE0, xED, xF0, xF4 and the suffix-range classes), stores the
    derived scope22 .. scope44 streams and the combined ``error`` stream
    back onto *u8*, and returns the same object.

    NOTE(review): bitutil.Advance is assumed to move a stream one byte
    position forward, so each scope stream marks the positions where a
    suffix byte is expected -- confirm against bitutil.
    """
    adv = bitutil.Advance

    # Positions following a 2-, 3- or 4-byte prefix.
    u8.scope22 = adv(u8.prefix2)
    u8.scope32 = adv(u8.prefix3)
    u8.scope33 = adv(u8.scope32)
    u8.scope42 = adv(u8.prefix4)
    u8.scope43 = adv(u8.scope42)
    u8.scope44 = adv(u8.scope43)

    final_scopes = u8.scope22 | u8.scope33 | u8.scope44
    all_scopes = final_scopes | u8.scope32 | u8.scope42 | u8.scope43

    # C0-C1 and F5-FF are illegal
    errors = u8.badprefix

    # Second-byte ranges that are not allowed after particular prefixes.
    errors = errors | (adv(u8.xE0) & u8.x80_x9F)
    errors = errors | (adv(u8.xED) & u8.xA0_xBF)
    errors = errors | (adv(u8.xF0) & u8.x80_x8F)
    errors = errors | (adv(u8.xF4) & u8.x90_xBF)

    # A suffix byte must appear exactly where a scope expects one.
    errors = errors | (all_scopes ^ u8.suffix)

    u8.error = errors
    return u8
53
54
55#
56# The following calculation of UTF-16 bit streams is consistent
57# with the original u8u16, calculating streams at u8scope42 and
58# u8scope44 positions.
59#
60
def u8_u16_old(u8, u8bit):
    """Compute UTF-16 bit streams, using the scope42 position for the
    first code unit of a surrogate pair.

    Parameters:
      u8    -- object carrying the unibyte, prefix and scopeNN streams.
      u8bit -- indexable collection of the UTF-8 basis bit streams; only
               indices 1..7 are read here.

    Returns the tuple (u16hi, u16lo, delmask): two 8-element lists of bit
    streams for the high and low byte of each UTF-16 code unit, and the
    mask of positions that produce no output (prefix, scope32, scope43).

    NOTE(review): bitutil.Advance is assumed to be a pure one-position
    forward shift, so each advanced stream is computed once and reused.
    """
    adv = bitutil.Advance

    # Basis bits of the preceding byte, seen at the current position.
    prev2 = adv(u8bit[2])
    prev3 = adv(u8bit[3])
    prev4 = adv(u8bit[4])
    prev5 = adv(u8bit[5])
    prev6 = adv(u8bit[6])
    prev7 = adv(u8bit[7])

    last_scope = u8.scope22 | u8.scope33 | u8.scope44
    last_byte = u8.unibyte | last_scope
    scope33 = u8.scope33
    scope42 = u8.scope42
    surrogate = scope42 | u8.scope44

    # Subtract 1 for the first surrogate, rippling the borrow upward
    # through bits 3 and 2 of this byte and bits 7 and 6 of the prefix.
    dec_lo1 = ~u8bit[3]
    dec_lo0 = u8bit[2] ^ dec_lo1
    borrow_a = dec_lo1 & ~u8bit[2]
    dec_hi7 = prev7 ^ borrow_a
    borrow_b = borrow_a & ~prev7
    dec_hi6 = prev6 ^ borrow_b

    u16lo = [
        (last_scope & prev6) | (scope42 & dec_lo0),
        (u8.unibyte & u8bit[1]) | (last_scope & prev7) | (scope42 & dec_lo1),
        (last_byte & u8bit[2]) | (scope42 & u8bit[4]),
        (last_byte & u8bit[3]) | (scope42 & u8bit[5]),
        (last_byte & u8bit[4]) | (scope42 & u8bit[6]),
        (last_byte & u8bit[5]) | (scope42 & u8bit[7]),
        # The last two bits come from the following byte (backward shift).
        (last_byte & u8bit[6]) | (scope42 & (u8bit[2] >> 1)),
        (last_byte & u8bit[7]) | (scope42 & (u8bit[3] >> 1)),
    ]

    u16hi = [
        (scope33 & adv(prev4)) | surrogate,
        (scope33 & adv(prev5)) | surrogate,
        scope33 & adv(prev6),
        (scope33 & adv(prev7)) | surrogate,
        (scope33 & prev2) | surrogate,
        (last_scope & prev3) | u8.scope44,
        (last_scope & prev4) | (scope42 & dec_hi6),
        (last_scope & prev5) | (scope42 & dec_hi7),
    ]

    delmask = u8.prefix | u8.scope32 | u8.scope43
    return (u16hi, u16lo, delmask)
112
113
114
115
116#
117# The following calculation of UTF-16 bit streams uses the
118# u8scope43 position rather than the u8scope42 position for
119# the bits of the first UTF-16 code unit of a surrogate pair.
120# This requires more shifting than with the use of u8scope42,
121# but has the advantage that all shifts are in the forward
122# direction only and can hence be implemented using addition
123# on little-endian architecture.
124#
125
def u8u16(u8, u8bit):
    """Compute UTF-16 bit streams, using the scope43 position for the
    first code unit of a surrogate pair.

    Parameters:
      u8    -- object carrying the unibyte, prefix and scopeNN streams.
      u8bit -- indexable collection of the UTF-8 basis bit streams; only
               indices 1..7 are read here.

    Returns the tuple (u16hi, u16lo, delmask): two 8-element lists of bit
    streams for the high and low byte of each UTF-16 code unit, and the
    mask of positions that produce no output (prefix, scope32, scope42).

    Every shift used here goes through bitutil.Advance; no backward shift
    is needed.

    NOTE(review): bitutil.Advance is assumed to be a pure one-position
    forward shift, so each advanced stream is computed once and reused.
    """
    adv = bitutil.Advance

    # Basis bits of the preceding byte, seen at the current position.
    prev2 = adv(u8bit[2])
    prev3 = adv(u8bit[3])
    prev4 = adv(u8bit[4])
    prev5 = adv(u8bit[5])
    prev6 = adv(u8bit[6])
    prev7 = adv(u8bit[7])

    last_scope = u8.scope22 | u8.scope33 | u8.scope44
    last_byte = u8.unibyte | last_scope
    scope33 = u8.scope33
    scope43 = u8.scope43
    surrogate = scope43 | u8.scope44

    # Subtract 1 for the first surrogate.  These streams are formed at the
    # second-byte position and advanced once more below to reach scope43.
    dec_lo1 = ~u8bit[3]
    dec_lo0 = u8bit[2] ^ dec_lo1
    borrow_a = dec_lo1 & ~u8bit[2]
    dec_hi7 = prev7 ^ borrow_a
    borrow_b = borrow_a & ~prev7
    dec_hi6 = prev6 ^ borrow_b

    u16lo = [
        (last_scope & prev6) | (scope43 & adv(dec_lo0)),
        (u8.unibyte & u8bit[1]) | (last_scope & prev7) | (scope43 & adv(dec_lo1)),
        (last_byte & u8bit[2]) | (scope43 & prev4),
        (last_byte & u8bit[3]) | (scope43 & prev5),
        (last_byte & u8bit[4]) | (scope43 & prev6),
        (last_byte & u8bit[5]) | (scope43 & prev7),
        (last_byte & u8bit[6]) | (scope43 & u8bit[2]),
        (last_byte & u8bit[7]) | (scope43 & u8bit[3]),
    ]

    u16hi = [
        (scope33 & adv(prev4)) | surrogate,
        (scope33 & adv(prev5)) | surrogate,
        scope33 & adv(prev6),
        (scope33 & adv(prev7)) | surrogate,
        (scope33 & prev2) | surrogate,
        (last_scope & prev3) | u8.scope44,
        (last_scope & prev4) | (scope43 & adv(dec_hi6)),
        (last_scope & prev5) | (scope43 & adv(dec_hi7)),
    ]

    delmask = u8.prefix | u8.scope32 | u8.scope42
    return (u16hi, u16lo, delmask)
179
180
181
182#
183# Messages to duplicate u8u16 error reporting.
184#
def IllegalSequenceMessage(pos):
    """Format the diagnostic for an illegal UTF-8 sequence at *pos*."""
    template = "Illegal UTF-8 sequence at position %i in source.\n"
    return template % pos
187
def IncompleteSequenceMessage(pos):
    """Format the diagnostic for input ending inside a UTF-8 sequence
    that starts at *pos*."""
    template = "EOF with incomplete UTF-8 sequence at position %i in source.\n"
    return template % pos
190
191
192import sys
def _byte_value(data, index):
    """Return the integer value of the byte at data[index].

    Indexing yields a one-character string for str input and an int for
    bytes/bytearray input; both cases are normalised to an int.
    """
    item = data[index]
    if isinstance(item, int):
        return item
    return ord(item)


def _sequence_start(u8data, err_pos):
    """Back *err_pos* up to the lead byte of the sequence it falls in.

    Looks at most three bytes back for a byte large enough to be the
    prefix of a 2-, 3- or 4-byte sequence; returns err_pos unchanged when
    none is found.
    """
    if err_pos >= 1 and _byte_value(u8data, err_pos - 1) >= 0xC0:
        return err_pos - 1
    if err_pos >= 2 and _byte_value(u8data, err_pos - 2) >= 0xE0:
        return err_pos - 2
    if err_pos >= 3 and _byte_value(u8data, err_pos - 3) >= 0xF0:
        return err_pos - 3
    return err_pos


def main():
    """Command-line entry point: u8u16.py u8file [u16file].

    Transcodes the UTF-8 input file and writes the result to u16file, or
    to standard output when exactly one argument is given.  When the input
    contains an error, a message is written to stderr and only the data
    before the offending sequence is transcoded.

    Exits with status 1 after printing the usage message when no input
    file is named.
    """
    if len(sys.argv) < 2:
        sys.stderr.write("Usage: u8u16.py u8file [u16file]\n")
        # The original had a bare `exit`, which is a no-op expression.
        sys.exit(1)

    to_file = len(sys.argv) == 3
    if to_file:
        # Binary mode: the output is UTF-16 data, not text lines.
        outfile = open(sys.argv[2], "wb")
    else:
        outfile = sys.stdout

    try:
        u8data = readfile(sys.argv[1])
        u8len = len(u8data)
        (u8bit, EOF_mask) = bitutil.transpose_u8_byte_streams(u8data)
        (u8, control, lex) = byteclass.classify_bytes(u8bit)
        u8 = validate_utf8(u8)
        if u8.error != 0:
            err_pos = bitutil.count_leading_zeroes(u8.error)
            # An error flagged just past the last byte means the input
            # ended in the middle of a sequence.
            at_EOF = err_pos == len(u8data)
            err_pos = _sequence_start(u8data, err_pos)
            if at_EOF:
                sys.stderr.write(IncompleteSequenceMessage(err_pos))
            else:
                sys.stderr.write(IllegalSequenceMessage(err_pos))
            # Only transcode the data before the bad sequence.
            u8len = err_pos
        (u16hi, u16lo, delmask) = u8u16(u8, u8bit)
        U16H = bitutil.filter_bytes(bitutil.inverse_transpose(u16hi, u8len), delmask)
        U16L = bitutil.filter_bytes(bitutil.inverse_transpose(u16lo, u8len), delmask)
        U16final = bitutil.merge_bytes(U16H, U16L)
        outfile.write(U16final)
    finally:
        # Close only what this function opened; leave sys.stdout usable.
        if to_file:
            outfile.close()
        else:
            outfile.flush()


if __name__ == "__main__":
    main()
225
226
Note: See TracBrowser for help on using the repository browser.