source:proto/u16u8/u16u8_compilable.py@358

Last change on this file since 358 was 358, checked in by cameron, 9 years ago

u16u8 prototype

File size: 4.2 KB
Line
1#!/usr/bin/python
2#  u16u8_compilable.py
3#
4#  Python bit stream logic for u16u8
5#  Robert D. Cameron and Dan Lin
6#
7#  Feb. 7, 2010
8#
12#
13import bitutil
14
15#
16#
17#     Range              Code Point Bits           UTF-16              UTF-8
18#                                                u16h     u16l      u8_pre   butlast  u8_last
19#  000000-00007F     00000 00000000 0tuvwxyz   00000000 0tuvwxyz                     0tuvwxyz
20#
21#  000080-0007FF     00000 00000pqr stuvwxyz   00000pqr stuvwxyz            110pqrst 10uvwxyz
22#
23#  000800-00FFFF     00000 jklmnpqr stuvwxyz   jklmnpqr stuvwxyz   1110jklm 10npqrst 10uvwxyz
24#
25#  010000-10FFFF     efghi jklmnpqr stuvwxyz   110110ab cdjklmnp            11110efg 10hijklm
26#                                              110111qr stuvwxyz            10npqrst 10uvwxyz
27#                                           where abcd + 1 = efghi
28
30
31        # Surrogate pairs have 0xD8 through 0xDF as the high UTF-16 byte: 11011xxx pattern
32        u16.surrogate = (u16h[0] & u16h[1]) & (u16h[3] &~ u16h[2]) & u16h[4]
33        # The first of a surrogate pair is in the 0xD8-0xDB range, the second in 0xDC-0xDF
34        u16.hsurrogate = u16.surrogate &~ u16h[5]
35        u16.lsurrogate = u16.surrogate & u16h[5]
36        # If any of the high 5 bits of u16h is set, the code unit is above 0x07FF
37        above_0x7FF =  ((u16h[0] | u16h[1]) | (u16h[2] | u16h[3])) | u16h[4]
38        above_0x7F = ((above_0x7FF | u16h[5]) | (u16h[6] | u16h[7])) | u16l[0]
39        u16.utf8_3  = above_0x7FF &~ u16.surrogate
40        u16.utf8_2 = above_0x7F & ~above_0x7FF
41        u16.ASCII = ~above_0x7F & mask
42        u16.error = bitutil.Advance(u16.hsurrogate) ^ u16.lsurrogate
43
44        u8_last = [0,0,0,0,0,0,0,0]
45        # u8_last corresponds to (a) 1st byte of a 1-byte sequence (u8unibyte),
46        # (b) 2nd byte of a 2-byte sequence (u8scope22),
47        # (c) 3rd byte of a 3-byte sequence (u8scope33),
48        # (d) 2nd byte of a 4-byte sequence (u8scope42) (from UTF-16 high surrogate)
49        # (e) 4th byte of a 4-byte sequence (u8scope44) (from UTF-16 low surrogate)
50
51        # efghi = abcd + 1 calculation using parallel bit streams
52        # Applies only for high surrogate position: u16.hsurrogate = 1
53        efghi_i = ~u16l[1]
54        efghi_h = u16l[0] ^ u16l[1]
55        hcarry = u16l[0] &~ efghi_h
56        efghi_g = u16h[7] ^ hcarry
57        gcarry = u16h[7] &~ efghi_g
58        efghi_f = u16h[6] ^ gcarry
59        efghi_e = u16h[6] &~ efghi_f
60
61        u8_last[0] = ~u16.ASCII
62        u8_last[1] = u16.ASCII & u16l[1]
63#       i.e. u8_last[2] = efghi_h if u16.hsurrogate else u16l[2]  (bitwise select; bits 3-7 below follow the same pattern)
64        u8_last[2] = u16l[2] &~ u16.hsurrogate | efghi_h & u16.hsurrogate
65        u8_last[3] = u16l[3] &~ u16.hsurrogate | efghi_i & u16.hsurrogate
66        u8_last[4] = u16l[4] &~ u16.hsurrogate | u16l[2] & u16.hsurrogate
67        u8_last[5] = u16l[5] &~ u16.hsurrogate | u16l[3] & u16.hsurrogate
68        u8_last[6] = u16l[6] &~ u16.hsurrogate | u16l[4] & u16.hsurrogate
69        u8_last[7] = u16l[7] &~ u16.hsurrogate | u16l[5] & u16.hsurrogate
70
71        u8_butlast = [0,0,0,0,0,0,0,0]
72        # u8_butlast corresponds to (a) 1st byte of a 2-byte sequence (u8prefix2),
73        # (b) 2nd byte of a 3-byte sequence (u8scope32),
74        # (c) 1st byte of a 4-byte sequence (u8prefix4) (from UTF-16 high surrogate)
75        # (d) 3rd byte of a 4-byte sequence (u8scope43) (from UTF-16 low surrogate)
76
77        u8_2or3 =  u16.utf8_2 | u16.utf8_3
78        u8_butlast[0] = ~u16.ASCII
79        u8_butlast[1] = u16.utf8_2 | u16.hsurrogate
80        u8_butlast[2] = u16.hsurrogate | (u16.utf8_3 & u16h[4]) | (u16.lsurrogate & bitutil.Advance(u16l[6]))
81        u8_butlast[3] = u16.hsurrogate | (u8_2or3 & u16h[5]) | (u16.lsurrogate & bitutil.Advance(u16l[7]))
82        u8_butlast[4] = u16h[6] &~ (u16.hsurrogate | u16.ASCII)
83        u8_butlast[5] = u16h[7] &~ (u16.hsurrogate | u16.ASCII) | efghi_e & u16.hsurrogate
84        u8_butlast[6] = u16l[0] &~ (u16.hsurrogate | u16.ASCII) | efghi_f & u16.hsurrogate
85        u8_butlast[7] = u16l[1] &~ (u16.hsurrogate | u16.ASCII) | efghi_g & u16.hsurrogate
86
87        u8_pre = [0,0,0,0,0,0,0,0]
88        u8_pre[0] = u16.utf8_3
89        u8_pre[1] = u16.utf8_3
90        u8_pre[2] = u16.utf8_3
91        u8_pre[3] = 0
92        u8_pre[4] = u16.utf8_3 & u16h[0]
93        u8_pre[5] = u16.utf8_3 & u16h[1]
94        u8_pre[6] = u16.utf8_3 & u16h[2]
95        u8_pre[7] = u16.utf8_3 & u16h[3]
96
97        return (u16, u8_pre, u8_butlast, u8_last)
98
Note: See TracBrowser for help on using the repository browser.