source: proto/charsetcompiler/transcode_compiler.py @ 5435

Last change on this file since 5435 was 2233, checked in by cameron, 7 years ago

Identify illegal characters in character sets

File size: 3.4 KB
Line 
#
#  Robert D. Cameron
#  July 1, 2012
#
# Transcoding Character Class Generator
#
# 1. Legacy 8-bit Character Set to UCS2 Decoding
#
# The transcoding problems in this class are simple
# transformations that map a sequence of byte values (0x00-0xFF)
# to a corresponding series of Unicode codepoints confined
# to the UCS2 range (0x0000-0xFFFF).  Given a source character
# C, we represent the Unicode codepoint value as UCS2(C).
#
# In general, these transcoders can be implemented using
# character classes for each output bit of the UTF-16
# representation.
#
# For the high 8 bits u16hi, we define character classes
# for x2u16hi.bit_0 through x2u16hi.bit_7 such that a character C
# is in the class x2u16hi.bit_n iff (UCS2(C) >> (15-n)) & 1 = 1.
#
# For the low 8 bits u16lo, we define character classes
# for x2u16lo.bit_0 through x2u16lo.bit_7 such that a character C
# is in the class x2u16lo.bit_n iff ((C ^ UCS2(C)) >> (7-n)) & 1 = 1.
#
27
28import codecs, sys, string
29#
30#
def Legacy_8bit_To_UCS2_Table(charset_name):
    """Build the byte -> UCS2 codepoint table for a legacy 8-bit charset.

    Each of the 256 possible byte values is decoded on its own with the
    codec registered under ``charset_name``.

    Args:
        charset_name: Codec name accepted by ``codecs.lookup``.

    Returns:
        A tuple ``(table, illegal)``.  ``table`` has 256 entries; entry
        ``b`` is the codepoint that byte ``b`` decodes to, or 0 when the
        byte is not a legal single-byte character.  ``illegal`` lists, in
        ascending order, the byte values that could not be decoded to
        exactly one codepoint in the UCS2 range.

    Raises:
        LookupError: if ``charset_name`` names no known codec.
    """
    codec = codecs.lookup(charset_name)
    table = []
    illegal = []
    for char_val in range(256):
        # bytes(bytearray([n])) is a one-byte string on Python 2 and 3
        # alike; chr(n) would be a text string on Python 3 and every
        # decode would fail.
        raw = bytes(bytearray([char_val]))
        try:
            (uch, consumed) = codec.decode(raw)
        except UnicodeError:
            # Undefined byte (or an incomplete multi-byte sequence).
            uch, consumed = None, 0
        # A legal entry consumes the single byte and yields exactly one
        # code unit that fits in 16 bits.
        if (consumed == 1 and uch is not None and len(uch) == 1
                and ord(uch) <= 0xFFFF):
            table.append(ord(uch))
        else:
            illegal.append(char_val)
            table.append(0)  # placeholder keeps table indexable by byte
    return (table, illegal)
44
45               
def _charvals_to_class_items(charvals):
    """Collapse an ascending list of byte values into character-class items.

    Runs of consecutive values become ``'<first>-<last>'``; an isolated
    value becomes the single character ``chr(value)``.  An empty input
    gives an empty list.
    """
    items = []
    if not charvals:
        return items

    def emit(first, last):
        if first == last:
            items.append(chr(first))
        else:
            items.append('%s-%s' % (chr(first), chr(last)))

    run_start = run_end = charvals[0]
    for v in charvals[1:]:
        if v == run_end + 1:
            run_end = v  # extend the pending run
        else:
            emit(run_start, run_end)
            run_start = run_end = v
    emit(run_start, run_end)  # flush the final run
    return items


def UCS2_Table_To_u16hi_bit(tbl, bitno):
    """Return the character-class items for high-byte bit ``bitno``.

    A byte value ``v`` (0-255) belongs to the class when bit
    ``15 - bitno`` of ``tbl[v]`` is set, i.e. ``bitno`` 0 selects the most
    significant bit of the 16-bit codepoint.

    Args:
        tbl: Sequence of at least 256 integer codepoints indexed by byte.
        bitno: Bit index, 0 through 7.

    Returns:
        List of strings: single characters and ``'a-b'`` ranges.
    """
    members = [v for v in range(256) if (tbl[v] >> (15 - bitno)) & 1 == 1]
    return _charvals_to_class_items(members)


def UCS2_Table_To_u16lo_bit(tbl, bitno):
    """Return the character-class items for low-byte XOR bit ``bitno``.

    A byte value ``v`` (0-255) belongs to the class when bit
    ``7 - bitno`` of ``v ^ tbl[v]`` is set, i.e. when that bit of the low
    output byte differs from the same bit of the input byte.

    Args:
        tbl: Sequence of at least 256 integer codepoints indexed by byte.
        bitno: Bit index, 0 through 7.

    Returns:
        List of strings: single characters and ``'a-b'`` ranges.
    """
    members = [v for v in range(256) if ((v ^ tbl[v]) >> (7 - bitno)) & 1 == 1]
    return _charvals_to_class_items(members)
79
def _string_escape(text):
    """Escape ``text`` the way Python 2's 'string-escape' codec does.

    Backslash and single quote get a backslash prefix, tab/newline/CR
    become ``\\t``/``\\n``/``\\r``, and any other character outside
    printable ASCII (below 0x20 or at/above 0x7f) becomes ``\\xNN`` with
    lowercase hex.  Input characters are expected to be in 0x00-0xFF.
    """
    special = {'\\': '\\\\', "'": "\\'", '\t': '\\t', '\n': '\\n', '\r': '\\r'}
    out = []
    for ch in text:
        if ch in special:
            out.append(special[ch])
        elif ord(ch) < 0x20 or ord(ch) >= 0x7f:
            out.append('\\x%02x' % ord(ch))
        else:
            out.append(ch)
    return "".join(out)


def WriteLegacy_8bit_CharDefs(charset_name, f):
    """Write the character-class definitions for ``charset_name`` to ``f``.

    The output is a comment header line, an optional ``illegal = ...``
    line listing bytes with no valid decoding, eight ``u16hi_bit_N``
    lines and eight ``x16lo_bit_N`` lines.  All of it is written with a
    single ``f.write`` call.

    Args:
        charset_name: Codec name of the legacy 8-bit charset.
        f: Object with a ``write(str)`` method.
    """
    lines = ["# %s to UTF-16 decoding equations\n" % charset_name]
    (tbl, illegal) = Legacy_8bit_To_UCS2_Table(charset_name)
    if illegal:
        bad = "".join(chr(x) for x in illegal)
        # NOTE(review): unlike the bit classes below, this line is emitted
        # without surrounding brackets -- kept as-is; confirm against the
        # consumer of this file.
        lines.append("illegal = %s\n" % _string_escape(bad))
    for bit in range(8):
        cs = UCS2_Table_To_u16hi_bit(tbl, bit)
        lines.append("u16hi_bit_%i = [%s]\n" % (bit, _string_escape("".join(cs))))
    for bit in range(8):
        cs = UCS2_Table_To_u16lo_bit(tbl, bit)
        lines.append("x16lo_bit_%i = [%s]\n" % (bit, _string_escape("".join(cs))))
    f.write("".join(lines))
93
# Command-line entry point:
#   transcode_compiler.py CHARSET            -> definitions on stdout
#   transcode_compiler.py CHARSET OUTFILE    -> definitions written to OUTFILE
if __name__ == "__main__":
    args = sys.argv[1:]
    if len(args) == 1:
        WriteLegacy_8bit_CharDefs(args[0], sys.stdout)
    elif len(args) == 2:
        # 'with' guarantees the file is closed even if generation fails.
        with open(args[1], "w") as f:
            WriteLegacy_8bit_CharDefs(args[0], f)
    else:
        # Previously a wrong argument count silently did nothing.
        sys.stderr.write("Usage: %s charset_name [output_file]\n" % sys.argv[0])
        sys.exit(2)
102
103
Note: See TracBrowser for help on using the repository browser.