source: proto/charsetcompiler/utf8_lib.py @ 4423

Last change on this file since 4423 was 4223, checked in by cameron, 5 years ago

More functions for utf8 lib

File size: 2.2 KB
Line 
1#
2# Prototype for computing utf8 character classes
3# Assuming byte-class/byte-range compilers exist.
4#
5# Robert D. Cameron, June 2, 2013
6#
7# Licensed under Open Software License 3.0.
8
9#
def utf8_length(codepoint):
    """Return how many bytes the UTF-8 encoding of codepoint occupies.

    The result is 1, 2 or 3 for values up to 0x7F, 0x7FF and 0xFFFF
    respectively, and 4 for anything larger.  No range validation is
    performed on codepoint.
    """
    # Upper bounds of the 1-, 2- and 3-byte ranges, in increasing order.
    for length, limit in enumerate((0x7F, 0x7FF, 0xFFFF), 1):
        if codepoint <= limit:
            return length
    return 4
15
def utf8_byte(codepoint, n):
    """Return the n-th byte (1-based) of the UTF-8 encoding of codepoint.

    For n == 1 the result is the lead byte: the codepoint itself for a
    one-byte sequence, otherwise the length marker (0xC0, 0xE0 or 0xF0)
    OR-ed with the bits of the codepoint above the trailing 6-bit groups.
    For any other n the result is 0x80 OR-ed with the 6-bit group that
    belongs at position n.  n is not validated against the sequence
    length.
    """
    length = utf8_length(codepoint)
    if n != 1:
        # Position n is followed by (length - n) six-bit groups.
        shift = 6 * (length - n)
        return 0x80 | ((codepoint >> shift) & 0x3F)
    if length == 1:
        return codepoint
    # Lead-byte marker for each multi-byte sequence length.
    for size, marker in ((2, 0xC0), (3, 0xE0), (4, 0xF0)):
        if length == size:
            return marker | (codepoint >> (6 * (size - 1)))
26
def max_codepoint_of_length(n):
    """Return the largest codepoint whose UTF-8 encoding is n bytes long.

    n values 1, 2 and 3 map to 0x7F, 0x7FF and 0xFFFF; every other value
    of n yields 0x10FFFF.
    """
    for length, limit in ((1, 0x7F), (2, 0x7FF), (3, 0xFFFF)):
        if n == length:
            return limit
    return 0x10FFFF
32
def max_codepoint_with_initial_byte(byte):
    """Return the largest codepoint whose UTF-8 encoding starts with byte.

    Any byte <= 0x7F yields 0x7F (the top of the one-byte range), not the
    byte itself.  Two lead bytes have a shortened range: 0xED stops at
    0xD7FF, just below 0xD800 where the surrogate range begins, and 0xF4
    stops at 0x10FFFF.  For every other byte the payload bits of the lead
    byte are followed by all-ones trailing bits.  The byte is not checked
    for being a valid UTF-8 lead byte.
    """
    # Lead bytes whose maximum is not simply "all trailing bits set".
    if byte == 0xED:
        return 0xD7FF
    if byte == 0xF4:
        return 0x10FFFF
    if byte <= 0x7F:
        return 0x7F
    if byte <= 0xDF:
        payload, trailing_bits = byte & 0x1F, 6
    elif byte <= 0xEF:
        payload, trailing_bits = byte & 0x0F, 12
    else:
        payload, trailing_bits = byte & 0x07, 18
    return (payload << trailing_bits) | ((1 << trailing_bits) - 1)
40
def min_codepoint_with_initial_byte(byte):
    """Return the smallest codepoint whose UTF-8 encoding starts with byte.

    Any byte <= 0x7F yields 0 (the bottom of the one-byte range), not the
    byte itself.  For multi-byte lead bytes the result is the payload bits
    of the lead byte followed by all-zero trailing bits, except for the
    two lead bytes where that formula would produce an overlong encoding:
    0xE0 starts at 0x800 (E0 A0 80) and 0xF0 starts at 0x10000
    (F0 90 80 80).  The byte is not checked for being a valid UTF-8 lead
    byte.
    """
    if byte <= 0x7F:
        return 0
    if byte <= 0xDF:
        return (byte & 0x1F) << 6
    if byte == 0xE0:
        # 0x800 is the first codepoint needing three bytes and its lead
        # byte is 0xE0; 0x1000 already belongs to lead byte 0xE1.
        return 0x800
    if byte <= 0xEF:
        return (byte & 0x0F) << 12
    if byte == 0xF0:
        # First codepoint needing four bytes.
        return 0x10000
    return (byte & 0x07) << 18
48
49#
50# Given two codepoints lo, hi: return the number of
51# leading UTF-8 bytes that their respective UTF-8
52# representations have in common.
def common_utf8_leading_bytes(lo, hi):
    """Count the leading UTF-8 bytes shared by the encodings of lo and hi.

    Encodings of different lengths are treated as sharing nothing and
    give 0.  Otherwise the result is the largest k such that lo and hi
    agree once their last (length - k) six-bit groups are dropped, or 0
    if they never agree.
    """
    length = utf8_length(lo)
    if utf8_length(hi) != length:
        return 0
    # Try the longest possible shared prefix first, then shorter ones.
    for shared in range(length, 0, -1):
        dropped_bits = 6 * (length - shared)
        if (lo >> dropped_bits) == (hi >> dropped_bits):
            return shared
    return 0
64
65
66#
def is_low_codepoint_after_byte(codepoint, byte):
    """Tell whether every UTF-8 byte of codepoint past position byte is 0x80.

    byte is a count of leading bytes to skip: the bytes examined are those
    at 1-based positions byte + 1 through the end of the encoding.  When
    no bytes remain to examine the result is True.
    """
    return all(
        utf8_byte(codepoint, index + 1) == 0x80
        for index in range(byte, utf8_length(codepoint))
    )
71
def is_high_codepoint_after_byte(codepoint, byte):
    """Tell whether every UTF-8 byte of codepoint past position byte is 0xBF.

    byte is a count of leading bytes to skip: the bytes examined are those
    at 1-based positions byte + 1 through the end of the encoding.  When
    no bytes remain to examine the result is True.
    """
    return all(
        utf8_byte(codepoint, index + 1) == 0xBF
        for index in range(byte, utf8_length(codepoint))
    )
76
77
Note: See TracBrowser for help on using the repository browser.