source: proto/RE/doc/UTF8_class.py @ 3250

Last change on this file since 3250 was 3250, checked in by cameron, 6 years ago

Prototype for UTF8 character classes

File size: 3.0 KB
Line 
1#
2# Prototype for computing UTF8 character classes
3# Assuming byte-class/byte-range compilers exist.
4#
5# Robert D. Cameron, June 2, 2013
6#
7# Licensed under Open Software License 3.0.
8
9#
10# Definitions for debugging/prototyping
11def ByteClassCompiler(lbyte): return "%x" % lbyte
12def ByteRangeCompiler(lbyte, hbyte): return "[%x-%x]" % (lbyte, hbyte)
13def make_or(e1, e2): return "(%s | %s)" % (e1, e2)
14def make_and(e1, e2): return "(%s & %s)" % (e1, e2)
15def make_shift_forward(e, n): return "(%s >> %i)" % (e, n)
16
17#
18def UTF8_length(codepoint):
19   if codepoint <= 0x7F: return 1
20   elif codepoint <= 0x7FF: return 2
21   elif codepoint <= 0xFFFF: return 3
22   else: return 4
23
24def UTF8_byte(codepoint, n):
25   len = UTF8_length(codepoint)
26   if n == 1:
27     if len == 1: return codepoint
28     elif len == 2: return 0xC0 + (codepoint >> 6) 
29     elif len == 3: return 0xE0 + (codepoint >> 12) 
30     elif len == 4: return 0xF0 + (codepoint >> 18) 
31   else:
32     bits = (codepoint >> (6 * (len - n))) & 0x3F
33     return 0x80 + bits
34
35def max_codepoint_of_length(n):
36   if n == 1: return 0x7F
37   elif n == 2: return 0x7FF
38   elif n == 3: return 0xFFFF
39   else: return 0x10FFFF
40
41def UTF8_range_compiler(lo, hi):
42   hlen = UTF8_length(hi)
43   # If different length code unit sequences are involved, make
44   # a union of equilength subranges.
45   if hlen > UTF8_length(lo):
46     m = max_codepoint_of_length(hlen - 1)
47     return make_or(UTF8_range_compiler(lo, m), UTF8_range_compiler(m+1, hi))
48   #
49   return matched_sequence_compiler(lo, hi, 1, hlen)
50
51def matched_sequence_compiler(lo, hi, n, hlen):
52   hbyte = UTF8_byte(hi, n)
53   lbyte = UTF8_byte(lo, n)
54   if n == hlen: return ByteRangeCompiler(lbyte, hbyte)
55   #
56   # One or more bytes of the lower and upper bound may be the same.
57   # Build a sequence of byte tests.
58   if hbyte == lbyte:
59     test_n = ByteClassCompiler(hbyte)
60     return make_and(make_shift_forward(test_n, 1), matched_sequence_compiler(lo, hi, n+1, hlen))
61   # We now have a range involving different bytes at position n.
62   following_suffix_mask = (1 << ((hlen - n) * 6)) - 1
63   # A separate test may be needed for the high byte sequence if
64   # there are constraints on following suffix bytes.
65   if hi & following_suffix_mask != following_suffix_mask:
66     hi_floor = hi &~following_suffix_mask
67     return make_or(matched_sequence_compiler(hi_floor, hi, n, hlen),
68                    matched_sequence_compiler(lo, hi_floor - 1, n, hlen))
69   # A separate test may be needed for the low byte sequence if
70   # there are constraints on following suffix bytes.
71   if lo & following_suffix_mask != 0:
72     low_ceil = lo | following_suffix_mask
73     return make_or(matched_sequence_compiler(low_ceil + 1, hi, n, hlen),
74                    matched_sequence_compiler(lo, low_ceil, n, hlen))
75   #
76   # Now we have a range that permits all suffix combinations.
77   # We don't have to test suffixes UNDER THE ASSUMPTION that the UTF-8
78   # has been validated.
79   return make_shift_forward(ByteRangeCompiler(lbyte, hbyte), hlen - n)
80
81
Note: See TracBrowser for help on using the repository browser.