source: proto/RE/doc/UTF8_class.py @ 3433

Last change on this file since 3433 was 3433, checked in by cameron, 6 years ago

fix

File size: 3.2 KB
Line 
1#
2# Prototype for computing UTF8 character classes
3# Assuming byte-class/byte-range compilers exist.
4#
5# Robert D. Cameron, June 2, 2013
6#
7# Licensed under Open Software License 3.0.
8
9#
10# Definitions for debugging/prototyping
11def ByteClassCompiler(lbyte): return "\\x%x" % lbyte
12def ByteRangeCompiler(lbyte, hbyte): return "[\\x%x-\\x%x]" % (lbyte, hbyte)
13def make_or(e1, e2): return "(%s | %s)" % (e1, e2)
14def make_and(e1, e2): return "(%s & %s)" % (e1, e2)
15def make_shift_forward(e, n): return "(%s >> %i)" % (e, n)
16
17#
18def UTF8_length(codepoint):
19   if codepoint <= 0x7F: return 1
20   elif codepoint <= 0x7FF: return 2
21   elif codepoint <= 0xFFFF: return 3
22   else: return 4
23
24def UTF8_byte(codepoint, n):
25   len = UTF8_length(codepoint)
26   if n == 1:
27     if len == 1: return codepoint
28     elif len == 2: return 0xC0 + (codepoint >> 6) 
29     elif len == 3: return 0xE0 + (codepoint >> 12) 
30     elif len == 4: return 0xF0 + (codepoint >> 18) 
31   else:
32     bits = (codepoint >> (6 * (len - n))) & 0x3F
33     return 0x80 + bits
34
35def max_codepoint_of_length(n):
36   if n == 1: return 0x7F
37   elif n == 2: return 0x7FF
38   elif n == 3: return 0xFFFF
39   else: return 0x10FFFF
40
41
42def matched_sequence_compiler(lo, hi, n, hlen):
43   """ Helper function to generate the code necessary to match bytes
44       n through hlen (1-based indexing) of the range of UTF-8 sequences
45       for codepoints lo through hi. """
46   hbyte = UTF8_byte(hi, n)
47   lbyte = UTF8_byte(lo, n)
48   if n == hlen: return ByteRangeCompiler(lbyte, hbyte)
49   #
50   # One or more bytes of the lower and upper bound may be the same.
51   # Build a sequence of byte tests.
52   if hbyte == lbyte:
53     test_n = ByteClassCompiler(hbyte)
54     return make_and(make_shift_forward(test_n, 1), matched_sequence_compiler(lo, hi, n+1, hlen))
55   # We now have a range involving different bytes at position n.
56   following_suffix_mask = (1 << ((hlen - n) * 6)) - 1
57   # A separate test may be needed for the high byte sequence if
58   # there are constraints on following suffix bytes.
59   if hi & following_suffix_mask != following_suffix_mask:
60     hi_floor = hi &~following_suffix_mask
61     return make_or(matched_sequence_compiler(hi_floor, hi, n, hlen),
62                    matched_sequence_compiler(lo, hi_floor - 1, n, hlen))
63   # A separate test may be needed for the low byte sequence if
64   # there are constraints on following suffix bytes.
65   if lo & following_suffix_mask != 0:
66     low_ceil = lo | following_suffix_mask
67     return make_or(matched_sequence_compiler(low_ceil + 1, hi, n, hlen),
68                    matched_sequence_compiler(lo, low_ceil, n, hlen))
69   #
70   # Now we have a range that permits all suffix combinations.
71   # We don't have to test suffixes UNDER THE ASSUMPTION that the UTF-8
72   # has been validated.
73   return make_shift_forward(ByteRangeCompiler(lbyte, hbyte), hlen - n)
74
75def UTF8_range_compiler(lo, hi):
76   hlen = UTF8_length(hi)
77   # If different length code unit sequences are involved, make
78   # a union of equilength subranges.
79   if hlen > UTF8_length(lo):
80     m = max_codepoint_of_length(hlen - 1)
81     return make_or(UTF8_range_compiler(lo, m), UTF8_range_compiler(m+1, hi))
82   #
83   return matched_sequence_compiler(lo, hi, 1, hlen)
84
85
Note: See TracBrowser for help on using the repository browser.