source: proto/charsetcompiler/if_hierarchy.py @ 4372

Last change on this file since 4372 was 4370, checked in by cameron, 5 years ago

Factor out if-hierarchy support

File size: 14.4 KB
Line 
1#
2# Prototype for computing utf8 character classes
3# Assuming byte-class/byte-range compilers exist.
4#
5# Robert D. Cameron, June 2, 2013
6#
7# Licensed under Open Software License 3.0.
8
9from utf8_lib import *
10from pablo_expr import *
11from CC_compiler import *
12from UTF_encoding import *
13from charset_def import *
14
15
16# Generate a simplest possible test for a Unicode codepoint range
17# such that each 1 bit marks a position within a UTF-8 initial
18# subsequence such that each legal continuation of that subsequence
19# is within the range.  Return the generated variable.
20#
21# The test may be made up of up to three parts:
22# (a) a multibyte low-boundary test,
23# (b) a multibyte high-boundary test, and
24# (c) a range test.
25# It is possible that the low- and high- boundary tests have
26# a common multibyte prefix.
def utf8_iftest_compiler(cgo, lo, hi):
  """Compile a test for the codepoint range [lo, hi] using compiler cgo.

  Entry point for the recursive helper: starts at UTF-8 byte position 1
  with a TrueLiteral marker, and returns the name of the generated
  variable holding the range test.
  """
  # The original also computed utf8_byte(lo, 1)/utf8_byte(hi, 1) here,
  # but never used them; the helper recomputes them itself.
  targetVar = "cp_range_%x_%x" % (lo, hi)
  return utf8_iftest_helper(cgo, lo, hi, 1, targetVar, TrueLiteral())
32
def utf8_iftest_helper(cgo, lo, hi, byte_no, targetVar, marker):
  # Recursively build the range test one UTF-8 byte position at a time.
  # marker is an expression marking positions where the common prefix
  # bytes 1..byte_no-1 have already been matched.  Returns the name of
  # the variable holding the compiled test.
  lo_byte = utf8_byte(lo, byte_no)
  hi_byte = utf8_byte(hi, byte_no)
  # We are "at a boundary" when adjusting the codepoint by one changes
  # byte byte_no of its encoding, i.e. every continuation of the current
  # prefix lies inside the range at this byte position.
  at_lo_boundary = lo == 0 or utf8_byte(lo-1, byte_no) != lo_byte
  at_hi_boundary = hi == 0x10FFFF or utf8_byte(hi+1, byte_no) != hi_byte
  if at_lo_boundary and at_hi_boundary:
    # A single byte(-range) test at this position decides membership.
    if lo_byte == hi_byte:
      byteVar = "byte_%x" % lo_byte
    else:
      # Widen the byte range at the extremes of the codepoint space
      # (0x80 is the first non-ASCII codepoint, 0x10FFFF the last);
      # presumably so all possible lead-byte values are accepted —
      # TODO(review): confirm intent for ill-formed lead bytes.
      if lo == 0x80: lo_byte = 0xC0
      if hi == 0x10FFFF: hi_byte = 0xFF
      byteVar = "byte_range_%x_%x" % (lo_byte, hi_byte)
    cgo.chardef_canonical(CanonicalCharSetDef(byteVar, [(lo_byte, hi_byte)]))
    return cgo.expr_string_to_variable(cgo.expr2py(make_and(marker, Var(byteVar))))
  elif lo_byte == hi_byte:
    # Common prefix byte shared by lo and hi: test it, advance the
    # marker one position, and recurse on the next byte.
    byteVar = "byte_%x" % lo_byte
    cgo.chardef_canonical(CanonicalCharSetDef(byteVar, [(lo_byte, hi_byte)]))
    new_marker = make_shift_forward(make_and(marker, Var(byteVar)), 1)
    return utf8_iftest_helper(cgo, lo, hi, byte_no+1, targetVar, new_marker)
  elif not at_hi_boundary:
    # Split off the high part that shares hi's bytes up to byte_no, so
    # the lower subrange ends exactly on a byte boundary.
    hi1 = min_codepoint_with_common_bytes(hi, byte_no)
    e1 = utf8_iftest_helper(cgo, lo, hi1-1, byte_no, targetVar, marker)
    e2 = utf8_iftest_helper(cgo, hi1, hi, byte_no, targetVar, marker)
    return cgo.expr_string_to_variable(cgo.expr2py(make_or(Var(e1), Var(e2))))
  else: # if at_hi_boundary:
    # Symmetric split: separate off the low part sharing lo's bytes.
    lo1 = max_codepoint_with_common_bytes(lo, byte_no)
    e1 = utf8_iftest_helper(cgo, lo, lo1, byte_no, targetVar, marker)
    e2 = utf8_iftest_helper(cgo, lo1+1, hi, byte_no, targetVar, marker)
    return cgo.expr_string_to_variable(cgo.expr2py(make_or(Var(e1), Var(e2))))
62   
def min_codepoint_with_common_bytes(cp, byte_no):
  """Smallest codepoint whose UTF-8 encoding agrees with cp on bytes
  1..byte_no (each remaining UTF-8 byte carries 6 data bits).  If
  clearing the suffix bits would produce 0, return mask+1 instead."""
  suffix_bits = 6 * (utf8_length(cp) - byte_no)
  mask = (1 << suffix_bits) - 1
  lo_cp = cp & ~mask
  return mask + 1 if lo_cp == 0 else lo_cp
69
def max_codepoint_with_common_bytes(cp, byte_no):
  """Largest codepoint whose UTF-8 encoding agrees with cp on bytes
  1..byte_no: set every data bit below byte position byte_no."""
  suffix_bits = 6 * (utf8_length(cp) - byte_no)
  return cp | ((1 << suffix_bits) - 1)
74
75
def generateCharClassDefsInIfHierarchy(cgo, enclosingRange, ifRangeList, charClassMap, template_var):
#   inner_code = []
   # Generate code for every character class within enclosingRange,
   # nesting the code for each if-range of ifRangeList inside an
   # if-statement guarded by a compiled codepoint-range test.
   (outer_lo, outer_hi) = enclosingRange
   enclosedRanges = rangeIntersect(ifRangeList, outer_lo, outer_hi)
   missingRanges = rangeGaps(enclosedRanges, outer_lo, outer_hi)
   # Codepoints not covered by any if-range are compiled unconditionally.
   for rg in missingRanges:
     (rglo, rghi) = rg
     generateCharClassSubDefs(cgo, rglo, rghi, charClassMap, template_var)
   topRanges = outerRanges(enclosedRanges)
   inner = innerRanges(enclosedRanges)
   for rg in topRanges:
     (rglo, rghi) = rg
     # Skip if-ranges that no character class intersects: they would
     # generate an empty guarded block.
     empty_range = True
     for k in charClassMap.keys():
        if rangeIntersect(charClassMap[k], rglo, rghi) != []:
           empty_range = False
           break
     if not empty_range:
       range_var = utf8_iftest_compiler(cgo, rglo, rghi)
       # Compile the enclosed classes into a separate nested compiler so
       # the resulting code can be wrapped in a single if-statement.
       inner_cgo = CC_compiler(UTF8(), "r%x_%x" % (rglo, rghi) + '_tmp%i', False, '')
       inner_cgo.add_common_expressions(cgo)
       generateCharClassDefsInIfHierarchy(inner_cgo, rg, inner, charClassMap, template_var)
       if inner_cgo.generated_code != []:
         cgo.add_if_stmt(Var(range_var), inner_cgo.generated_code)
   return cgo.showcode()
101
def generateCharClassSubDefs(cgo, lo, hi, charClassMap, template_var):
   """Emit matching code for every character class, restricted to the
   codepoint interval [lo, hi]."""
   # Boundaries dividing codepoints by UTF-8 length, with the E0, ED,
   # F0 and F4 initial-byte subranges separated out so each group is
   # uniform for the sequence generator.
   length_groups = [(0, 0x7F), (0x80, 0x7FF), (0x800, 0xFFF), (0x1000, 0xD7FF),
                    (0xE000, 0xFFFF), (0x10000, 0x3FFFF), (0x40000, 0xFFFFF),
                    (0x100000, 0x10FFFF)]
   for k, class_ranges in charClassMap.items():
     targetVar = template_var % k
     clipped = rangeIntersect(class_ranges, lo, hi)
     for (group_lo, group_hi) in length_groups:
        utf8_sequence_generator(rangeIntersect(clipped, group_lo, group_hi), 1, targetVar, cgo)
111
def rangeIntersect(ccList, lo, hi):
    """Clip every range in ccList to [lo, hi], dropping ranges that lie
    entirely outside the bounds."""
    clipped = []
    for (rlo, rhi) in ccList:
        if rlo <= hi and rhi >= lo:
            clipped.append((max(lo, rlo), min(hi, rhi)))
    return clipped
114
def rangeGaps(ccList, lo, hi):
    """Return the sub-intervals of [lo, hi] not covered by the sorted
    ranges in ccList.  Degenerate bounds (lo >= hi) yield no gaps."""
    gaps = []
    uncovered_lo = lo
    for (rlo, rhi) in ccList:
        if uncovered_lo >= hi:
            return gaps
        if rhi < uncovered_lo:
            # Range lies entirely below the uncovered region; skip it.
            continue
        if rlo > uncovered_lo:
            gaps.append((uncovered_lo, rlo - 1))
            uncovered_lo = rhi + 1
        elif rhi < hi:
            uncovered_lo = rhi + 1
        else:
            # Range covers everything up to hi; nothing more to report.
            return gaps
    if uncovered_lo < hi:
        gaps.append((uncovered_lo, hi))
    return gaps
123
def outerRanges(ccList):
    """Keep only the top-level ranges of a sorted range list, dropping
    any range nested inside the preceding outer range."""
    if len(ccList) <= 1: return ccList
    result = []
    (cur_lo, cur_hi) = ccList[0]
    for (nxt_lo, nxt_hi) in ccList[1:]:
        if nxt_hi <= cur_hi:
            continue  # nested inside the current outer range
        result.append((cur_lo, cur_hi))
        (cur_lo, cur_hi) = (nxt_lo, nxt_hi)
    result.append((cur_lo, cur_hi))
    return result
130
def innerRanges(ccList):
    """Complement of outerRanges: collect the ranges that are nested
    inside the preceding top-level range of a sorted range list."""
    if len(ccList) <= 1: return []
    nested = []
    (cur_lo, cur_hi) = ccList[0]
    for (nxt_lo, nxt_hi) in ccList[1:]:
        if nxt_hi <= cur_hi:
            nested.append((nxt_lo, nxt_hi))
        else:
            (cur_lo, cur_hi) = (nxt_lo, nxt_hi)
    return nested
137
138
139
def generateCharClassDefs(ifRangeList, charClassMap, template_var):
   """Top-level driver: create a fresh compiler, zero-initialize every
   target variable, then generate class definitions over the whole
   Unicode codepoint space using the given if-range hierarchy."""
   cgo = CC_compiler(UTF8(), 'tmp%i', False, '')
   for class_key in charClassMap:
      cgo.add_assignment(template_var % class_key, '0')
   generateCharClassDefsInIfHierarchy(cgo, (0, 0x10FFFF), ifRangeList, charClassMap, template_var)
   return cgo.showcode()
146 
147
148#defaultIfRangeList = [(0,0x7FF), (0, 0x7F), (0x80, 0x3FF), (0x400,0x7FF), (0x800, 0xFFFF), (0x10000, 0x10FFFF)]
149
150#defaultIfRangeList = [(0x80,0x10FFFF), (0x80,0x7FF), (0x800,0xFFFF), (0x10000, 0x10FFFF)]
151
152
# Default hierarchy of candidate if-test ranges.  Outer entries cover
# whole UTF-8 length classes; inner entries group related Unicode blocks
# so generated code for classes absent from a range can be guarded by a
# single range test (see generateCharClassDefsInIfHierarchy).
defaultIfRangeList = [
#Non-ASCII
(0x80,0x10FFFF), 
#Two-byte sequences
(0x80,0x7FF), 
(0x100, 0x3FF), 
#0100..017F; Latin Extended-A
#0180..024F; Latin Extended-B
#0250..02AF; IPA Extensions
#02B0..02FF; Spacing Modifier Letters
(0x100, 0x2FF), (0x100, 0x24F), (0x100, 0x17F), (0x180, 0x24F), (0x250, 0x2AF), (0x2B0, 0x2FF),
#0300..036F; Combining Diacritical Marks
#0370..03FF; Greek and Coptic
(0x300, 0x36F), (0x370, 0x3FF),
#0400..04FF; Cyrillic
#0500..052F; Cyrillic Supplement
#0530..058F; Armenian
#0590..05FF; Hebrew
#0600..06FF; Arabic
(0x400, 0x5FF), (0x400, 0x4FF), (0x500, 0x058F), (0x500, 0x52F), (0x530, 0x58F), (0x590, 0x5FF), (0x600, 0x6FF),
#0700..074F; Syriac
#0750..077F; Arabic Supplement
#0780..07BF; Thaana
#07C0..07FF; NKo
(0x700, 0x77F), (0x700, 0x74F), (0x750, 0x77F), (0x780, 0x7FF), (0x780, 0x7BF), (0x7C0, 0x7FF), 
#Three-byte sequences
(0x800, 0xFFFF),
(0x800, 0x4DFF),
(0x800, 0x1FFF),
(0x800, 0x0FFF),
#0800..083F; Samaritan
#0840..085F; Mandaic
#08A0..08FF; Arabic Extended-A
#0900..097F; Devanagari
#0980..09FF; Bengali
#0A00..0A7F; Gurmukhi
#0A80..0AFF; Gujarati
#0B00..0B7F; Oriya
#0B80..0BFF; Tamil
#0C00..0C7F; Telugu
#0C80..0CFF; Kannada
#0D00..0D7F; Malayalam
#0D80..0DFF; Sinhala
#0E00..0E7F; Thai
#0E80..0EFF; Lao
#0F00..0FFF; Tibetan
(0x1000, 0x1FFF),
#1000..109F; Myanmar
#10A0..10FF; Georgian
#1100..11FF; Hangul Jamo
#1200..137F; Ethiopic
#1380..139F; Ethiopic Supplement
#13A0..13FF; Cherokee
#1400..167F; Unified Canadian Aboriginal Syllabics
#1680..169F; Ogham
#16A0..16FF; Runic
#1700..171F; Tagalog
#1720..173F; Hanunoo
#1740..175F; Buhid
#1760..177F; Tagbanwa
#1780..17FF; Khmer
#1800..18AF; Mongolian
#18B0..18FF; Unified Canadian Aboriginal Syllabics Extended
#1900..194F; Limbu
#1950..197F; Tai Le
#1980..19DF; New Tai Lue
#19E0..19FF; Khmer Symbols
#1A00..1A1F; Buginese
#1A20..1AAF; Tai Tham
#1AB0..1AFF; Combining Diacritical Marks Extended
#1B00..1B7F; Balinese
#1B80..1BBF; Sundanese
#1BC0..1BFF; Batak
#1C00..1C4F; Lepcha
#1C50..1C7F; Ol Chiki
#1CC0..1CCF; Sundanese Supplement
#1CD0..1CFF; Vedic Extensions
#1D00..1D7F; Phonetic Extensions
#1D80..1DBF; Phonetic Extensions Supplement
#1DC0..1DFF; Combining Diacritical Marks Supplement
#1E00..1EFF; Latin Extended Additional
#1F00..1FFF; Greek Extended
(0x2000, 0x4DFF),(0x2000, 0x2FFF),
(0x3000, 0x4DFF),
(0x4E00,0x9FFF),
#4E00..9FFF; CJK Unified Ideographs
(0xA000,0xFFFF),
#Four-byte sequences
(0x10000, 0x10FFFF)]
242
243
244#
245# Partition a list of ranges into a minimum set of utf8 groups
246# UTF-8 prefix groups, where a group is
247# (a) a range of codepoints with UTF-8 prefixes of the same length
248#     such that every codepoint in the range is within the group, or
249# (b) a sublist all having the same UTF-8 initial
250#     byte
def partition_by_UTF8_group(range_list, byte_no):
    # Recursively partition range_list into groups at byte position
    # byte_no: either a single range covering all continuations of a
    # common UTF-8 prefix (type (a)), or a run of ranges sharing the
    # same byte value at position byte_no (type (b)).
    if range_list == []: return []
    (lo, hi) = range_list[0]
    u8len_lo = utf8_length(lo)
    u8len_hi = utf8_length(hi)
    # First normalize: split any range whose endpoints have different
    # UTF-8 encoded lengths at the length boundary and re-partition.
    if u8len_lo != u8len_hi:
        mid = max_codepoint_of_length(u8len_lo)
        return partition_by_UTF8_group([(lo, mid), (mid+1, hi)] + range_list[1:], byte_no)
    lobyte1 = utf8_byte(lo, byte_no)
    hibyte1 = utf8_byte(hi, byte_no)
    if lobyte1 != hibyte1:
        # The range spans several byte values at position byte_no.
        if not is_low_codepoint_after_byte(lo, byte_no):
            # Trim off the low fragment up to the top of lo's prefix
            # block (set all remaining 6-bit suffix positions to ones).
            lo1 = lo | ((1 << (6 * (u8len_lo - byte_no))) - 1)
            #print "lo--lo1:  %x--%x" % (lo, lo1)
            return [[(lo, lo1)]] + partition_by_UTF8_group([(lo1+1, hi)] + range_list[1:], byte_no)
        elif not is_high_codepoint_after_byte(hi, byte_no):
            # Trim off the high fragment starting at hi's prefix block
            # (clear all remaining 6-bit suffix positions).
            hi1 = hi &~ ((1 << (6 * (u8len_lo - byte_no))) - 1)
            #print "lo--hi-1:  %x--%x" % (lo, hi1-1)
            return [[(lo, hi1-1)]] + partition_by_UTF8_group([(hi1, hi)] + range_list[1:], byte_no)
        else:
            # we have a prefix group of type (a)
            return [[(lo, hi)]] + partition_by_UTF8_group(range_list[1:], byte_no)
    # Same byte at position byte_no throughout this range: merge it with
    # the following group when that group shares the byte value.
    group1 = [(lo, hi)]
    subpartitions = partition_by_UTF8_group(range_list[1:], byte_no)
    if subpartitions == []: return [group1]
    elif utf8_byte(subpartitions[0][0][0], byte_no) == lobyte1:
        return [group1 + subpartitions[0]] + subpartitions[1:]
    else:
        return [group1] + subpartitions
280
281# Ensure the sequence of preceding bytes is defined, up to, but
282# not including the given byte_no
def ensure_preceding_prefix_defined(codepoint, byte_no, cgo):
   """Ensure the byte and prefix variables for the UTF-8 encoding of
   codepoint are defined in cgo, for byte positions up to but not
   including byte_no."""
   for pos in range(1, byte_no):
      byte_val = utf8_byte(codepoint, pos)
      byteVar = "byte_%x" % byte_val
      cgo.chardef_canonical(CanonicalCharSetDef(byteVar, [(byte_val, byte_val)]))
      if pos > 1:
         # Chain this byte onto the already-defined shorter prefix:
         # prefix(pos) = shift(prefix(pos-1)) & byte.
         prev_prefix = utf8_prefix_var(codepoint, pos-1)
         prev_adv = prev_prefix + "_adv"
         cgo.add_canonical_assignment(prev_adv, cgo.expr2py(make_shift_forward(Var(prev_prefix), 1)))
         cur_prefix = utf8_prefix_var(codepoint, pos)
         cgo.add_canonical_assignment(cur_prefix, cgo.expr2py(make_and(Var(prev_adv), Var(byteVar))))
294
295
296#
297# Generate remaining code to match UTF-8 code sequences within
298# the codepoint set u8_partition, assuming that the code matching the
299# sequences up to byte number byte_no have been generated.
300#
def utf8_sequence_generator(u8_partition, byte_no, targetVar, cgo):
   # Generate code matching the UTF-8 sequences of the codepoint ranges
   # in u8_partition, assuming bytes 1..byte_no-1 are already matched.
   # Matches are OR-ed into targetVar via cgo.  All ranges in a
   # partition are expected to share UTF-8 length and prefix bytes.
   if len(u8_partition) == 0: return
   (lo, hi) = u8_partition[0]
   if utf8_length(lo) == byte_no:
      # We have a single byte remaining to match for all codepoints
      # in this partition.  Use the byte class compiler to generate
      # matches for these codepoints.
      ensure_preceding_prefix_defined(lo, byte_no, cgo)
      byte_pair_list = byte_definitions(u8_partition, byte_no)
      #print byte_pair_list
      if len(byte_pair_list) == 1:
          (lobyte, hibyte) = byte_pair_list[0]
          if lo == hi:
              final_byte_var = "byte_%x" % lobyte
          else:
              final_byte_var = "byte_range_%x_%x" % (lobyte, hibyte)
          cgo.chardef_canonical(CanonicalCharSetDef(final_byte_var, byte_pair_list))
      else:
          # NOTE(review): this takes the LOW end of the last range
          # ([-1][0], not [-1][1]); it only affects the generated
          # variable name, but [-1][1] may have been intended — confirm.
          hi = u8_partition[-1][0]
          final_byte_var = "%s_range_%x_%x_%i" % (targetVar[-2:], lo, hi, byte_no)
          cgo.chardef2py(CanonicalCharSetDef(final_byte_var, byte_pair_list))
      test_expr = Var(final_byte_var)
      if byte_no > 1: 
         # Combine the final byte with the advanced preceding prefix.
         pfx1 = utf8_prefix_var(lo, byte_no-1)
         pfx1_adv = pfx1 + "_adv"
         cgo.add_canonical_assignment(pfx1_adv, cgo.expr2py(make_shift_forward(Var(pfx1), 1)))
         test_expr = make_and(Var(pfx1_adv), test_expr)
      cgo.add_assignment(targetVar, cgo.expr2py(make_or(Var(targetVar), test_expr)))
   else:
     # More than one byte remains: partition by the byte at byte_no and
     # handle each group separately.
     partitions = partition_by_UTF8_group(u8_partition, byte_no)
     for p in partitions:
       (lo, hi) = p[0]
       lbyte = utf8_byte(lo, byte_no)
       hbyte = utf8_byte(hi, byte_no)
       ensure_preceding_prefix_defined(lo, byte_no, cgo)
       if lbyte == hbyte:
         # Single byte value at this position: extend the prefix and
         # recurse on the next byte position.
         byteVar = "byte_%x" % lbyte
         cgo.chardef_canonical(CanonicalCharSetDef(byteVar, [(lbyte, lbyte)]))
         if byte_no > 1:
           last_prefix = utf8_prefix_var(lo, byte_no - 1)
           this_prefix = utf8_prefix_var(lo, byte_no)
           cgo.add_canonical_assignment(this_prefix, cgo.expr2py(make_and(make_shift_forward(Var(last_prefix), 1), Var(byteVar))))
         if byte_no < utf8_length(lo): utf8_sequence_generator(p, byte_no+1, targetVar, cgo)
       else:
         # Byte range at this position (a complete prefix group): match
         # the range, then accept any continuation bytes 0x80..0xBF for
         # the remaining positions.
         byteVar = "byte_range_%x_%x" % (lbyte, hbyte)
         cgo.chardef_canonical(CanonicalCharSetDef(byteVar, [(lbyte, hbyte)]))
         if byte_no > 1:
           last_prefix = utf8_prefix_var(lo, byte_no - 1)
           this_prefix = last_prefix + "_" + byteVar
           cgo.add_canonical_assignment(this_prefix, cgo.expr2py(make_and(make_shift_forward(Var(last_prefix), 1), Var(byteVar))))
         else: this_prefix = byteVar
         suffixVar = "byte_range_%x_%x" % (0x80, 0xBF)
         cgo.chardef_canonical(CanonicalCharSetDef(suffixVar, [(0x80, 0xBF)]))
         last_prefix = this_prefix
         while byte_no < utf8_length(lo):
           byte_no += 1
           this_prefix = last_prefix + "_sfx"
           cgo.add_assignment(this_prefix, cgo.expr2py(make_and(make_shift_forward(Var(last_prefix), 1), Var(suffixVar))))
           last_prefix = this_prefix
         cgo.add_assignment(targetVar, cgo.expr2py(make_or(Var(targetVar), Var(last_prefix))))
361
362
363
def utf8_prefix_var(codepoint, prefix_bytes):
   """Name of the variable marking a match of the first prefix_bytes
   UTF-8 bytes of codepoint: "byte_<b1>" for a one-byte prefix,
   "sequence_<b1>_<b2>..." for longer prefixes."""
   if prefix_bytes == 0:
      raise Exception ("utf8_prefix_var(%x, %i)" % (codepoint, prefix_bytes))
   if prefix_bytes == 1:
      return "byte_%x" % utf8_byte(codepoint, 1)
   byte_names = ["%x" % utf8_byte(codepoint, pos + 1) for pos in range(prefix_bytes)]
   return "_".join(["sequence"] + byte_names)
371
372
def byte_definitions(range_list, n):
   """Project each codepoint range onto its n-th UTF-8 byte, producing
   a list of (low_byte, high_byte) pairs."""
   return [(utf8_byte(lo, n), utf8_byte(hi, n)) for (lo, hi) in range_list]
378
379
380
381
382
Note: See TracBrowser for help on using the repository browser.