source: proto/charsetcompiler/if_hierarchy.py @ 4424

Last change on this file since 4424 was 4424, checked in by cameron, 4 years ago

Restructure/bug fix for if-hierarchy generation

File size: 13.9 KB
Line 
1#
2# Prototype for computing utf8 character classes
3# Assuming byte-class/byte-range compilers exist.
4#
5# Robert D. Cameron, June 2, 2013
6#
7# Licensed under Open Software License 3.0.
8
9from utf8_lib import *
10from pablo_expr import *
11from CC_compiler import *
12from UTF_encoding import *
13from charset_def import *
14
15
16# Generate a simplest possible test for a Unicode codepoint range
17# such that each 1 bit marks a position within a UTF-8 initial
18# subsequence such that each legal continuation of that subsequence
19# is within the range.  Return the generated variable.
20#
21# The test may be made up of up to three parts:
22# (a) a multibyte low-boundary test,
23# (b) a multibyte high-boundary test, and
24# (c) a range test.
25# It is possible that the low- and high- boundary tests have
26# a common multibyte prefix.
def utf8_iftest_compiler(cgo, lo, hi):
  """Generate a simplest-possible test for the Unicode codepoint range
  [lo, hi], returning the name of the generated variable.

  Each 1 bit of the generated stream marks a position within a UTF-8
  initial subsequence such that every legal continuation of that
  subsequence lies within the range.
  """
  # The original computed lo_byte/hi_byte here but never used them; the
  # helper recomputes whatever bytes it needs at each recursion level.
  targetVar = "cp_range_%x_%x" % (lo, hi)
  # Start at byte 1 with an all-ones marker (no prefix constraints yet).
  return utf8_iftest_helper(cgo, lo, hi, 1, targetVar, TrueLiteral())
32
def utf8_iftest_helper(cgo, lo, hi, byte_no, targetVar, marker):
  # Recursively build the byte-level tests for codepoint range [lo, hi],
  # examining the byte_no-th byte of the UTF-8 encodings.  marker is the
  # expression marking positions whose preceding bytes already matched.
  # Returns the name of the variable holding the combined test.
  lo_byte = utf8_byte(lo, byte_no)
  hi_byte = utf8_byte(hi, byte_no)
  # A boundary holds when no codepoint just outside the range shares this
  # byte value, so a test on this byte alone cannot over-match.
  at_lo_boundary = lo == 0 or utf8_byte(lo-1, byte_no) != lo_byte
  at_hi_boundary = hi == 0x10FFFF or utf8_byte(hi+1, byte_no) != hi_byte
  if at_lo_boundary and at_hi_boundary:
    # Both ends are on byte boundaries: a single byte (or byte-range)
    # test at this position suffices.
    if lo_byte == hi_byte:
      byteVar = "byte_%x" % lo_byte
    else:
      # Widen to the full legal lead-byte range at the extremes of the
      # codepoint space so the generated byte test is as simple as possible.
      if lo == 0x80: lo_byte = 0xC0
      if hi == 0x10FFFF: hi_byte = 0xFF
      byteVar = "byte_range_%x_%x" % (lo_byte, hi_byte)
    cgo.chardef_canonical(CanonicalCharSetDef(byteVar, [(lo_byte, hi_byte)]))
    return cgo.expr_string_to_variable(cgo.expr2py(make_and(marker, Var(byteVar))))
  elif lo_byte == hi_byte:
    # Both endpoints share this byte: match it, shift the marker forward,
    # and continue the test with the next byte position.
    byteVar = "byte_%x" % lo_byte
    cgo.chardef_canonical(CanonicalCharSetDef(byteVar, [(lo_byte, hi_byte)]))
    new_marker = make_shift_forward(make_and(marker, Var(byteVar)), 1)
    return utf8_iftest_helper(cgo, lo, hi, byte_no+1, targetVar, new_marker)
  elif not at_hi_boundary:
    # Split off the final portion that shares hi's bytes up to byte_no
    # (the multibyte high-boundary test) and OR the two sub-tests.
    hi1 = min_codepoint_with_common_bytes(hi, byte_no)
    e1 = utf8_iftest_helper(cgo, lo, hi1-1, byte_no, targetVar, marker)
    e2 = utf8_iftest_helper(cgo, hi1, hi, byte_no, targetVar, marker)
    return cgo.expr_string_to_variable(cgo.expr2py(make_or(Var(e1), Var(e2))))
  else: # if at_hi_boundary:
    # Split off the initial portion that shares lo's bytes up to byte_no
    # (the multibyte low-boundary test) and OR the two sub-tests.
    lo1 = max_codepoint_with_common_bytes(lo, byte_no)
    e1 = utf8_iftest_helper(cgo, lo, lo1, byte_no, targetVar, marker)
    e2 = utf8_iftest_helper(cgo, lo1+1, hi, byte_no, targetVar, marker)
    return cgo.expr_string_to_variable(cgo.expr2py(make_or(Var(e1), Var(e2))))
62   
def min_codepoint_with_common_bytes(cp, byte_no):
  """Return the smallest codepoint whose UTF-8 bytes 1..byte_no agree
  with those of cp."""
  suffix_bits = (utf8_length(cp) - byte_no) * 6
  mask = (1 << suffix_bits) - 1
  base = cp & ~mask
  # Substitute mask+1 when clearing the suffix bits yields zero, as in
  # the original implementation.
  return base if base != 0 else mask + 1
69
def max_codepoint_with_common_bytes(cp, byte_no):
  """Return the largest codepoint whose UTF-8 bytes 1..byte_no agree
  with those of cp (all remaining 6-bit groups set to ones)."""
  suffix_bits = (utf8_length(cp) - byte_no) * 6
  return cp | ((1 << suffix_bits) - 1)
74
75
def generateCharClassDefsInIfHierarchy(cgo, enclosingRange, ifRangeList, charClassMap, template_var):
   # Generate character-class definitions for every class in charClassMap
   # over enclosingRange, nesting generated code inside if-tests for each
   # range of ifRangeList that intersects the enclosing range.  Returns
   # the accumulated code via cgo.showcode().
   (outer_lo, outer_hi) = enclosingRange
   enclosedRanges = rangeIntersect(ifRangeList, outer_lo, outer_hi)
   missingRanges = rangeGaps(enclosedRanges, outer_lo, outer_hi)
   # Codepoints in unenclosed ranges will be computed unconditionally.
   # Generate them first so that computed subexpressions may be shared
   # with calculations within the if hierarchy.
   for rg in missingRanges:
     (rglo, rghi) = rg
     generateCharClassSubDefs(cgo, rglo, rghi, charClassMap, template_var)
   # Top-level ranges get their own if-tests; ranges nested within them
   # are handled by the recursive call below.
   topRanges = outerRanges(enclosedRanges)
   inner = innerRanges(enclosedRanges)
   for rg in topRanges:
     (rglo, rghi) = rg
     # Skip ranges in which no character class has any codepoints.
     empty_range = True
     for k in charClassMap.keys():
        if rangeIntersect(charClassMap[k], rglo, rghi) != []:
           empty_range = False
           break
     if not empty_range:
       # Compile the if-test for this range, then generate the enclosed
       # definitions with a nested compiler so they can be emitted as the
       # body of an if statement.
       range_var = utf8_iftest_compiler(cgo, rglo, rghi)
       inner_cgo = CC_compiler(UTF8(), "r%x_%x" % (rglo, rghi) + '_tmp%i', False, '')
       inner_cgo.add_common_expressions(cgo)
       generateCharClassDefsInIfHierarchy(inner_cgo, rg, inner, charClassMap, template_var)
       # Only emit the if statement when the nested compiler produced code.
       if inner_cgo.generated_code != []:
         cgo.add_if_stmt(Var(range_var), inner_cgo.generated_code)
   return cgo.showcode()
104
def generateCharClassSubDefs(cgo, lo, hi, charClassMap, template_var):
   """Generate, for each character class, the code matching its codepoints
   within [lo, hi], accumulating matches into the class's target variable."""
   # Partition boundaries chosen by UTF-8 length, separating out the
   # E0, ED, F0 and F4 ranges.
   partitions = [(0, 0x7F), (0x80, 0x7FF), (0x800, 0xFFF), (0x1000, 0xD7FF),
                 (0xD800, 0xDFFF), (0xE000, 0xFFFF), (0x10000, 0x3FFFF),
                 (0x40000, 0xFFFFF), (0x100000, 0x10FFFF)]
   for cc_name in charClassMap.keys():
     target = template_var % cc_name
     clipped = rangeIntersect(charClassMap[cc_name], lo, hi)
     for (part_lo, part_hi) in partitions:
        utf8_sequence_generator(rangeIntersect(clipped, part_lo, part_hi), 1, target, cgo)
114
def rangeIntersect(ccList, lo, hi):
    """Clip each (lo, hi) pair in ccList to the window [lo, hi], dropping
    pairs that fall entirely outside it."""
    clipped = []
    for (range_lo, range_hi) in ccList:
        if range_lo <= hi and range_hi >= lo:
            clipped.append((max(lo, range_lo), min(hi, range_hi)))
    return clipped
117
def rangeGaps(ccList, lo, hi):
    """Return the sorted list of maximal subranges of [lo, hi] that are
    not covered by ccList (a sorted list of disjoint (lo, hi) pairs).

    Bug fixes: the original tested `lo >= hi`, which silently dropped a
    one-codepoint gap when the remaining window shrank to a single
    codepoint (e.g. gaps of [(0,4)] in [0,5] lost (5,5)); it also emitted
    an unclamped gap extending past hi when the first range started
    entirely above the window.
    """
    # Strict > so that a single-codepoint window [n, n] still reports a gap.
    if lo > hi: return []
    if ccList == []: return [(lo, hi)]
    (lo1, hi1) = ccList[0]
    # Range entirely below the window: skip it.
    if hi1 < lo: return rangeGaps(ccList[1:], lo, hi)
    # Range entirely above the window: the whole remaining window is a gap.
    if lo1 > hi: return [(lo, hi)]
    if lo1 > lo: return [(lo, lo1 - 1)] + rangeGaps(ccList[1:], hi1+1, hi)
    elif hi1 < hi: return rangeGaps(ccList[1:], hi1+1, hi)
    else: return []
126
def outerRanges(ccList):
    """Return the ranges of ccList that are not nested inside an earlier
    range, assuming the list is ordered with enclosing ranges first."""
    if len(ccList) <= 1: return ccList
    result = []
    (cur_lo, cur_hi) = ccList[0]
    for (nxt_lo, nxt_hi) in ccList[1:]:
        if nxt_hi <= cur_hi:
            # Nested within the current outer range: drop it here.
            continue
        result.append((cur_lo, cur_hi))
        (cur_lo, cur_hi) = (nxt_lo, nxt_hi)
    result.append((cur_lo, cur_hi))
    return result
133
def innerRanges(ccList):
    """Return the ranges of ccList that are nested inside an earlier range,
    assuming the list is ordered with enclosing ranges first."""
    if len(ccList) <= 1: return []
    nested = []
    (cur_lo, cur_hi) = ccList[0]
    for (nxt_lo, nxt_hi) in ccList[1:]:
        if nxt_hi <= cur_hi:
            # Enclosed by the current outer range: collect it.
            nested.append((nxt_lo, nxt_hi))
        else:
            (cur_lo, cur_hi) = (nxt_lo, nxt_hi)
    return nested
140
141
142
def generateCharClassDefs(ifRangeList, charClassMap, template_var):
   """Compile all character classes in charClassMap under the if-hierarchy
   described by ifRangeList, returning the generated code."""
   compiler = CC_compiler(UTF8(), 'tmp%i', False, '')
   # Initialize every target stream to zero before the conditional updates.
   for cc_name in charClassMap.keys():
     compiler.add_assignment(template_var % cc_name, '0')
   full_range = (0, 0x10FFFF)
   generateCharClassDefsInIfHierarchy(compiler, full_range, ifRangeList, charClassMap, template_var)
   return compiler.showcode()
149 
150
151#defaultIfRangeList = [(0,0x7FF), (0, 0x7F), (0x80, 0x3FF), (0x400,0x7FF), (0x800, 0xFFFF), (0x10000, 0x10FFFF)]
152
153#defaultIfRangeList = [(0x80,0x10FFFF), (0x80,0x7FF), (0x800,0xFFFF), (0x10000, 0x10FFFF)]
154
155
# Default hierarchy of codepoint ranges for if-test generation.  Outer
# ranges precede the ranges they enclose; comments name the Unicode blocks
# covered by the range(s) that follow them.
defaultIfRangeList = [
#Non-ASCII
(0x80,0x10FFFF),
#Two-byte sequences
(0x80,0x7FF),
(0x100, 0x3FF),
#0100..017F; Latin Extended-A
#0180..024F; Latin Extended-B
#0250..02AF; IPA Extensions
#02B0..02FF; Spacing Modifier Letters
(0x100, 0x2FF), (0x100, 0x24F), (0x100, 0x17F), (0x180, 0x24F), (0x250, 0x2AF), (0x2B0, 0x2FF),
#0300..036F; Combining Diacritical Marks
#0370..03FF; Greek and Coptic
(0x300, 0x36F), (0x370, 0x3FF),
#0400..04FF; Cyrillic
#0500..052F; Cyrillic Supplement
#0530..058F; Armenian
#0590..05FF; Hebrew
#0600..06FF; Arabic
(0x400, 0x5FF), (0x400, 0x4FF), (0x500, 0x058F), (0x500, 0x52F), (0x530, 0x58F), (0x590, 0x5FF), (0x600, 0x6FF),
#0700..074F; Syriac
#0750..077F; Arabic Supplement
#0780..07BF; Thaana
#07C0..07FF; NKo
(0x700, 0x77F), (0x700, 0x74F), (0x750, 0x77F), (0x780, 0x7FF), (0x780, 0x7BF), (0x7C0, 0x7FF),
#Three-byte sequences
(0x800, 0xFFFF),
(0x800, 0x4DFF),
(0x800, 0x1FFF),
(0x800, 0x0FFF),
#0800..083F; Samaritan
#0840..085F; Mandaic
#08A0..08FF; Arabic Extended-A
#0900..097F; Devanagari
#0980..09FF; Bengali
#0A00..0A7F; Gurmukhi
#0A80..0AFF; Gujarati
#0B00..0B7F; Oriya
#0B80..0BFF; Tamil
#0C00..0C7F; Telugu
#0C80..0CFF; Kannada
#0D00..0D7F; Malayalam
#0D80..0DFF; Sinhala
#0E00..0E7F; Thai
#0E80..0EFF; Lao
#0F00..0FFF; Tibetan
(0x1000, 0x1FFF),
#1000..109F; Myanmar
#10A0..10FF; Georgian
#1100..11FF; Hangul Jamo
#1200..137F; Ethiopic
#1380..139F; Ethiopic Supplement
#13A0..13FF; Cherokee
#1400..167F; Unified Canadian Aboriginal Syllabics
#1680..169F; Ogham
#16A0..16FF; Runic
#1700..171F; Tagalog
#1720..173F; Hanunoo
#1740..175F; Buhid
#1760..177F; Tagbanwa
#1780..17FF; Khmer
#1800..18AF; Mongolian
#18B0..18FF; Unified Canadian Aboriginal Syllabics Extended
#1900..194F; Limbu
#1950..197F; Tai Le
#1980..19DF; New Tai Lue
#19E0..19FF; Khmer Symbols
#1A00..1A1F; Buginese
#1A20..1AAF; Tai Tham
#1AB0..1AFF; Combining Diacritical Marks Extended
#1B00..1B7F; Balinese
#1B80..1BBF; Sundanese
#1BC0..1BFF; Batak
#1C00..1C4F; Lepcha
#1C50..1C7F; Ol Chiki
#1CC0..1CCF; Sundanese Supplement
#1CD0..1CFF; Vedic Extensions
#1D00..1D7F; Phonetic Extensions
#1D80..1DBF; Phonetic Extensions Supplement
#1DC0..1DFF; Combining Diacritical Marks Supplement
#1E00..1EFF; Latin Extended Additional
#1F00..1FFF; Greek Extended
(0x2000, 0x4DFF),(0x2000, 0x2FFF),
(0x3000, 0x4DFF),
(0x4E00,0x9FFF),
#4E00..9FFF; CJK Unified Ideographs
(0xA000,0xFFFF),
#Four-byte sequences
(0x10000, 0x10FFFF)]
245
246
247
248# Ensure the sequence of preceding bytes is defined, up to, but
249# not including the given byte_no
def ensure_preceding_prefix_defined(codepoint, byte_no, cgo):
   # Emit definitions for bytes 1 .. byte_no-1 of codepoint's UTF-8
   # encoding, together with the chained prefix variables marking positions
   # where those bytes occur in sequence.  byte_no itself is not defined.
   for i in range(1, byte_no):
      byte_i = utf8_byte(codepoint, i)
      byteVar = "byte_%x" % byte_i
      cgo.chardef_canonical(CanonicalCharSetDef(byteVar, [(byte_i, byte_i)]))
      if i > 1:
         # Extend the previous prefix: advance it one position and require
         # the current byte to occur there.
         pfx1 = utf8_prefix_var(codepoint, i-1)
         pfx1_adv = pfx1 + "_adv"
         cgo.add_canonical_assignment(pfx1_adv, cgo.expr2py(make_shift_forward(Var(pfx1), 1)))
         pfx2 = utf8_prefix_var(codepoint, i)
         cgo.add_canonical_assignment(pfx2, cgo.expr2py(make_and(Var(pfx1_adv), Var(byteVar))))
261
262
263
264
265#
266# Generate remaining code to match UTF-8 code sequences within
267# the codepoint set cpset, assuming that the code matching the
268# sequences up to byte number byte_no have been generated.
269#
def utf8_sequence_generator(cpset, byte_no, targetVar, cgo):
    """Generate the remaining code to match UTF-8 code sequences for the
    codepoint ranges in cpset, assuming the code matching bytes up to (but
    not including) byte_no has already been generated.  Matches are ORed
    into targetVar.

    Bug fixes relative to the original:
    - the length-split recursion called an undefined range_intersect,
      omitted the targetVar/cgo arguments, and referenced an undefined hi1;
    - the suffix while-loop mutated byte_no, corrupting the byte position
      for subsequent ranges of the enclosing for-loop.
    """
    if len(cpset) == 0: return
    (lo, hi) = cpset[0]
    u8len_lo = utf8_length(lo)
    u8len_max = utf8_length(cpset[-1][1])
    if u8len_lo != u8len_max:
        # Mixed encoding lengths: split the set at the length boundary and
        # process each uniform-length part separately.
        mid = max_codepoint_of_length(u8len_lo)
        utf8_sequence_generator(rangeIntersect(cpset, lo, mid), byte_no, targetVar, cgo)
        utf8_sequence_generator(rangeIntersect(cpset, mid+1, cpset[-1][1]), byte_no, targetVar, cgo)
        return
    if u8len_lo == byte_no:
        # We have a single byte remaining to match for all codepoints
        # in this cpset.  Use the byte class compiler to generate
        # matches for these codepoints.
        ensure_preceding_prefix_defined(lo, byte_no, cgo)
        byte_pair_list = byte_definitions(cpset, byte_no)
        if len(byte_pair_list) == 1:
            (lobyte, hibyte) = byte_pair_list[0]
            if lo == hi:
                final_byte_var = "byte_%x" % lobyte
            else:
                final_byte_var = "byte_range_%x_%x" % (lobyte, hibyte)
            cgo.chardef_canonical(CanonicalCharSetDef(final_byte_var, byte_pair_list))
        else:
            # Multiple final-byte pairs: name the class after the target.
            hi = cpset[-1][1]
            final_byte_var = "%s_range_%x_%x_%i" % (targetVar[-2:], lo, hi, byte_no)
            cgo.chardef2py(CanonicalCharSetDef(final_byte_var, byte_pair_list))
        test_expr = Var(final_byte_var)
        if byte_no > 1:
           # Require the preceding prefix to have matched one position back.
           pfx1 = utf8_prefix_var(lo, byte_no-1)
           pfx1_adv = pfx1 + "_adv"
           cgo.add_canonical_assignment(pfx1_adv, cgo.expr2py(make_shift_forward(Var(pfx1), 1)))
           test_expr = make_and(Var(pfx1_adv), test_expr)
        cgo.add_assignment(targetVar, cgo.expr2py(make_or(Var(targetVar), test_expr)))
        return
    # More than one byte remains: handle each range at this byte position.
    for rg in cpset:
        (lo, hi) = rg
        lbyte = utf8_byte(lo, byte_no)
        hbyte = utf8_byte(hi, byte_no)
        if lbyte != hbyte:
            if not is_low_codepoint_after_byte(lo, byte_no):
                # lo is not at a low boundary: split off the initial portion
                # up to the boundary and recurse on the two halves.
                lo1 = lo | ((1 << (6 * (u8len_lo - byte_no))) - 1)
                utf8_sequence_generator([(lo, lo1)], byte_no, targetVar, cgo)
                utf8_sequence_generator([(lo1+1, hi)], byte_no, targetVar, cgo)
            elif not is_high_codepoint_after_byte(hi, byte_no):
                # hi is not at a high boundary: split off the final portion
                # down to the boundary and recurse on the two halves.
                hi1 = hi &~ ((1 << (6 * (u8len_lo - byte_no))) - 1)
                utf8_sequence_generator([(lo, hi1-1)], byte_no, targetVar, cgo)
                utf8_sequence_generator([(hi1, hi)], byte_no, targetVar, cgo)
            else:
                # We have a full prefix group: any sequence of continuation
                # bytes (0x80-0xBF) completes a match.
                byteVar = "byte_range_%x_%x" % (lbyte, hbyte)
                cgo.chardef_canonical(CanonicalCharSetDef(byteVar, [(lbyte, hbyte)]))
                if byte_no > 1:
                    last_prefix = utf8_prefix_var(lo, byte_no - 1)
                    this_prefix = last_prefix + "_" + byteVar
                    cgo.add_canonical_assignment(this_prefix, cgo.expr2py(make_and(make_shift_forward(Var(last_prefix), 1), Var(byteVar))))
                else: this_prefix = byteVar
                suffixVar = "byte_range_%x_%x" % (0x80, 0xBF)
                cgo.chardef_canonical(CanonicalCharSetDef(suffixVar, [(0x80, 0xBF)]))
                last_prefix = this_prefix
                # Use a local position counter; the original advanced
                # byte_no itself, breaking later iterations of this loop.
                pos = byte_no
                while pos < utf8_length(lo):
                    pos += 1
                    this_prefix = last_prefix + "_sfx"
                    cgo.add_assignment(this_prefix, cgo.expr2py(make_and(make_shift_forward(Var(last_prefix), 1), Var(suffixVar))))
                    last_prefix = this_prefix
                cgo.add_assignment(targetVar, cgo.expr2py(make_or(Var(targetVar), Var(last_prefix))))
        else:
            # The byte is identical across the whole range: define it,
            # extend the prefix, and continue with the next byte position.
            byteVar = "byte_%x" % lbyte
            cgo.chardef_canonical(CanonicalCharSetDef(byteVar, [(lbyte, lbyte)]))
            if byte_no > 1:
                last_prefix = utf8_prefix_var(lo, byte_no - 1)
                this_prefix = utf8_prefix_var(lo, byte_no)
                cgo.add_canonical_assignment(this_prefix, cgo.expr2py(make_and(make_shift_forward(Var(last_prefix), 1), Var(byteVar))))
            if byte_no < utf8_length(lo): utf8_sequence_generator([rg], byte_no+1, targetVar, cgo)
351
352
353
354
355
def utf8_prefix_var(codepoint, prefix_bytes):
   """Return the canonical variable name for the first prefix_bytes bytes
   of the UTF-8 encoding of codepoint.  Raises for a zero-length prefix."""
   if prefix_bytes == 0:
      raise Exception ("utf8_prefix_var(%x, %i)" % (codepoint, prefix_bytes))
   if prefix_bytes == 1:
      return "byte_%x" % utf8_byte(codepoint, 1)
   # Multi-byte prefixes use the "sequence_<b1>_<b2>..." naming scheme.
   parts = ["sequence"]
   for position in range(1, prefix_bytes + 1):
      parts.append("%x" % utf8_byte(codepoint, position))
   return "_".join(parts)
363
364
def byte_definitions(range_list, n):
   """Project each codepoint range onto its n-th UTF-8 byte, returning
   one (lo_byte, hi_byte) pair per input range."""
   projected = []
   for (range_lo, range_hi) in range_list:
      projected.append((utf8_byte(range_lo, n), utf8_byte(range_hi, n)))
   return projected
370
371
372
373
374
Note: See TracBrowser for help on using the repository browser.