source: proto/charsetcompiler/if_hierarchy.py @ 4911

Last change on this file since 4911 was 4630, checked in by nmedfort, 4 years ago

Passing last used prefix instead of regenerating it implicitly by name.

#
# Prototype for computing UTF-8 character classes,
# assuming byte-class/byte-range compilers exist.
#
# Robert D. Cameron, June 2, 2013
#
# Licensed under Open Software License 3.0.

from utf8_lib import *
from CC_compiler import *
from UTF_encoding import *
from charset_def import *


# Generate the simplest possible test for a Unicode codepoint range,
# such that each 1 bit marks a position within a UTF-8 initial
# subsequence whose every legal continuation lies within the range.
# Return the generated variable.
#
# The test may be made up of up to three parts:
# (a) a multibyte low-boundary test,
# (b) a multibyte high-boundary test, and
# (c) a range test.
# It is possible that the low- and high-boundary tests have
# a common multibyte prefix.
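#
# For example (illustrative, not actual generated output): for the range
# 0x800..0xFFFF the test reduces to a single test on the first byte
# (byte_range_e0_ef), while for 0x80..0x10FFFF the low/high special cases
# below widen it to byte_range_c0_ff.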
def utf8_iftest_compiler(cgo, lo, hi):
    return utf8_iftest_helper(cgo, lo, hi, 1, TrueLiteral())


def utf8_iftest_helper(cgo, lo, hi, byte_no, marker):
    lo_byte = utf8_byte(lo, byte_no)
    hi_byte = utf8_byte(hi, byte_no)
    at_lo_boundary = lo == 0 or utf8_byte(lo - 1, byte_no) != lo_byte
    at_hi_boundary = hi == 0x10FFFF or utf8_byte(hi + 1, byte_no) != hi_byte
    if at_lo_boundary and at_hi_boundary:
        if lo_byte == hi_byte:
            byteVar = "byte_%x" % lo_byte
        else:
            if lo == 0x80: lo_byte = 0xC0
            if hi == 0x10FFFF: hi_byte = 0xFF
            byteVar = "byte_range_%x_%x" % (lo_byte, hi_byte)
        cgo.chardef_canonical(CanonicalCharSetDef(byteVar, [(lo_byte, hi_byte)]))
        return cgo.expr_string_to_variable(cgo.expr2py(make_and(marker, Var(byteVar))))
    elif lo_byte == hi_byte:
        byteVar = "byte_%x" % lo_byte
        cgo.chardef_canonical(CanonicalCharSetDef(byteVar, [(lo_byte, hi_byte)]))
        new_marker = make_shift_forward(make_and(marker, Var(byteVar)), 1)
        return utf8_iftest_helper(cgo, lo, hi, byte_no + 1, new_marker)
    elif not at_hi_boundary:
        mid = min_codepoint_with_common_bytes(hi, byte_no)
        e1 = utf8_iftest_helper(cgo, lo, mid - 1, byte_no, marker)
        e2 = utf8_iftest_helper(cgo, mid, hi, byte_no, marker)
        return cgo.expr_string_to_variable(cgo.expr2py(make_or(Var(e1), Var(e2))))
    else:  # if at_hi_boundary:
        mid = max_codepoint_with_common_bytes(lo, byte_no)
        e1 = utf8_iftest_helper(cgo, lo, mid, byte_no, marker)
        e2 = utf8_iftest_helper(cgo, mid + 1, hi, byte_no, marker)
        return cgo.expr_string_to_variable(cgo.expr2py(make_or(Var(e1), Var(e2))))

def min_codepoint_with_common_bytes(cp, byte_no):
    u8len = utf8_length(cp)
    mask = (1 << (u8len - byte_no) * 6) - 1
    lo_cp = cp & ~ mask
    if lo_cp == 0:
        return mask + 1
    else:
        return lo_cp


def max_codepoint_with_common_bytes(cp, byte_no):
    u8len = utf8_length(cp)
    mask = (1 << (u8len - byte_no) * 6) - 1
    return cp | mask
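
# Illustrative example: codepoints 0x3C0..0x3FF all share the UTF-8 prefix
# byte 0xCF, so min_codepoint_with_common_bytes(0x3FF, 1) == 0x3C0 and
# max_codepoint_with_common_bytes(0x3C0, 1) == 0x3FF.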


def generateCharClassDefsInIfHierarchy(cgo, ifRangeList, charClassMap, enclosingRange, template_var):
    #   inner_code = []
    (lo, hi) = enclosingRange
    enclosedRanges = rangeIntersect(ifRangeList, lo, hi)
    missingRanges = rangeGaps(enclosedRanges, lo, hi)
    # Codepoints in unenclosed ranges will be computed unconditionally.
    # Generate them first so that computed subexpressions may be shared
    # with calculations within the if hierarchy.
    for rg in missingRanges:
        (lo, hi) = rg
        generateCharClassSubDefs(cgo, lo, hi, charClassMap, template_var)
    if len(enclosedRanges) > 0:
        topRanges = outerRanges(enclosedRanges)
        inner = innerRanges(enclosedRanges)
        for rg in topRanges:
            (lo, hi) = rg
            empty = True
            for k in charClassMap.keys():
                if len(rangeIntersect(charClassMap[k], lo, hi)) > 0:
                    empty = False
                    break
            if not empty:
                inner_cgo = CC_compiler(UTF8(), "r%x_%x" % (lo, hi) + '_tmp%i', False, '')
                inner_cgo.add_common_expressions(cgo)
                generateCharClassDefsInIfHierarchy(inner_cgo, inner, charClassMap, rg, template_var)
                if len(inner_cgo.generated_code) > 0:
                    cgo.add_if_stmt(Var(utf8_iftest_compiler(cgo, lo, hi)), inner_cgo.generated_code)
    return cgo.showcode()
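
# Illustrative example (not generated output): with
# ifRangeList = [(0x80, 0x7FF), (0x800, 0xFFFF)] and a character class
# confined to 0x400..0x4FF, the 0x00..0x7F gap is handled unconditionally
# (outside any if), the class logic is nested inside an if-test for
# 0x80..0x7FF, and the empty 0x800..0xFFFF branch is skipped entirely.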

def generateCharClassSubDefs(cgo, lo, hi, charClassMap, template_var):
    for k in charClassMap.keys():
        targetVar = template_var % k
        subcc1 = rangeIntersect(charClassMap[k], lo, hi)
        # Divide by UTF-8 length, separating out the E0 (0x800-0xFFF), ED (0xD800-0xDFFF),
        # F0 (0x10000-0x3FFFF) and F4 (0x100000-0x10FFFF) lead-byte ranges, whose
        # second bytes are restricted.
        for byte_range in [(0, 0x7F), (0x80, 0x7FF), (0x800, 0xFFF), (0x1000, 0xD7FF), (0xD800, 0xDFFF),
                           (0xE000, 0xFFFF), (0x10000, 0x3FFFF), (0x40000, 0xFFFFF), (0x100000, 0x10FFFF)]:
            (lo1, hi1) = byte_range
            subcc2 = rangeIntersect(subcc1, lo1, hi1)
            utf8_sequence_generator(cgo, 1, targetVar, subcc2)


def rangeIntersect(range, lo, hi):
    return [(max(lo, p[0]), min(hi, p[1])) for p in range if p[0] <= hi and p[1] >= lo]
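
# For example, rangeIntersect([(0x30, 0x50), (0x60, 0x200)], 0x40, 0x100)
# yields [(0x40, 0x50), (0x60, 0x100)].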


def rangeGaps(range, lo, hi):
    gaps = []
    if lo < hi:
        if len(range) == 0:
            gaps.append((lo, hi))
        else:
            for item in range:
                (lo1, hi1) = item
                if hi1 < lo:
                    continue
                elif lo1 > lo:
                    gaps.append((lo, lo1 - 1))
                elif hi1 >= hi:
                    continue
                lo = hi1 + 1
    return gaps
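
# For example, rangeGaps([(0x20, 0x2F), (0x41, 0x7F)], 0x0, 0x7F)
# yields [(0x0, 0x1F), (0x30, 0x40)].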

def outerRanges(ccList):
    ranges = []
    if (len(ccList) > 0):
        i = 0
        for j in range(1, len(ccList)):
            (lo1, hi1) = ccList[i]
            (lo2, hi2) = ccList[j]
            if hi2 > hi1:
                ranges.append(ccList[i])
                i = j
        if i < len(ccList):
            ranges.append(ccList[i])
    return ranges

def innerRanges(ccList):
    ranges = []
    if (len(ccList) > 0):
        i = 0
        for j in range(1, len(ccList)):
            (lo1, hi1) = ccList[i]
            (lo2, hi2) = ccList[j]
            if hi2 <= hi1:
                ranges.append(ccList[j])
            else:
                i = j
    return ranges
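
# For example, with ccList = [(0x80, 0x7FF), (0x100, 0x3FF), (0x800, 0xFFFF)],
# outerRanges returns [(0x80, 0x7FF), (0x800, 0xFFFF)] and
# innerRanges returns [(0x100, 0x3FF)].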

def generateCharClassDefs(ifRangeList, charClassMap, template_var):
    cgo = CC_compiler(UTF8(), 'tmp%i', False, '')
    for k in charClassMap.keys():
        print template_var % k
        cgo.add_assignment(template_var % k, '0')
    generateCharClassDefsInIfHierarchy(cgo, ifRangeList, charClassMap, (0, 0x10FFFF), template_var)
    return cgo.showcode()
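
# Illustrative usage (hypothetical class map and variable template):
#   generateCharClassDefs(defaultIfRangeList, {'greek': [(0x370, 0x3FF)]}, 'cc_%s')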

# defaultIfRangeList = [(0,0x7FF), (0, 0x7F), (0x80, 0x3FF), (0x400,0x7FF), (0x800, 0xFFFF), (0x10000, 0x10FFFF)]

# defaultIfRangeList = [(0x80,0x10FFFF), (0x80,0x7FF), (0x800,0xFFFF), (0x10000, 0x10FFFF)]


defaultIfRangeList = [
    # Non-ASCII
    (0x80, 0x10FFFF),
    # Two-byte sequences
    (0x80, 0x7FF),
    (0x100, 0x3FF),
    # 0100..017F; Latin Extended-A
    # 0180..024F; Latin Extended-B
    # 0250..02AF; IPA Extensions
    # 02B0..02FF; Spacing Modifier Letters
    (0x100, 0x2FF), (0x100, 0x24F), (0x100, 0x17F), (0x180, 0x24F), (0x250, 0x2AF), (0x2B0, 0x2FF),
    # 0300..036F; Combining Diacritical Marks
    # 0370..03FF; Greek and Coptic
    (0x300, 0x36F), (0x370, 0x3FF),
    # 0400..04FF; Cyrillic
    # 0500..052F; Cyrillic Supplement
    # 0530..058F; Armenian
    # 0590..05FF; Hebrew
    # 0600..06FF; Arabic
    (0x400, 0x5FF), (0x400, 0x4FF), (0x500, 0x058F), (0x500, 0x52F), (0x530, 0x58F), (0x590, 0x5FF), (0x600, 0x6FF),
    # 0700..074F; Syriac
    # 0750..077F; Arabic Supplement
    # 0780..07BF; Thaana
    # 07C0..07FF; NKo
    (0x700, 0x77F), (0x700, 0x74F), (0x750, 0x77F), (0x780, 0x7FF), (0x780, 0x7BF), (0x7C0, 0x7FF),
    # Three-byte sequences
    (0x800, 0xFFFF),
    (0x800, 0x4DFF),
    (0x800, 0x1FFF),
    (0x800, 0x0FFF),
    # 0800..083F; Samaritan
    # 0840..085F; Mandaic
    # 08A0..08FF; Arabic Extended-A
    # 0900..097F; Devanagari
    # 0980..09FF; Bengali
    # 0A00..0A7F; Gurmukhi
    # 0A80..0AFF; Gujarati
    # 0B00..0B7F; Oriya
    # 0B80..0BFF; Tamil
    # 0C00..0C7F; Telugu
    # 0C80..0CFF; Kannada
    # 0D00..0D7F; Malayalam
    # 0D80..0DFF; Sinhala
    # 0E00..0E7F; Thai
    # 0E80..0EFF; Lao
    # 0F00..0FFF; Tibetan
    (0x1000, 0x1FFF),
    # 1000..109F; Myanmar
    # 10A0..10FF; Georgian
    # 1100..11FF; Hangul Jamo
    # 1200..137F; Ethiopic
    # 1380..139F; Ethiopic Supplement
    # 13A0..13FF; Cherokee
    # 1400..167F; Unified Canadian Aboriginal Syllabics
    # 1680..169F; Ogham
    # 16A0..16FF; Runic
    # 1700..171F; Tagalog
    # 1720..173F; Hanunoo
    # 1740..175F; Buhid
    # 1760..177F; Tagbanwa
    # 1780..17FF; Khmer
    # 1800..18AF; Mongolian
    # 18B0..18FF; Unified Canadian Aboriginal Syllabics Extended
    # 1900..194F; Limbu
    # 1950..197F; Tai Le
    # 1980..19DF; New Tai Lue
    # 19E0..19FF; Khmer Symbols
    # 1A00..1A1F; Buginese
    # 1A20..1AAF; Tai Tham
    # 1AB0..1AFF; Combining Diacritical Marks Extended
    # 1B00..1B7F; Balinese
    # 1B80..1BBF; Sundanese
    # 1BC0..1BFF; Batak
    # 1C00..1C4F; Lepcha
    # 1C50..1C7F; Ol Chiki
    # 1CC0..1CCF; Sundanese Supplement
    # 1CD0..1CFF; Vedic Extensions
    # 1D00..1D7F; Phonetic Extensions
    # 1D80..1DBF; Phonetic Extensions Supplement
    # 1DC0..1DFF; Combining Diacritical Marks Supplement
    # 1E00..1EFF; Latin Extended Additional
    # 1F00..1FFF; Greek Extended
    (0x2000, 0x4DFF), (0x2000, 0x2FFF),
    (0x3000, 0x4DFF),
    (0x4E00, 0x9FFF),
    # 4E00..9FFF; CJK Unified Ideographs
    (0xA000, 0xFFFF),

    (0x10000, 0x10FFFF)]

# Ensure the sequence of preceding bytes is defined, up to but
# not including the given byte_no.
def make_prefix(cgo, codepoint, byte_no, prefix):
    for i in range(1, byte_no):
        byte_i = utf8_byte(codepoint, i)
        var = "byte_%x" % byte_i
        cgo.chardef_canonical(CanonicalCharSetDef(prefix, [(byte_i, byte_i)]))
        if i > 1:
            adv_prefix = prefix + "_adv"
            cgo.add_canonical_assignment(adv_prefix, cgo.expr2py(make_shift_forward(Var(prefix), 1)))
            next_prefix = utf8_prefix_var(codepoint, i)
            cgo.add_canonical_assignment(next_prefix, cgo.expr2py(make_and(Var(adv_prefix), Var(var))))
            var = next_prefix
        prefix = var
    return prefix

#
# Generate the remaining code to match UTF-8 code sequences within
# the codepoint set cc, assuming that the code matching the
# sequences up to byte number byte_no has already been generated.
#
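# For example (illustrative): compiling the two-byte class [(0x370, 0x3FF)]
# splits it into 0x370..0x37F (lead byte CD, suffix B0..BF) and
# 0x380..0x3FF (lead bytes CE..CF, full suffix 80..BF); each piece is
# matched by shifting the lead-byte stream forward one position, ANDing
# it with the suffix byte class, and ORing the result into the target.
#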
def utf8_sequence_generator(cgo, byte_no, target, cc, prefix = None):
    if len(cc) == 0:
        return
    (lo, hi) = cc[0]
    u8len_min = utf8_length(lo)
    u8len_max = utf8_length(cc[-1][1])

    # print " -- ", byte_no, "  ", cc[0], "    ", (u8len_min, u8len_max)

    assert(u8len_min == u8len_max)

    if u8len_min != u8len_max:

        mid = max_codepoint_of_length(u8len_min)
        utf8_sequence_generator(cgo, byte_no, target, rangeIntersect(cc, lo, mid), prefix)
        utf8_sequence_generator(cgo, byte_no, target, rangeIntersect(cc, mid + 1, hi), prefix)

    elif u8len_min == byte_no:
        # We have a single byte remaining to match for all code points
        # in this cc.  Use the byte class compiler to generate matches
        # for these code points.

        byte_pairs = byte_definitions(cc, byte_no)
        if len(byte_pairs) == 1:
            (lobyte, hibyte) = byte_pairs[0]
            if lo == hi:
                final_byte_var = "byte_%x" % lobyte
            else:
                final_byte_var = "byte_range_%x_%x" % (lobyte, hibyte)
            cgo.chardef_canonical(CanonicalCharSetDef(final_byte_var, byte_pairs))
        else:
            hi = cc[-1][1]
            final_byte_var = "cp_range_%x_%x_%i" % (lo, hi, byte_no)
            cgo.chardef2py(CanonicalCharSetDef(final_byte_var, byte_pairs))
        test_expr = Var(final_byte_var)
        if byte_no > 1:
            pfx1 = make_prefix(cgo, lo, byte_no, prefix)
            pfx1_adv = pfx1 + "_adv"
            cgo.add_canonical_assignment(pfx1_adv, cgo.expr2py(make_shift_forward(Var(pfx1), 1)))
            test_expr = make_and(Var(pfx1_adv), test_expr)
        cgo.add_assignment(target, cgo.expr2py(make_or(Var(target), test_expr)))

    else:

        for rg in cc:
            (lo, hi) = rg
            lbyte = utf8_byte(lo, byte_no)
            hbyte = utf8_byte(hi, byte_no)

            if lbyte != hbyte:
                if not is_low_codepoint_after_byte(lo, byte_no):
                    mid = lo | ((1 << (6 * (u8len_min - byte_no))) - 1)
                    utf8_sequence_generator(cgo, byte_no, target, [(lo, mid)], prefix)
                    utf8_sequence_generator(cgo, byte_no, target, [(mid + 1, hi)], prefix)
                elif not is_high_codepoint_after_byte(hi, byte_no):
                    mid = hi & ~ ((1 << (6 * (u8len_min - byte_no))) - 1)
                    utf8_sequence_generator(cgo, byte_no, target, [(lo, mid - 1)], prefix)
                    utf8_sequence_generator(cgo, byte_no, target, [(mid, hi)], prefix)
                else:
                    # we have a prefix group of type (a)
                    var = "byte_range_%x_%x" % (lbyte, hbyte)
                    cgo.chardef_canonical(CanonicalCharSetDef(var, [(lbyte, hbyte)]))
                    if byte_no > 1:
                        last_prefix = prefix
                        assert(last_prefix == utf8_prefix_var(lo, byte_no - 1))
                        last_prefix_adv = last_prefix + "_adv"
                        this_prefix = last_prefix + "_" + var
                        cgo.add_canonical_assignment(last_prefix_adv, cgo.expr2py(make_shift_forward(Var(last_prefix), 1)))
                        cgo.add_canonical_assignment(this_prefix, cgo.expr2py(make_and(Var(last_prefix_adv), Var(var))))
                    else:
                        this_prefix = var
                    suffixVar = "byte_range_%x_%x" % (0x80, 0xBF)
                    cgo.chardef_canonical(CanonicalCharSetDef(suffixVar, [(0x80, 0xBF)]))
                    last_prefix = this_prefix
                    while byte_no < utf8_length(lo):
                        byte_no += 1
                        last_prefix_adv = last_prefix + "_adv"
                        cgo.add_canonical_assignment(last_prefix_adv, cgo.expr2py(make_shift_forward(Var(last_prefix), 1)))
                        this_prefix = utf8_prefix_var(lo, byte_no)
                        cgo.add_assignment(this_prefix, cgo.expr2py(make_and(Var(last_prefix_adv), Var(suffixVar))))
                        last_prefix = this_prefix
                    cgo.add_assignment(target, cgo.expr2py(make_or(Var(target), Var(last_prefix))))

            else:  # lbyte == hbyte
                var = "byte_%x" % lbyte
                cgo.chardef_canonical(CanonicalCharSetDef(var, [(lbyte, lbyte)]))
                if byte_no > 1:
                    last_prefix = var if prefix == None else prefix
                    assert(last_prefix == utf8_prefix_var(lo, byte_no - 1))
                    last_prefix_adv = last_prefix + "_adv"
                    cgo.add_canonical_assignment(last_prefix_adv, cgo.expr2py(make_shift_forward(Var(last_prefix), 1)))
                    this_prefix = utf8_prefix_var(lo, byte_no)
                    cgo.add_canonical_assignment(this_prefix, cgo.expr2py(make_and(Var(last_prefix_adv), Var(var))))
                    var = this_prefix
                if byte_no < utf8_length(lo):
                    utf8_sequence_generator(cgo, byte_no + 1, target, [rg], var)


def utf8_prefix_var(codepoint, prefix_bytes):
    if prefix_bytes == 0:
        raise Exception("utf8_prefix_var(%x, %i)" % (codepoint, prefix_bytes))
    elif prefix_bytes == 1:
        return "byte_%x" % utf8_byte(codepoint, 1)
    else:
        return "_".join(["sequence"] + ["%x" % utf8_byte(codepoint, n + 1) for n in range(prefix_bytes)])
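
# For example, utf8_prefix_var(0x20AC, 1) == "byte_e2" and
# utf8_prefix_var(0x20AC, 2) == "sequence_e2_82".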

def byte_definitions(range_list, n):
    return [(utf8_byte(rg[0], n), utf8_byte(rg[1], n)) for rg in range_list]
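
# For example, byte_definitions([(0x20AC, 0x20AF)], 3) == [(0xAC, 0xAF)].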