source: proto/charsetcompiler/unicode_category_compiler.py @ 4259

Last change on this file since 4259 was 4223, checked in by cameron, 5 years ago

More functions for utf8 lib

File size: 17.7 KB
Line 
1#
2# Prototype for computing utf8 character classes
3# Assuming byte-class/byte-range compilers exist.
4#
5# Robert D. Cameron, June 2, 2013
6#
7# Licensed under Open Software License 3.0.
8
9from utf8_lib import *
10from pablo_expr import *
11from CC_compiler import *
12from UTF_encoding import *
13from charset_def import *
14from UCD.general_category import *
15import optparse, sys
16
17
# Generate the simplest possible test for a Unicode codepoint range
# such that each 1 bit marks a position within a UTF-8 initial
# subsequence such that each legal continuation of that subsequence
# is within the range.  Return the generated variable.
#
# The test may be made up of up to three parts:
# (a) a multibyte low-boundary test,
# (b) a multibyte high-boundary test, and
# (c) a range test.
# It is possible that the low- and high- boundary tests have
# a common multibyte prefix.
def utf8_iftest_compiler(cgo, lo, hi):
   # Initial (first) UTF-8 byte of each endpoint of the codepoint range.
   lo_byte = utf8_byte(lo, 1)
   hi_byte = utf8_byte(hi, 1)
   if lo_byte == hi_byte:
      # Whole range shares one initial byte: a single sequence test suffices.
      targetVar = "cp_range_%x_%x" % (lo, hi)
      utf8_sequence_generator([(lo, hi)], 1, targetVar, cgo)
      return targetVar
   if lo > 0 and utf8_byte(lo - 1, 1) == lo_byte:
      # lo is not the least codepoint with its initial byte, so emit a
      # partial low-boundary test covering lo..lo1 (all sharing lo_byte).
      lo1 = max_codepoint_with_initial_byte(lo_byte)
      targetVar = "cp_range_%x_%x" % (lo, lo1)
      utf8_sequence_generator([(lo, lo1)], 1, targetVar, cgo)
      test_expr1 = Var(targetVar)
      # The full-initial-byte portion now begins at the next initial byte.
      lo_byte = utf8_byte(lo1 + 1, 1)
   else:
      test_expr1 = FalseLiteral()
      # 0x80 is the first non-ASCII codepoint; for the range test its
      # effective initial byte is the first two-byte prefix, 0xC0.
      if lo == 0x80: lo_byte = 0xC0
   if hi < 0x10FFFF and utf8_byte(hi + 1, 1) == hi_byte:
      # Symmetric partial high-boundary test covering hi1..hi.
      hi1 = min_codepoint_with_initial_byte(hi_byte)
      targetVar = "cp_range_%x_%x" % (hi1, hi)
      utf8_sequence_generator([(hi1, hi)], 1, targetVar, cgo)
      test_expr2 = Var(targetVar)
      hi_byte = utf8_byte(hi1 - 1, 1)
   else:
      test_expr2 = FalseLiteral()
      # Treat the maximum codepoint as extending the byte test up to 0xFF.
      if hi == 0x10FFFF: hi_byte = 0xFF
   # No full initial bytes remain between the boundary tests: combine just
   # the boundary tests (either may be FalseLiteral).
   if lo_byte > hi_byte: return cgo.expr_string_to_variable(cgo.expr2py(make_or(test_expr1, test_expr2)))
   # Part (c): the range test on the remaining full initial byte(s).
   if lo_byte == hi_byte:
      byteVar = "byte_%x" % lo_byte
   else:
      byteVar = "byte_range_%x_%x" % (lo_byte, hi_byte)
   cgo.chardef_canonical(CanonicalCharSetDef(byteVar, [(lo_byte, hi_byte)]))
   return cgo.expr_string_to_variable(cgo.expr2py(make_or(Var(byteVar), make_or(test_expr1, test_expr2))))
61
62
def generateCharClassDefsInIfHierarchy(cgo, enclosingRange, ifRangeList, charClassMap):
   # Recursively emit character-class definitions, nesting code for each
   # if-range of ifRangeList inside a guard test so that work for a range
   # is only performed when some codepoint of that range is present.
   #
   # cgo:            code generator object (CC_compiler instance)
   # enclosingRange: (lo, hi) codepoint span currently being generated
   # ifRangeList:    candidate guard ranges, restricted to this span below
   # charClassMap:   category name -> list of (lo, hi) codepoint ranges
   (outer_lo, outer_hi) = enclosingRange
   enclosedRanges = rangeIntersect(ifRangeList, outer_lo, outer_hi)
   missingRanges = rangeGaps(enclosedRanges, outer_lo, outer_hi)
   # Codepoints not covered by any guard range get unconditional definitions.
   for rg in missingRanges:
     (rglo, rghi) = rg
     generateCharClassSubDefs(cgo, rglo, rghi, charClassMap)
   # Split guard ranges into the outermost level and the ranges nested
   # within them; the nested ones are handled in the recursive calls.
   topRanges = outerRanges(enclosedRanges)
   inner = innerRanges(enclosedRanges)
   for rg in topRanges:
     (rglo, rghi) = rg
     # Skip guard ranges that intersect no requested character class.
     empty_range = True
     for k in charClassMap.keys():
        if rangeIntersect(charClassMap[k], rglo, rghi) != []:
           empty_range = False
           break
     if not empty_range:
       range_var = utf8_iftest_compiler(cgo, rglo, rghi)
       # Fresh inner compiler so its statements can be wrapped in an if.
       inner_cgo = CC_compiler(UTF8(), "r%x_%x" % (rglo, rghi) + '_tmp%i', False, '')
       inner_cgo.add_common_expressions(cgo)
       generateCharClassDefsInIfHierarchy(inner_cgo, rg, inner, charClassMap)
       if inner_cgo.generated_code != []:
         cgo.add_if_stmt(Var(range_var), inner_cgo.generated_code)
   return cgo.showcode()
88
def generateCharClassSubDefs(cgo, lo, hi, charClassMap):
   # Emit unconditional match code for every character class, restricted to
   # the codepoint span [lo, hi].  Relies on the module-level `options`
   # global set in main(); in grep mode all classes accumulate into a single
   # "all_chars" stream instead of per-category struct fields.
   for k in charClassMap.keys():
     if options.grep:
        targetVar = "all_chars"
     else:
        targetVar = "struct_%s.cc" % k
     subcc1 = rangeIntersect(charClassMap[k], lo, hi)
     # Divide by UTF-8 length, separating out E0, ED, F0 and F4 ranges
     # (note the 0xD800-0xDFFF surrogate gap between the 3-byte subranges).
     for byte_range in [(0, 0x7F), (0x80, 0x7FF), (0x800, 0xFFF), (0x1000, 0xD7FF), (0xE000, 0xFFFF), (0x10000, 0x3FFFF), (0x40000, 0xFFFFF), (0x100000, 0x10FFFF)]:
        (lo1, hi1) = byte_range
        subcc2 = rangeIntersect(subcc1, lo1, hi1)
        utf8_sequence_generator(subcc2, 1, targetVar, cgo)
101
def rangeIntersect(ccList, lo, hi):
    # Clip each (first, last) range in ccList to the window [lo, hi],
    # discarding ranges that fall entirely outside the window.
    clipped = []
    for first, last in ccList:
        if first <= hi and last >= lo:
            clipped.append((max(lo, first), min(hi, last)))
    return clipped
104
def rangeGaps(ccList, lo, hi):
    # Return the sub-ranges of [lo, hi] not covered by the sorted range
    # list ccList.  Mirrors the original recursive version, including its
    # quirk that a degenerate window (lo >= hi) yields no gaps.
    gaps = []
    cursor = lo
    for first, last in ccList:
        if cursor >= hi:
            return gaps
        if last < cursor:
            # Range lies entirely before the uncovered region; skip it.
            continue
        if first > cursor:
            gaps.append((cursor, first - 1))
        if last >= hi:
            return gaps
        cursor = last + 1
    if cursor < hi:
        gaps.append((cursor, hi))
    return gaps
113
def outerRanges(ccList):
    # Keep only the outermost ranges of a sorted list: any range whose high
    # end does not extend past the current outer range is nested and dropped.
    if not ccList:
        return []
    kept = [ccList[0]]
    for candidate in ccList[1:]:
        if candidate[1] > kept[-1][1]:
            kept.append(candidate)
    return kept
120
def innerRanges(ccList):
    # Complement of outerRanges: collect the ranges nested inside the
    # current outermost range, in their original order.
    nested = []
    if not ccList:
        return nested
    outer = ccList[0]
    for candidate in ccList[1:]:
        if candidate[1] <= outer[1]:
            nested.append(candidate)
        else:
            outer = candidate
    return nested
127
128
129
def generateCharClassDefs(ifRangeList, charClassMap):
   # Top-level generation entry: create a fresh compiler, initialize each
   # target stream to 0, then generate the full if-hierarchy over the whole
   # Unicode codepoint space.  Returns the generated code lines.
   # Relies on the module-level `options` global set in main().
   cgo = CC_compiler(UTF8(), 'tmp%i', False, '')
   for k in charClassMap.keys():
     if options.grep:
         cgo.add_assignment("all_chars", '0')
     else:
         cgo.add_assignment("struct_%s.cc" % k, '0')
   generateCharClassDefsInIfHierarchy(cgo, (0, 0x10FFFF), ifRangeList, charClassMap)
   return cgo.showcode()
139 
140
# Earlier, simpler alternatives kept for reference:
#defaultIfRangeList = [(0,0x7FF), (0, 0x7F), (0x80, 0x3FF), (0x400,0x7FF), (0x800, 0xFFFF), (0x10000, 0x10FFFF)]
#defaultIfRangeList = [(0x80,0x10FFFF), (0x80,0x7FF), (0x800,0xFFFF), (0x10000, 0x10FFFF)]

# Default hierarchy of candidate guard ranges for the generated if-structure.
# Overlapping entries nest: a range contained in an earlier range becomes an
# inner guard (see outerRanges/innerRanges).  The interleaved comments list
# the Unicode blocks each guard range covers.
defaultIfRangeList = [
#Non-ASCII
(0x80,0x10FFFF),
#Two-byte sequences
(0x80,0x7FF),
(0x100, 0x3FF),
#0100..017F; Latin Extended-A
#0180..024F; Latin Extended-B
#0250..02AF; IPA Extensions
#02B0..02FF; Spacing Modifier Letters
(0x100, 0x2FF), (0x100, 0x24F), (0x100, 0x17F), (0x180, 0x24F), (0x250, 0x2AF), (0x2B0, 0x2FF),
#0300..036F; Combining Diacritical Marks
#0370..03FF; Greek and Coptic
(0x300, 0x36F), (0x370, 0x3FF),
#0400..04FF; Cyrillic
#0500..052F; Cyrillic Supplement
#0530..058F; Armenian
#0590..05FF; Hebrew
#0600..06FF; Arabic
(0x400, 0x5FF), (0x400, 0x4FF), (0x500, 0x058F), (0x500, 0x52F), (0x530, 0x58F), (0x590, 0x5FF), (0x600, 0x6FF),
#0700..074F; Syriac
#0750..077F; Arabic Supplement
#0780..07BF; Thaana
#07C0..07FF; NKo
(0x700, 0x77F), (0x700, 0x74F), (0x750, 0x77F), (0x780, 0x7FF), (0x780, 0x7BF), (0x7C0, 0x7FF),
#Three-byte sequences
(0x800, 0xFFFF),
(0x800, 0x4DFF),
(0x800, 0x1FFF),
(0x800, 0x0FFF),
(0x1000, 0x1FFF),
#0800..083F; Samaritan
#0840..085F; Mandaic
#08A0..08FF; Arabic Extended-A
#0900..097F; Devanagari
#0980..09FF; Bengali
#0A00..0A7F; Gurmukhi
#0A80..0AFF; Gujarati
#0B00..0B7F; Oriya
#0B80..0BFF; Tamil
#0C00..0C7F; Telugu
#0C80..0CFF; Kannada
#0D00..0D7F; Malayalam
#0D80..0DFF; Sinhala
#0E00..0E7F; Thai
#0E80..0EFF; Lao
#0F00..0FFF; Tibetan
# NOTE(review): (0x1000, 0x1FFF) appears twice in this list (here and just
# above); the duplicate is harmless for guard generation but redundant.
(0x1000, 0x1FFF),
#1000..109F; Myanmar
#10A0..10FF; Georgian
#1100..11FF; Hangul Jamo
#1200..137F; Ethiopic
#1380..139F; Ethiopic Supplement
#13A0..13FF; Cherokee
#1400..167F; Unified Canadian Aboriginal Syllabics
#1680..169F; Ogham
#16A0..16FF; Runic
#1700..171F; Tagalog
#1720..173F; Hanunoo
#1740..175F; Buhid
#1760..177F; Tagbanwa
#1780..17FF; Khmer
#1800..18AF; Mongolian
#18B0..18FF; Unified Canadian Aboriginal Syllabics Extended
#1900..194F; Limbu
#1950..197F; Tai Le
#1980..19DF; New Tai Lue
#19E0..19FF; Khmer Symbols
#1A00..1A1F; Buginese
#1A20..1AAF; Tai Tham
#1AB0..1AFF; Combining Diacritical Marks Extended
#1B00..1B7F; Balinese
#1B80..1BBF; Sundanese
#1BC0..1BFF; Batak
#1C00..1C4F; Lepcha
#1C50..1C7F; Ol Chiki
#1CC0..1CCF; Sundanese Supplement
#1CD0..1CFF; Vedic Extensions
#1D00..1D7F; Phonetic Extensions
#1D80..1DBF; Phonetic Extensions Supplement
#1DC0..1DFF; Combining Diacritical Marks Supplement
#1E00..1EFF; Latin Extended Additional
#1F00..1FFF; Greek Extended
(0x2000, 0x4DFF),(0x2000, 0x2FFF),
(0x3000, 0x4DFF),
(0x4E00,0x9FFF),
#4E00..9FFF; CJK Unified Ideographs
(0xA000,0xFFFF),
#Four-byte sequences
(0x10000, 0x10FFFF)]
236
237
# Code templates for the emitted Python: a one-field class holding each
# category's character-class bit stream, the per-category function header,
# and a no-op Main used in the non-grep case.
Unicode_CC_struct = "class struct_%s:\n\tcc = 0\n\n"
Unicode_CC_header = "def %s(basis_bits, struct_%s):\n"
Unicode_dummy_main = "\n\ndef Main(basis_bits):\n    pass\n"
241
def generateDefs1(general_category):
  """Generate the complete source text (struct + header + class-definition
  code) for one Unicode general category.

  Uses the module-level `options` and `UnicodeCategoryMap` globals set in
  main().  In grep mode a fixed Basis_bits/Lex/Output scaffold and a Demo
  function are emitted instead of a per-category struct and function.
  """
  catmap = {}
  catmap[general_category] = UnicodeCategoryMap[general_category]
  # (Dead pre-assignments of struct/header removed: both branches below
  # assign them unconditionally.)
  if options.grep:
        struct = r"""
class Basis_bits():
        bit_0 = 0
        bit_1 = 0
        bit_2 = 0
        bit_3 = 0
        bit_4 = 0
        bit_5 = 0
        bit_6 = 0
        bit_7 = 0 
 
class Lex():
        LF = (0)
 
class Output():
        matches = 0

def ParseLines(basis_bits, lex):
        temp1 = (basis_bits.bit_0 | basis_bits.bit_1)
        temp2 = (basis_bits.bit_2 | basis_bits.bit_3)
        temp3 = (temp1 | temp2)
        temp4 = (basis_bits.bit_4 &~ basis_bits.bit_5)
        temp5 = (basis_bits.bit_6 &~ basis_bits.bit_7)
        temp6 = (temp4 & temp5)
        lex.LF = (temp6 &~ temp3)

"""
        header = "def Demo(basis_bits, lex, output):\n"
  else:
        struct = Unicode_CC_struct % (general_category)
        header = "def %s(basis_bits, struct_%s):\n" % (general_category, general_category)
  if options.flat:
      code = generateCharClassDefs([], catmap)
  elif options.simple:
      # BUG FIX: the four-byte range upper bound read 0x10FFF (a missing
      # final F), silently dropping codepoints 0x11000-0x10FFFF from the
      # --simple structure; the Unicode range ends at 0x10FFFF.
      code = generateCharClassDefs([(0x80, 0x7FF), (0x800,0xFFFF), (0x10000, 0x10FFFF)], catmap)
  else:
      code = generateCharClassDefs(defaultIfRangeList, catmap)
  if options.grep:
      code += r"""
        output.matches = 0
        all_matches = pablo.Advance(all_chars & ~lex.LF)
        if all_matches:
                # Find the last match on each line and filter out all others
                output.matches = pablo.MatchStar(all_matches, ~lex.LF) & lex.LF
"""
  return struct + header + "".join(code)
294
def generate_main():
  """Return the source of the generated program's Main entry point: the
  grep pipeline (ParseLines then Demo) in grep mode, otherwise a dummy."""
  grep_main = "\n\ndef Main(basis_bits, lex, output):\n    ParseLines(basis_bits, lex)\n    Demo(basis_bits, lex, output)\n"
  return grep_main if options.grep else Unicode_dummy_main
301
#
# Partition a list of ranges into a minimum set of
# UTF-8 prefix groups, where a group is
# (a) a range of codepoints with UTF-8 prefixes of the same length
#     such that every codepoint in the range is within the group, or
# (b) a sublist all having the same UTF-8 initial
#     byte
def partition_by_UTF8_group(range_list, byte_no):
    if range_list == []: return []
    (lo, hi) = range_list[0]
    u8len_lo = utf8_length(lo)
    u8len_hi = utf8_length(hi)
    if u8len_lo != u8len_hi:
        # Endpoints encode to different lengths: split at the length
        # boundary and re-partition.
        mid = max_codepoint_of_length(u8len_lo)
        return partition_by_UTF8_group([(lo, mid), (mid+1, hi)] + range_list[1:], byte_no)
    lobyte1 = utf8_byte(lo, byte_no)
    hibyte1 = utf8_byte(hi, byte_no)
    if lobyte1 != hibyte1:
        if not is_low_codepoint_after_byte(lo, byte_no):
            # lo's suffix bytes are not all-minimal: peel off the partial
            # group lo..lo1, where lo1 fills the remaining suffix with 1s.
            lo1 = lo | ((1 << (6 * (u8len_lo - byte_no))) - 1)
            #print "lo--lo1:  %x--%x" % (lo, lo1)
            return [[(lo, lo1)]] + partition_by_UTF8_group([(lo1+1, hi)] + range_list[1:], byte_no)
        elif not is_high_codepoint_after_byte(hi, byte_no):
            # Symmetric partial group at the high end: hi1 clears the
            # remaining suffix bits of hi.
            hi1 = hi &~ ((1 << (6 * (u8len_lo - byte_no))) - 1)
            #print "lo--hi-1:  %x--%x" % (lo, hi1-1)
            return [[(lo, hi1-1)]] + partition_by_UTF8_group([(hi1, hi)] + range_list[1:], byte_no)
        else:
            # we have a prefix group of type (a)
            return [[(lo, hi)]] + partition_by_UTF8_group(range_list[1:], byte_no)
    # Same initial byte at byte_no: merge with the following group when it
    # shares that byte, otherwise start a new group (type (b)).
    group1 = [(lo, hi)]
    subpartitions = partition_by_UTF8_group(range_list[1:], byte_no)
    if subpartitions == []: return [group1]
    elif utf8_byte(subpartitions[0][0][0], byte_no) == lobyte1:
        return [group1 + subpartitions[0]] + subpartitions[1:]
    else:
        return [group1] + subpartitions
338
# Ensure the sequence of preceding bytes is defined, up to, but
# not including the given byte_no
def ensure_preceding_prefix_defined(codepoint, byte_no, cgo):
   # For each byte position before byte_no, define the single-byte class
   # and, from position 2 on, the running prefix-match stream: the previous
   # prefix advanced one position and ANDed with the current byte class.
   for i in range(1, byte_no):
      byte_i = utf8_byte(codepoint, i)
      byteVar = "byte_%x" % byte_i
      cgo.chardef_canonical(CanonicalCharSetDef(byteVar, [(byte_i, byte_i)]))
      if i > 1:
         pfx1 = utf8_prefix_var(codepoint, i-1)
         pfx1_adv = pfx1 + "_adv"
         cgo.add_canonical_assignment(pfx1_adv, cgo.expr2py(make_shift_forward(Var(pfx1), 1)))
         pfx2 = utf8_prefix_var(codepoint, i)
         cgo.add_canonical_assignment(pfx2, cgo.expr2py(make_and(Var(pfx1_adv), Var(byteVar))))
352
353
#
# Generate remaining code to match UTF-8 code sequences within
# the codepoint set u8_partition, assuming that the code matching the
# sequences up to byte number byte_no have been generated.
#
def utf8_sequence_generator(u8_partition, byte_no, targetVar, cgo):
   if len(u8_partition) == 0: return
   (lo, hi) = u8_partition[0]
   if utf8_length(lo) == byte_no:
      # We have a single byte remaining to match for all codepoints
      # in this partition.  Use the byte class compiler to generate
      # matches for these codepoints.
      ensure_preceding_prefix_defined(lo, byte_no, cgo)
      byte_pair_list = byte_definitions(u8_partition, byte_no)
      #print byte_pair_list
      if len(byte_pair_list) == 1:
          (lobyte, hibyte) = byte_pair_list[0]
          if lo == hi:
              final_byte_var = "byte_%x" % lobyte
          else:
              final_byte_var = "byte_range_%x_%x" % (lobyte, hibyte)
          cgo.chardef_canonical(CanonicalCharSetDef(final_byte_var, byte_pair_list))
      else:
          # NOTE(review): this takes the LOW end ([0]) of the last range as
          # "hi" for the variable name; [-1][1] looks intended.  Affects
          # naming only, not the generated matches — confirm before changing.
          hi = u8_partition[-1][0]
          # targetVar[-2:] uses the last two characters of the target name
          # (e.g. the category code) to keep the name unique per class.
          final_byte_var = "%s_range_%x_%x_%i" % (targetVar[-2:], lo, hi, byte_no)
          cgo.chardef2py(CanonicalCharSetDef(final_byte_var, byte_pair_list))
      test_expr = Var(final_byte_var)
      # Combine with the advanced prefix stream for multi-byte sequences.
      if byte_no > 1: 
         pfx1 = utf8_prefix_var(lo, byte_no-1)
         pfx1_adv = pfx1 + "_adv"
         cgo.add_canonical_assignment(pfx1_adv, cgo.expr2py(make_shift_forward(Var(pfx1), 1)))
         test_expr = make_and(Var(pfx1_adv), test_expr)
      cgo.add_assignment(targetVar, cgo.expr2py(make_or(Var(targetVar), test_expr)))
   else:
     # More than one byte remains: partition by the current byte and recurse.
     partitions = partition_by_UTF8_group(u8_partition, byte_no)
     for p in partitions:
       (lo, hi) = p[0]
       lbyte = utf8_byte(lo, byte_no)
       hbyte = utf8_byte(hi, byte_no)
       ensure_preceding_prefix_defined(lo, byte_no, cgo)
       if lbyte == hbyte:
         # Type (b) group: one shared byte at this position; extend the
         # prefix stream and recurse into the next byte position.
         byteVar = "byte_%x" % lbyte
         cgo.chardef_canonical(CanonicalCharSetDef(byteVar, [(lbyte, lbyte)]))
         if byte_no > 1:
           last_prefix = utf8_prefix_var(lo, byte_no - 1)
           this_prefix = utf8_prefix_var(lo, byte_no)
           cgo.add_canonical_assignment(this_prefix, cgo.expr2py(make_and(make_shift_forward(Var(last_prefix), 1), Var(byteVar))))
         if byte_no < utf8_length(lo): utf8_sequence_generator(p, byte_no+1, targetVar, cgo)
       else:
         # Type (a) group: a byte range at this position where every
         # continuation byte (0x80-0xBF) is legal, so the remaining bytes
         # are matched with generic suffix tests.
         byteVar = "byte_range_%x_%x" % (lbyte, hbyte)
         cgo.chardef_canonical(CanonicalCharSetDef(byteVar, [(lbyte, hbyte)]))
         if byte_no > 1:
           last_prefix = utf8_prefix_var(lo, byte_no - 1)
           this_prefix = last_prefix + "_" + byteVar
           cgo.add_canonical_assignment(this_prefix, cgo.expr2py(make_and(make_shift_forward(Var(last_prefix), 1), Var(byteVar))))
         else: this_prefix = byteVar
         suffixVar = "byte_range_%x_%x" % (0x80, 0xBF)
         cgo.chardef_canonical(CanonicalCharSetDef(suffixVar, [(0x80, 0xBF)]))
         last_prefix = this_prefix
         while byte_no < utf8_length(lo):
           byte_no += 1
           this_prefix = last_prefix + "_sfx"
           cgo.add_assignment(this_prefix, cgo.expr2py(make_and(make_shift_forward(Var(last_prefix), 1), Var(suffixVar))))
           last_prefix = this_prefix
         cgo.add_assignment(targetVar, cgo.expr2py(make_or(Var(targetVar), Var(last_prefix))))
419
420
421
def utf8_prefix_var(codepoint, prefix_bytes):
   """Name of the stream variable that marks a match of the first
   prefix_bytes bytes of codepoint's UTF-8 encoding.  A zero-length
   prefix has no variable and raises."""
   if prefix_bytes == 0:
      raise Exception ("utf8_prefix_var(%x, %i)" % (codepoint, prefix_bytes))
   if prefix_bytes == 1:
      # Single-byte prefix: same name as the byte class itself.
      return "byte_%x" % utf8_byte(codepoint, 1)
   parts = ["sequence"]
   for position in range(1, prefix_bytes + 1):
      parts.append("%x" % utf8_byte(codepoint, position))
   return "_".join(parts)
429
430
def byte_definitions(range_list, n):
   """Project each (lo, hi) codepoint range of range_list onto its n-th
   UTF-8 byte, returning the corresponding list of byte pairs."""
   pairs = []
   for rg_lo, rg_hi in range_list:
      pairs.append((utf8_byte(rg_lo, n), utf8_byte(rg_hi, n)))
   return pairs
436
def main():
    """Command-line driver: parse options, generate the character-class
    code for the selected general category (or all categories with -c .),
    and write the result to the named output file or to stdout."""
    global options, UnicodeCategoryMap
    # Option definition
    option_parser = optparse.OptionParser(usage='python %prog [options] <output file>', version='0.1')

    option_parser.add_option('-c', '--category',
                             dest='category',
                             type='string',
                             default='Cc',
                             help='general category; default: Cc',
                             )
    option_parser.add_option('-g', '--grep',
                             dest='grep',
                             action='store_true',
                             default=False,
                             help='Use grep template',
                             )
    option_parser.add_option('-f', '--flat',
                             dest='flat',
                             action='store_true',
                             default=False,
                             help='Flatten the calculations into a single basic block',
                             )
    option_parser.add_option('-s', '--simple',
                             dest='simple',
                             action='store_true',
                             default=False,
                             help='Use a simple if-structure on UTF-8 length',
                             )
    options, args = option_parser.parse_args(sys.argv[1:])

    (catlen, UnicodeCategoryMap) = parse_general()

    code = ""
    if options.category == '.':
        # '.' selects every known general category.
        for k in UnicodeCategoryMap.keys():
            code += generateDefs1(k)
    else:
        if options.category not in UnicodeCategoryMap:
            # BUG FIX: the original had a bare `exit` expression, which is a
            # no-op, so generation continued and crashed with a KeyError.
            # Report the error and terminate with a nonzero status.
            # (print(...) with a single argument is valid in Python 2 and 3.)
            print("Unknown general category %s" % options.category)
            sys.exit(1)
        code = generateDefs1(options.category)

    code += generate_main()

    if len(args) == 1:
        # Context manager guarantees the file is closed even if write fails.
        with open(args[0], "w") as fh:
            fh.write(code)
    elif len(args) == 0:
        print(code)
    else:
        option_parser.print_usage()


if __name__ == "__main__": main()
500
501
502
503
504
505
506
Note: See TracBrowser for help on using the repository browser.