source: proto/charsetcompiler/unicode_category_compiler.py @ 3979

Last change on this file since 3979 was 3979, checked in by cameron, 5 years ago

Small fix

File size: 16.6 KB
Line 
1#
2# Prototype for computing utf8 character classes
3# Assuming byte-class/byte-range compilers exist.
4#
5# Robert D. Cameron, June 2, 2013
6#
7# Licensed under Open Software License 3.0.
8
9from pablo_expr import *
10from CC_compiler import *
11from UTF_encoding import *
12from charset_def import *
13import optparse, sys
14
15#
16# Definitions for debugging/prototyping
17def make_shift_forward(e, n): return Adv(e, n)
18def make_if(p, s): return ["if %s\n" %p] + ["  " + x for x in s] + ["endif %s\n" %p]
19#
20def utf8_length(codepoint):
21   if codepoint <= 0x7F: return 1
22   elif codepoint <= 0x7FF: return 2
23   elif codepoint <= 0xFFFF: return 3
24   else: return 4
25
26def utf8_byte(codepoint, n):
27   len = utf8_length(codepoint)
28   if n == 1:
29     if len == 1: return codepoint
30     elif len == 2: return 0xC0 + (codepoint >> 6) 
31     elif len == 3: return 0xE0 + (codepoint >> 12) 
32     elif len == 4: return 0xF0 + (codepoint >> 18) 
33   else:
34     bits = (codepoint >> (6 * (len - n))) & 0x3F
35     return 0x80 + bits
36
37def max_codepoint_of_length(n):
38   if n == 1: return 0x7F
39   elif n == 2: return 0x7FF
40   elif n == 3: return 0xFFFF
41   else: return 0x10FFFF
42
43#
44# Given two codepoints lo, hi: return the number of
45# leading UTF-8 bytes that their respective UTF-8
46# representations have in common.
47def common_utf8_leading_bytes(lo, hi):
48   u8len_lo = utf8_length(lo)
49   u8len_hi = utf8_length(hi)
50   if u8len_lo != u8len_hi: return 0
51   remaining = u8len_lo
52   while remaining > 0:
53     if lo == hi: return remaining
54     lo >>= 6
55     hi >>= 6
56     remaining -= 1
57   return 0
58
59def utf8_range_compiler(cgo, lo, hi, targetVar):
60   lo_len = utf8_length(lo)
61   hi_len = utf8_length(hi)
62   # If different length code unit sequences are involved, make
63   # a union of equilength subranges.
64   if hi_len > lo_len:
65     m = max_codepoint_of_length(hi_len - 1)
66     targetV_lo = "%s_%i" % (targetVar, lo_len)   
67     targetV_hi = "%s_%i" % (targetVar, hi_len)
68     utf8_range_compiler(cgo, lo, m, targetV_lo)
69     utf8_range_compiler(cgo, m+1, hi, targetV_hi)
70     cgo.add_assignment(targetVar, cgo.expr2py(make_or(Var(targetV_lo), Var(targetV_hi))))
71   #
72   else:
73     matched_sequence_compiler(cgo, lo, hi, 1, hi_len, targetVar)
74
75
76def matched_sequence_compiler(cgo, lo, hi, hlen):
77   return matched_sequence_helper(cgo, lo, hi, TrueLiteral(), 1, hlen)
78
79def matched_sequence_helper(cgo, lo, hi, prefix, n, hlen):
80   """ Helper function to generate the code necessary to match bytes
81       n through hlen (1-based indexing) of the range of utf-8 sequences
82       for codepoints lo through hi. """
83   hbyte = utf8_byte(hi, n)
84   lbyte = utf8_byte(lo, n)
85   if n == hlen:
86     targetVar = "bytetest_%x_%x" % (lbyte, hbyte)
87     cgo.chardef2py(CanonicalCharSetDef(targetVar, [(lbyte, hbyte)]))
88     if n == 1: return targetVar
89     return cgo.expr_string_to_variable(cgo.expr2py(make_and(make_shift_forward(prefix, 1), Var(targetVar))))
90   #
91   # One or more bytes of the lower and upper bound may be the same.
92   # Build a sequence of byte tests.
93   if hbyte == lbyte:
94     targetVar = "bytetest_%x" % (lbyte)
95     cgo.chardef2py(CanonicalCharSetDef(targetVar, [(lbyte, hbyte)]))
96     return matched_sequence_helper(cgo, lo, hi, make_and(make_shift_forward(prefix, 1), Var(targetVar)), n+1, hlen)
97   # We now have a range involving different bytes at position n.
98   following_suffix_mask = (1 << ((hlen - n) * 6)) - 1
99   # A separate test may be needed for the high byte sequence if
100   # there are constraints on following suffix bytes.
101   if hi & following_suffix_mask != following_suffix_mask:
102     hi_floor = hi &~following_suffix_mask     
103     hiVar = matched_sequence_helper(cgo, hi_floor, hi, prefix, n, hlen)
104     loVar = matched_sequence_helper(cgo, lo, hi_floor - 1, prefix, n, hlen)
105     return cgo.expr_string_to_variable(cgo.expr2py(make_or(Var(loVar), Var(hiVar))))
106   # A separate test may be needed for the low byte sequence if
107   # there are constraints on following suffix bytes.
108   if lo & following_suffix_mask != 0:
109     low_ceil = lo | following_suffix_mask
110     hiVar = matched_sequence_helper(cgo, low_ceil + 1, hi, prefix, n, hlen)
111     loVar = matched_sequence_helper(cgo, lo, low_ceil, prefix, n, hlen)
112     return cgo.expr_string_to_variable(cgo.expr2py(make_or(Var(loVar), Var(hiVar))))
113   #
114   # Now we have a range that permits all suffix combinations.
115   # We don't have to test suffixes UNDER THE ASSUMPTION that the utf-8
116   # has been validated.
117   targetVar = "bytetest_%x_%x" % (lbyte, hbyte)
118   cgo.chardef2py(CanonicalCharSetDef(targetVar, [(lbyte, hbyte)]))
119   if n == 1: return targetVar
120   return matched_sequence_helper(cgo, lo, hi, make_and(make_shift_forward(prefix, 1), Var(targetVar)), n+1, hlen)
121
122
123def matched_ifsequence_compiler(cgo, lo, hi, hlen):
124   return matched_ifsequence_helper(cgo, lo, hi, TrueLiteral(), 1, hlen)
125
126def matched_ifsequence_helper(cgo, lo, hi, prefix, n, hlen):
127   """ Helper function to generate the code necessary to match bytes
128       n through hlen (1-based indexing) of the range of utf-8 sequences
129       for codepoints lo through hi. """
130   hbyte = utf8_byte(hi, n)
131   lbyte = utf8_byte(lo, n)
132   if n == hlen:
133     targetVar = "bytetest_%x_%x" % (lbyte, hbyte)
134     cgo.chardef2py(CanonicalCharSetDef(targetVar, [(lbyte, hbyte)]))
135     if n == 1: return targetVar
136     else: return cgo.expr_string_to_variable(cgo.expr2py(make_and(make_shift_forward(prefix, 1), Var(targetVar))))
137   #
138   # One or more bytes of the lower and upper bound may be the same.
139   # Build a sequence of byte tests.
140   if hbyte == lbyte:
141     targetVar = "bytetest_%x" % (lbyte)
142     cgo.chardef2py(CanonicalCharSetDef(targetVar, [(lbyte, hbyte)]))
143     return matched_ifsequence_helper(cgo, lo, hi, make_and(make_shift_forward(prefix, 1), Var(targetVar)), n+1, hlen)
144   # We now have a range involving different bytes at position n.
145   following_suffix_mask = (1 << ((hlen - n) * 6)) - 1
146   # A separate test may be needed for the high byte sequence if
147   # there are constraints on following suffix bytes.
148   if hi & following_suffix_mask != following_suffix_mask:
149     hi_floor = hi &~following_suffix_mask     
150     hiVar = matched_ifsequence_helper(cgo, hi_floor, hi, prefix, n, hlen)
151     loVar = matched_ifsequence_helper(cgo, lo, hi_floor - 1, prefix, n, hlen)
152     return cgo.expr_string_to_variable(cgo.expr2py(make_or(Var(loVar), Var(hiVar))))
153   # A separate test may be needed for the low byte sequence if
154   # there are constraints on following suffix bytes.
155   if lo & following_suffix_mask != 0:
156     low_ceil = lo | following_suffix_mask
157     hiVar = matched_ifsequence_helper(cgo, low_ceil + 1, hi, prefix, n, hlen)
158     loVar = matched_ifsequence_helper(cgo, lo, low_ceil, prefix, n, hlen)
159     return cgo.expr_string_to_variable(cgo.expr2py(make_or(Var(loVar), Var(hiVar))))
160   #
161   # Now we have a range that permits all suffix combinations.
162   # We don't have to test suffixes UNDER THE ASSUMPTION that the utf-8
163   # has been validated.
164   targetVar = "bytetest_%x_%x" % (lbyte, hbyte)
165   cgo.chardef2py(CanonicalCharSetDef(targetVar, [(lbyte, hbyte)]))
166   if n == 1: return targetVar
167   return cgo.expr_string_to_variable(cgo.expr2py(make_and(make_shift_forward(prefix, 1), Var(targetVar))))
168
169
170# Generate a simplest possible test for a Unicode codepoint range
171# such that each 1 bit marks a position within a UTF-8 initial
172# subsequence such that each legal continuation of that subsequence
173# is within the range.  Return the generated variable.
174def utf8_ifrange_compiler(cgo, lo, hi):
175   lo_len = utf8_length(lo)
176   hi_len = utf8_length(hi)
177   # If different length code unit sequences are involved, make
178   # a union of equilength subranges.
179   if hi_len > lo_len:
180     m = max_codepoint_of_length(hi_len - 1)
181     v_lo = utf8_ifrange_compiler(cgo, lo, m)
182     v_hi = utf8_ifrange_compiler(cgo, m+1, hi)
183     return cgo.expr_string_to_variable(cgo.expr2py(make_or(Var(v_lo), Var(v_hi))))
184   #
185   else:
186     return matched_ifsequence_compiler(cgo, lo, hi, hi_len)
187
188def generate_utf8_leading_bytes_test(codepoint, bytecount, targetVar):
189  if bytecount == 0: return [make_assign(targetVar, "1")]
190  byte1 = utf8_byte(codepoint, 1)
191  stmts = [make_assign(targetVar, ByteClassCompiler(byte1))]
192  byteno = 1
193  while byteno < bytecount:
194    byteno += 1
195    sfx_byte = utf8_byte(codepoint, byteno)
196    stmts.append(make_assign(targetVar, make_and(make_shift_forward(targetVar, 1), ByteClassCompiler(sfx_byte))))
197  return stmts
198
199def generate_utf8_intermediate_bytes_test(codepoint, startbyte, endbyte, targetVar):
200  if startbyte == 1: return generate_utf8_leading_bytes_test(codepoint, endbyte, targetVar)
201  byteno = startbyte
202  while byteno < endbyte:
203    byteno += 1
204    sfx_byte = utf8_byte(codepoint, byteno)
205    stmts.append(make_assign(targetVar, make_and(make_shift_forward(targetVar, 1), ByteClassCompiler(sfx_byte))))
206  return stmts
207
208import re
209
210Unicode_point_regexp = re.compile("^([0-9A-F]{4,6})\s+;\s+([A-Z][a-z0-9])\s+#")
211Unicode_range_regexp = re.compile("^([0-9A-F]{4,6})[.][.]([0-9A-F]{4,6})\s+;\s+([A-Z][a-z0-9])\s+#")
212
213
214def parse_general():
215  category_size = {}
216  category_def = {}
217  f = open("DerivedGeneralCategory.txt")
218  lines = f.readlines()
219  for t in lines:
220    m = Unicode_point_regexp.match(t)
221    if m:
222      point = m.group(1)
223      category = m.group(2)
224      if not category_size.has_key(category):
225        category_size[category] = 0
226        category_def[category] = []
227      pval = int(point, 16)
228      category_def[category].append((pval, pval))
229      category_size[category] += 1     
230    m = Unicode_range_regexp.match(t)
231    if m:
232      point1 = m.group(1)
233      point2 = m.group(2)
234      category = m.group(3)
235      if not category_size.has_key(category):
236        category_size[category] = 0
237        category_def[category] = []
238      pval1 = int(point1, 16)
239      pval2 = int(point2, 16)
240      category_def[category].append((pval1, pval2))
241      category_size[category] += 1
242  return (category_size, category_def)
243  f.close()
244
245
246
247def generateCharClassDefsInIfHierarchy(cgo, enclosingRange, ifRangeList, charClassMap):
248#   inner_code = []
249   (outer_lo, outer_hi) = enclosingRange
250   enclosedRanges = rangeIntersect(ifRangeList, outer_lo, outer_hi)
251   missingRanges = rangeGaps(enclosedRanges, outer_lo, outer_hi)
252   for rg in missingRanges:
253     (rglo, rghi) = rg
254     generateCharClassSubDefs(cgo, rglo, rghi, charClassMap)
255   topRanges = outerRanges(enclosedRanges)
256   inner = innerRanges(enclosedRanges)
257   for rg in topRanges:
258     (rglo, rghi) = rg
259     inner_cgo = CC_compiler(UTF8(), "r%x_%x" % (rglo, rghi) + '_tmp%i', False, '')
260     inner_cgo.add_common_expressions(cgo)
261     generateCharClassDefsInIfHierarchy(inner_cgo, rg, inner, charClassMap)
262     if inner_cgo.generated_code != []:
263        range_var = utf8_ifrange_compiler(cgo, rglo, rghi)
264        cgo.add_if_stmt(Var(range_var), inner_cgo.generated_code)
265   return cgo.showcode()
266
267def generateCharClassSubDefs(cgo, lo, hi, charClassMap):
268   for k in charClassMap.keys():
269     subcc1 = rangeIntersect(charClassMap[k], lo, hi)
270     # Divide by UTF-8 length
271     for byte_range in [(0, 0x7F), (0x80, 0x7FF), (0x800, 0xFFFF), (0x10000, 0x10FFFF)]:
272        (lo1, hi1) = byte_range
273        subcc2 = rangeIntersect(subcc1, lo1, hi1)
274        ulen = utf8_length(lo1)
275        for subrange in subcc2:
276           (lo2, hi2) = subrange
277           subrangeE = matched_sequence_compiler(cgo, lo2, hi2, ulen)
278           if options.grep:
279              target = "output.matches"
280           else:
281              target = "struct_%s.cc" % k
282           cgo.add_assignment(target, cgo.expr2py(make_or(Var(subrangeE), Var(target))))
283
284def rangeIntersect(ccList, lo, hi):
285    return [(max(lo, p[0]), min(hi, p[1])) for p in ccList if p[0] <= hi and p[1] >= lo]
286
287def rangeGaps(ccList, lo, hi):
288    if lo >= hi: return []
289    if ccList == []: return [(lo, hi)]
290    (lo1, hi1) = ccList[0]
291    if hi1 < lo: return rangeGaps(ccList[1:], lo, hi)
292    if lo1 > lo: return [(lo, lo1 - 1)] + rangeGaps(ccList[1:], hi1+1, hi)
293    elif hi1 < hi: return rangeGaps(ccList[1:], hi1+1, hi)
294    else: return []
295
296def outerRanges(ccList):
297    if len(ccList) <= 1: return ccList
298    (lo1, hi1) = ccList[0]
299    (lo2, hi2) = ccList[1]
300    if hi2 <= hi1: return outerRanges([(lo1, hi1)] + ccList[2:])
301    else: return [(lo1, hi1)] + outerRanges(ccList[1:])
302
303def innerRanges(ccList):
304    if len(ccList) <= 1: return []
305    (lo1, hi1) = ccList[0]
306    (lo2, hi2) = ccList[1]
307    if hi2 <= hi1: return [(lo2, hi2)] + innerRanges([(lo1, hi1)] + ccList[2:])
308    else: return innerRanges(ccList[1:])
309
310
311
312def generateCharClassDefs(ifRangeList, charClassMap):
313   cgo = CC_compiler(UTF8(), 'tmp%i', False, '')
314   for k in charClassMap.keys():
315     if options.grep:
316         cgo.add_assignment("output.matches", '0')
317     else:
318         cgo.add_assignment("struct_%s.cc" % k, '0')
319   generateCharClassDefsInIfHierarchy(cgo, (0, 0x10FFFF), ifRangeList, charClassMap)
320   return cgo.showcode()
321 
322
323#defaultIfRangeList = [(0,0x7FF), (0, 0x7F), (0x80, 0x3FF), (0x400,0x7FF), (0x800, 0xFFFF), (0x10000, 0x10FFFF)]
324
325#defaultIfRangeList = [(0x80,0x10FFFF), (0x80,0x7FF), (0x800,0xFFFF), (0x10000, 0x10FFFF)]
326
327defaultIfRangeList = [(0x80,0x10FFFF), (0x80,0x7FF), (0x100,0x2FF), 
328(0x300,0x36F), (0x370,0x3FF), (0x400,0x7FF), (0x400,0x4FF),  (0x500, 0x52F), (0x530, 0x58F), (0x590,0x5FF), (0x600,0x6FF), 
329(0x700,0x7FF), (0x700,0x74F), (0x750,0x77F), (0x750,0x77F), (0x780,0x7BF), (0x7C0,0x7FF),
330(0x800,0xFFFF), 
331(0x10000, 0x10FFFF)]
332
333
334
335Unicode_CC_struct = "class struct_%s:\n\tcc = 0\n\n"
336Unicode_CC_header = "def %s(basis_bits, struct_%s):\n"
337Unicode_dummy_main = "\n\ndef Main(basis_bits):\n    pass\n"
338
339def generateDefs1(general_category):
340  catmap = {}
341  catmap[general_category] = UnicodeCategoryMap[general_category]
342  struct = Unicode_CC_struct % (general_category)
343  header = "def %s(basis_bits, struct_%s):\n" % (general_category, general_category)
344  if options.grep:
345        struct = r"""
346class Basis_bits():
347        bit_0 = 0
348        bit_1 = 0
349        bit_2 = 0
350        bit_3 = 0
351        bit_4 = 0
352        bit_5 = 0
353        bit_6 = 0
354        bit_7 = 0 
355 
356class Lex():
357        LF = (0)
358 
359class Output():
360        matches = 0
361
362def ParseLines(basis_bits, lex):
363        temp1 = (basis_bits.bit_0 | basis_bits.bit_1)
364        temp2 = (basis_bits.bit_2 | basis_bits.bit_3)
365        temp3 = (temp1 | temp2)
366        temp4 = (basis_bits.bit_4 &~ basis_bits.bit_5)
367        temp5 = (basis_bits.bit_6 &~ basis_bits.bit_7)
368        temp6 = (temp4 & temp5)
369        LF = (temp6 &~ temp3)
370
371"""
372        header = "def Demo(basis_bits, lex, output):\n"
373        main = "\n\ndef Main(basis_bits, lex, output):\n    ParseLines(basis_bits, lex)\n    Demo(basis_bits, lex, output)\n"
374  else:
375        struct = Unicode_CC_struct % (general_category)
376        header = "def %s(basis_bits, struct_%s):\n" % (general_category, general_category)
377        main = Unicode_dummy_main
378  if options.flat:
379      code = generateCharClassDefs([], catmap)
380  elif options.simple:
381      code = generateCharClassDefs([(0x80, 0x7FF), (0x800,0xFFFF), (0x10000, 0x10FFF)], catmap)
382  else:
383      code = generateCharClassDefs(defaultIfRangeList, catmap)
384  return struct + header + "".join(code) + main
385
386
387def main():   
388
389    global options, UnicodeCategoryMap
390    # Option definition
391    option_parser = optparse.OptionParser(usage='python %prog [options] <output file>', version='0.1')
392 
393    option_parser.add_option('-c', '--category',
394                             dest='category',
395                             type='string',
396                             default='Cc',
397                             help='general category; default: Cc',
398                             )
399    option_parser.add_option('-g', '--grep',
400                             dest='grep',
401                             action='store_true',
402                             default=False,
403                             help='Use grep template',
404                             ) 
405    option_parser.add_option('-f', '--flat',
406                             dest='flat',
407                             action='store_true',
408                             default=False,
409                             help='Flatten the calculations into a single basic block',
410                             ) 
411    option_parser.add_option('-s', '--simple',
412                             dest='simple',
413                             action='store_true',
414                             default=False,
415                             help='Use a simple if-structure on UTF-8 length',
416                             ) 
417    options, args = option_parser.parse_args(sys.argv[1:])
418
419    (catlen, UnicodeCategoryMap) = parse_general()
420   
421    code = ""
422    if options.category == '.':
423        for k in UnicodeCategoryMap.keys():
424            code += generateDefs1(k)
425   
426    else:
427        if options.category not in UnicodeCategoryMap:
428            #define the characters in the list
429            print "Unknown general category %s" % options.category
430            exit
431        code = generateDefs1(options.category)
432
433    if (len(args) == 1):
434        fh = open(args[0], "w")
435        fh.write(code)
436        fh.close()
437    elif len(args) == 0:
438        print code
439    else:
440        option_parser.print_usage()
441       
442
443if __name__ == "__main__": main()
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
Note: See TracBrowser for help on using the repository browser.