source: proto/charsetcompiler/unicode_category_compiler.py @ 3964

Last change on this file since 3964 was 3964, checked in by cameron, 5 years ago

Shorten if-range tests

File size: 14.3 KB
Line 
1#
2# Prototype for computing utf8 character classes
3# Assuming byte-class/byte-range compilers exist.
4#
5# Robert D. Cameron, June 2, 2013
6#
7# Licensed under Open Software License 3.0.
8
9from pablo_expr import *
10from CC_compiler import *
11from UTF_encoding import *
12from charset_def import *
13import optparse, sys
14
15#
16# Definitions for debugging/prototyping
17def make_shift_forward(e, n): return Adv(e, n)
18def make_if(p, s): return ["if %s\n" %p] + ["  " + x for x in s] + ["endif %s\n" %p]
19#
20def utf8_length(codepoint):
21   if codepoint <= 0x7F: return 1
22   elif codepoint <= 0x7FF: return 2
23   elif codepoint <= 0xFFFF: return 3
24   else: return 4
25
26def utf8_byte(codepoint, n):
27   len = utf8_length(codepoint)
28   if n == 1:
29     if len == 1: return codepoint
30     elif len == 2: return 0xC0 + (codepoint >> 6) 
31     elif len == 3: return 0xE0 + (codepoint >> 12) 
32     elif len == 4: return 0xF0 + (codepoint >> 18) 
33   else:
34     bits = (codepoint >> (6 * (len - n))) & 0x3F
35     return 0x80 + bits
36
37def max_codepoint_of_length(n):
38   if n == 1: return 0x7F
39   elif n == 2: return 0x7FF
40   elif n == 3: return 0xFFFF
41   else: return 0x10FFFF
42
43#
44# Given two codepoints lo, hi: return the number of
45# leading UTF-8 bytes that their respective UTF-8
46# representations have in common.
47def common_utf8_leading_bytes(lo, hi):
48   u8len_lo = utf8_length(lo)
49   u8len_hi = utf8_length(hi)
50   if u8len_lo != u8len_hi: return 0
51   remaining = u8len_lo
52   while remaining > 0:
53     if lo == hi: return remaining
54     lo >>= 6
55     hi >>= 6
56     remaining -= 1
57   return 0
58
59def utf8_range_compiler(cgo, lo, hi, targetVar):
60   lo_len = utf8_length(lo)
61   hi_len = utf8_length(hi)
62   # If different length code unit sequences are involved, make
63   # a union of equilength subranges.
64   if hi_len > lo_len:
65     m = max_codepoint_of_length(hi_len - 1)
66     targetV_lo = "%s_%i" % (targetVar, lo_len)   
67     targetV_hi = "%s_%i" % (targetVar, hi_len)
68     utf8_range_compiler(cgo, lo, m, targetV_lo)
69     utf8_range_compiler(cgo, m+1, hi, targetV_hi)
70     cgo.add_assignment(targetVar, cgo.expr2py(make_or(Var(targetV_lo), Var(targetV_hi))))
71   #
72   else:
73     matched_sequence_compiler(cgo, lo, hi, 1, hi_len, targetVar)
74
75def matched_sequence_compiler(cgo, lo, hi, n, hlen, targetVar):
76   """ Helper function to generate the code necessary to match bytes
77       n through hlen (1-based indexing) of the range of utf-8 sequences
78       for codepoints lo through hi. """
79   hbyte = utf8_byte(hi, n)
80   lbyte = utf8_byte(lo, n)
81   if n == hlen:
82     cgo.chardef2py(CanonicalCharSetDef(targetVar, [(lbyte, hbyte)]))
83     return
84   #
85   # One or more bytes of the lower and upper bound may be the same.
86   # Build a sequence of byte tests.
87   if hbyte == lbyte:
88     sfxVar = targetVar + "_sfx"
89     matched_sequence_compiler(cgo, lo, hi, n+1, hlen, sfxVar)
90     CCvar = "CC_%x" % (hbyte)
91     cgo.chardef2py(CanonicalCharSetDef(CCvar, [(lbyte, hbyte)]))
92     cgo.add_assignment(targetVar, cgo.expr2py(make_and(make_shift_forward(Var(CCvar), 1), Var(sfxVar))))
93     return
94   # We now have a range involving different bytes at position n.
95   following_suffix_mask = (1 << ((hlen - n) * 6)) - 1
96   # A separate test may be needed for the high byte sequence if
97   # there are constraints on following suffix bytes.
98   if hi & following_suffix_mask != following_suffix_mask:
99     hi_floor = hi &~following_suffix_mask
100     hiVar = targetVar + "_hi"
101     loVar = targetVar + "_lo"
102     matched_sequence_compiler(cgo, hi_floor, hi, n, hlen, hiVar)
103     matched_sequence_compiler(cgo, lo, hi_floor - 1, n, hlen, loVar)
104     cgo.add_assignment(targetVar, cgo.expr2py(make_or(Var(hiVar), Var(loVar))))
105     return
106   # A separate test may be needed for the low byte sequence if
107   # there are constraints on following suffix bytes.
108   if lo & following_suffix_mask != 0:
109     low_ceil = lo | following_suffix_mask
110     hiVar = targetVar + "_hi"
111     loVar = targetVar + "_lo"
112     matched_sequence_compiler(cgo, low_ceil + 1, hi, n, hlen, hiVar)
113     matched_sequence_compiler(cgo, lo, low_ceil, n, hlen, loVar)
114     cgo.add_assignment(targetVar, cgo.expr2py(make_or(Var(hiVar), Var(loVar))))
115     return 
116   #
117   # Now we have a range that permits all suffix combinations.
118   # We don't have to test suffixes UNDER THE ASSUMPTION that the utf-8
119   # has been validated.
120   CCvar = "CC_%x_%x" % (lbyte, hbyte)
121   cgo.chardef2py(CanonicalCharSetDef(CCvar, [(lbyte, hbyte)]))
122   cgo.add_assignment(targetVar, cgo.expr2py(Adv(Var(CCvar), hlen - n)))
123
124
125
126# Generate a simplest possible test for a Unicode codepoint range
127# such that each 1 bit marks a position within a UTF-8 initial
128# subsequence such that each legal continuation of that subsequence
129# is within the range.  Return the generated variable.
130def utf8_ifrange_compiler(cgo, lo, hi):
131   lo_len = utf8_length(lo)
132   hi_len = utf8_length(hi)
133   # If different length code unit sequences are involved, make
134   # a union of equilength subranges.
135   if hi_len > lo_len:
136     m = max_codepoint_of_length(hi_len - 1)
137     v_lo = utf8_ifrange_compiler(cgo, lo, m)
138     v_hi = utf8_ifrange_compiler(cgo, m+1, hi)
139     range_var = "test_%x_%x" % (lo, hi)
140     cgo.add_assignment(range_var, cgo.expr2py(make_or(Var(v_lo), Var(v_hi))))
141     return range_var
142   #
143   else:
144     return matched_ifsequence_compiler(cgo, lo, hi, 1, hi_len)
145
146
147def matched_ifsequence_compiler(cgo, lo, hi, n, hlen):
148   """ Helper function to generate the code necessary to match bytes
149       n through hlen (1-based indexing) of the range of utf-8 sequences
150       for codepoints lo through hi. """
151   hbyte = utf8_byte(hi, n)
152   lbyte = utf8_byte(lo, n)
153   if n == hlen:
154     targetVar = "bytetest_%x_%x" % (lbyte, hbyte)
155     cgo.chardef2py(CanonicalCharSetDef(targetVar, [(lbyte, hbyte)]))
156     return
157   #
158   # One or more bytes of the lower and upper bound may be the same.
159   # Build a sequence of byte tests.
160   if hbyte == lbyte:
161     sfxVar = matched_ifsequence_compiler(cgo, lo, hi, n+1, hlen)
162     targetVar = "bytetest_%x" % (lbyte)
163     cgo.chardef2py(CanonicalCharSetDef(targetVar, [(lbyte, hbyte)]))
164     cgo.add_assignment(targetVar, cgo.expr2py(make_and(make_shift_forward(Var(targetVar), 1), Var(sfxVar))))
165     return targetVar
166   # We now have a range involving different bytes at position n.
167   following_suffix_mask = (1 << ((hlen - n) * 6)) - 1
168   # A separate test may be needed for the high byte sequence if
169   # there are constraints on following suffix bytes.
170   if hi & following_suffix_mask != following_suffix_mask:
171     hi_floor = hi &~following_suffix_mask
172     hiVar = matched_ifsequence_compiler(cgo, hi_floor, hi, n, hlen)
173     loVar = matched_ifsequence_compiler(cgo, lo, hi_floor - 1, n, hlen)
174     targetVar = "range_test_%x_%x_%i" % (lo, hi, n)
175     cgo.add_assignment(targetVar, cgo.expr2py(make_or(Var(hiVar), Var(loVar))))
176     return targetVar
177   # A separate test may be needed for the low byte sequence if
178   # there are constraints on following suffix bytes.
179   if lo & following_suffix_mask != 0:
180     low_ceil = lo | following_suffix_mask
181     hiVar = matched_ifsequence_compiler(cgo, low_ceil + 1, hi, n, hlen)
182     loVar = matched_ifsequence_compiler(cgo, lo, low_ceil, n, hlen)
183     targetVar = "range_test_%x_%x_%i" % (lo, hi, n)
184     cgo.add_assignment(targetVar, cgo.expr2py(make_or(Var(hiVar), Var(loVar))))
185     return targetVar
186   #
187   # Now we have a range that permits all suffix combinations.
188   # We don't have to test suffixes UNDER THE ASSUMPTION that the utf-8
189   # has been validated.
190   targetVar = "bytetest_%x_%x" % (lbyte, hbyte)
191   cgo.chardef2py(CanonicalCharSetDef(targetVar, [(lbyte, hbyte)]))
192   return targetVar
193
194
195
196
197def generate_utf8_leading_bytes_test(codepoint, bytecount, targetVar):
198  if bytecount == 0: return [make_assign(targetVar, "1")]
199  byte1 = utf8_byte(codepoint, 1)
200  stmts = [make_assign(targetVar, ByteClassCompiler(byte1))]
201  byteno = 1
202  while byteno < bytecount:
203    byteno += 1
204    sfx_byte = utf8_byte(codepoint, byteno)
205    stmts.append(make_assign(targetVar, make_and(make_shift_forward(targetVar, 1), ByteClassCompiler(sfx_byte))))
206  return stmts
207
208def generate_utf8_intermediate_bytes_test(codepoint, startbyte, endbyte, targetVar):
209  if startbyte == 1: return generate_utf8_leading_bytes_test(codepoint, endbyte, targetVar)
210  byteno = startbyte
211  while byteno < endbyte:
212    byteno += 1
213    sfx_byte = utf8_byte(codepoint, byteno)
214    stmts.append(make_assign(targetVar, make_and(make_shift_forward(targetVar, 1), ByteClassCompiler(sfx_byte))))
215  return stmts
216
217import re
218
219Unicode_point_regexp = re.compile("^([0-9A-F]{4,6})\s+;\s+([A-Z][a-z0-9])\s+#")
220Unicode_range_regexp = re.compile("^([0-9A-F]{4,6})[.][.]([0-9A-F]{4,6})\s+;\s+([A-Z][a-z0-9])\s+#")
221
222
223def parse_general():
224  category_size = {}
225  category_def = {}
226  f = open("DerivedGeneralCategory.txt")
227  lines = f.readlines()
228  for t in lines:
229    m = Unicode_point_regexp.match(t)
230    if m:
231      point = m.group(1)
232      category = m.group(2)
233      if not category_size.has_key(category):
234        category_size[category] = 0
235        category_def[category] = []
236      pval = int(point, 16)
237      category_def[category].append((pval, pval))
238      category_size[category] += 1     
239    m = Unicode_range_regexp.match(t)
240    if m:
241      point1 = m.group(1)
242      point2 = m.group(2)
243      category = m.group(3)
244      if not category_size.has_key(category):
245        category_size[category] = 0
246        category_def[category] = []
247      pval1 = int(point1, 16)
248      pval2 = int(point2, 16)
249      category_def[category].append((pval1, pval2))
250      category_size[category] += 1
251  return (category_size, category_def)
252  f.close()
253
254
255
256def generateCharClassDefsInIfHierarchy(cgo, enclosingRange, ifRangeList, charClassMap):
257#   inner_code = []
258   (outer_lo, outer_hi) = enclosingRange
259   enclosedRanges = rangeIntersect(ifRangeList, outer_lo, outer_hi)
260   missingRanges = rangeGaps(enclosedRanges, outer_lo, outer_hi)
261   for rg in missingRanges:
262     (rglo, rghi) = rg
263     generateCharClassSubDefs(cgo, rglo, rghi, charClassMap)
264   topRanges = outerRanges(enclosedRanges)
265   inner = innerRanges(enclosedRanges)
266   for rg in topRanges:
267     (rglo, rghi) = rg
268     inner_cgo = CC_compiler(UTF8(), "r%x_%x" % (rglo, rghi) + '_tmp%i', False, '')
269     inner_cgo.add_common_expressions(cgo)
270     generateCharClassDefsInIfHierarchy(inner_cgo, rg, inner, charClassMap)
271     if inner_cgo.generated_code != []:
272        range_var = utf8_ifrange_compiler(cgo, rglo, rghi)
273        cgo.add_if_stmt(Var(range_var), inner_cgo.generated_code)
274   return cgo.showcode()
275
276def generateCharClassSubDefs(cgo, lo, hi, charClassMap):
277   for k in charClassMap.keys():
278     subcc1 = rangeIntersect(charClassMap[k], lo, hi)
279     # Divide by UTF-8 length
280     for byte_range in [(0, 0x7F), (0x80, 0x7FF), (0x800, 0xFFFF), (0x10000, 0x10FFFF)]:
281        (lo1, hi1) = byte_range
282        subcc2 = rangeIntersect(subcc1, lo1, hi1)
283        ulen = utf8_length(lo1)
284        for subrange in subcc2:
285           (lo2, hi2) = subrange
286           CC_var = "CC_%s_%x_%x" % (k, lo2, hi2)
287           matched_sequence_compiler(cgo, lo2, hi2, 1, ulen, CC_var)
288           cgo.add_assignment("struct_%s.cc" % k, cgo.expr2py(make_or(Var("struct_%s.cc" % k), Var(CC_var))))
289
290def rangeIntersect(ccList, lo, hi):
291    return [(max(lo, p[0]), min(hi, p[1])) for p in ccList if p[0] <= hi and p[1] >= lo]
292
293def rangeGaps(ccList, lo, hi):
294    if lo >= hi: return []
295    if ccList == []: return [(lo, hi)]
296    (lo1, hi1) = ccList[0]
297    if hi1 < lo: return rangeGaps(ccList[1:], lo, hi)
298    if lo1 > lo: return [(lo, lo1 - 1)] + rangeGaps(ccList[1:], hi1+1, hi)
299    elif hi1 < hi: return rangeGaps(ccList[1:], hi1+1, hi)
300    else: return []
301
302def outerRanges(ccList):
303    if len(ccList) <= 1: return ccList
304    (lo1, hi1) = ccList[0]
305    (lo2, hi2) = ccList[1]
306    if hi2 <= hi1: return outerRanges([(lo1, hi1)] + ccList[2:])
307    else: return [(lo1, hi1)] + outerRanges(ccList[1:])
308
309def innerRanges(ccList):
310    if len(ccList) <= 1: return []
311    (lo1, hi1) = ccList[0]
312    (lo2, hi2) = ccList[1]
313    if hi2 <= hi1: return [(lo2, hi2)] + innerRanges([(lo1, hi1)] + ccList[2:])
314    else: return innerRanges(ccList[1:])
315
316
317
318def generateCharClassDefs(ifRangeList, charClassMap):
319   cgo = CC_compiler(UTF8(), 'tmp%i', False, '')
320   for k in charClassMap.keys():
321     cgo.add_assignment("struct_%s.cc" % k, '0')
322   generateCharClassDefsInIfHierarchy(cgo, (0, 0x10FFFF), ifRangeList, charClassMap)
323   return cgo.showcode()
324 
325
326#defaultIfRangeList = [(0,0x7FF), (0, 0x7F), (0x80, 0x3FF), (0x400,0x7FF), (0x800, 0xFFFF), (0x10000, 0x10FFFF)]
327
328#defaultIfRangeList = [(0x80,0x10FFFF), (0x80,0x7FF), (0x800,0xFFFF), (0x10000, 0x10FFFF)]
329
330defaultIfRangeList = [(0x80,0x10FFFF), (0x80,0x7FF), (0x100,0x2FF), 
331(0x300,0x36F), (0x370,0x3FF), (0x400,0x7FF), (0x400,0x4FF),  (0x500, 0x52F), (0x530, 0x58F), (0x590,0x5FF), (0x600,0x6FF), 
332(0x700,0x7FF), (0x700,0x74F), (0x750,0x77F), (0x750,0x77F), (0x780,0x7BF), (0x7C0,0x7FF),
333(0x800,0xFFFF), 
334(0x10000, 0x10FFFF)]
335
336Unicode_CC_struct = "class category_%s:\n\tcc = 0\n\n"
337Unicode_CC_header = "def %s(basis_bits, struct_%s):\n"
338Unicode_dummy_main = "\n\ndef Main(basis_bits):\n    pass\n"
339
340def generateDefs1(general_category):
341  catmap = {}
342  catmap[general_category] = UnicodeCategoryMap[general_category]
343  struct = Unicode_CC_struct % (general_category)
344  header = "def %s(basis_bits, struct_%s):\n" % (general_category, general_category)
345  code = generateCharClassDefs(defaultIfRangeList, catmap)
346  return struct + header + "".join(code) + Unicode_dummy_main
347
348
349def main():   
350
351    global options, UnicodeCategoryMap
352    # Option definition
353    option_parser = optparse.OptionParser(usage='python %prog [options] <output file>', version='0.1')
354 
355    option_parser.add_option('-c', '--category',
356                             dest='category',
357                             type='string',
358                             default='Cc',
359                             help='general category; default: Cc',
360                             ) 
361    options, args = option_parser.parse_args(sys.argv[1:])
362
363    (catlen, UnicodeCategoryMap) = parse_general()             
364           
365    if options.category not in UnicodeCategoryMap:
366            #define the characters in the list
367            print "Unknown general category %s" % options.category
368            exit
369    code = generateDefs1(options.category)
370
371    if (len(args) == 1):
372        fh = open(args[0], "w")
373        fh.write(code)
374        fh.close()
375    elif len(args) == 0:
376        print code
377    else:
378        option_parser.print_usage()
379       
380
381if __name__ == "__main__": main()
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
Note: See TracBrowser for help on using the repository browser.