source: proto/charsetcompiler/unicode_category_compiler.py @ 3968

Last change on this file since 3968 was 3968, checked in by cameron, 5 years ago

Bug fixes

File size: 14.5 KB
Line 
1#
2# Prototype for computing utf8 character classes
3# Assuming byte-class/byte-range compilers exist.
4#
5# Robert D. Cameron, June 2, 2013
6#
7# Licensed under Open Software License 3.0.
8
9from pablo_expr import *
10from CC_compiler import *
11from UTF_encoding import *
12from charset_def import *
13import optparse, sys
14
15#
16# Definitions for debugging/prototyping
17def make_shift_forward(e, n): return Adv(e, n)
18def make_if(p, s): return ["if %s\n" %p] + ["  " + x for x in s] + ["endif %s\n" %p]
19#
20def utf8_length(codepoint):
21   if codepoint <= 0x7F: return 1
22   elif codepoint <= 0x7FF: return 2
23   elif codepoint <= 0xFFFF: return 3
24   else: return 4
25
26def utf8_byte(codepoint, n):
27   len = utf8_length(codepoint)
28   if n == 1:
29     if len == 1: return codepoint
30     elif len == 2: return 0xC0 + (codepoint >> 6) 
31     elif len == 3: return 0xE0 + (codepoint >> 12) 
32     elif len == 4: return 0xF0 + (codepoint >> 18) 
33   else:
34     bits = (codepoint >> (6 * (len - n))) & 0x3F
35     return 0x80 + bits
36
37def max_codepoint_of_length(n):
38   if n == 1: return 0x7F
39   elif n == 2: return 0x7FF
40   elif n == 3: return 0xFFFF
41   else: return 0x10FFFF
42
43#
44# Given two codepoints lo, hi: return the number of
45# leading UTF-8 bytes that their respective UTF-8
46# representations have in common.
47def common_utf8_leading_bytes(lo, hi):
48   u8len_lo = utf8_length(lo)
49   u8len_hi = utf8_length(hi)
50   if u8len_lo != u8len_hi: return 0
51   remaining = u8len_lo
52   while remaining > 0:
53     if lo == hi: return remaining
54     lo >>= 6
55     hi >>= 6
56     remaining -= 1
57   return 0
58
59def utf8_range_compiler(cgo, lo, hi, targetVar):
60   lo_len = utf8_length(lo)
61   hi_len = utf8_length(hi)
62   # If different length code unit sequences are involved, make
63   # a union of equilength subranges.
64   if hi_len > lo_len:
65     m = max_codepoint_of_length(hi_len - 1)
66     targetV_lo = "%s_%i" % (targetVar, lo_len)   
67     targetV_hi = "%s_%i" % (targetVar, hi_len)
68     utf8_range_compiler(cgo, lo, m, targetV_lo)
69     utf8_range_compiler(cgo, m+1, hi, targetV_hi)
70     cgo.add_assignment(targetVar, cgo.expr2py(make_or(Var(targetV_lo), Var(targetV_hi))))
71   #
72   else:
73     matched_sequence_compiler(cgo, lo, hi, 1, hi_len, targetVar)
74
75def matched_sequence_compiler(cgo, lo, hi, n, hlen, targetVar):
76   """ Helper function to generate the code necessary to match bytes
77       n through hlen (1-based indexing) of the range of utf-8 sequences
78       for codepoints lo through hi. """
79   hbyte = utf8_byte(hi, n)
80   lbyte = utf8_byte(lo, n)
81   if n == hlen:
82     cgo.chardef2py(CanonicalCharSetDef(targetVar, [(lbyte, hbyte)]))
83     return
84   #
85   # One or more bytes of the lower and upper bound may be the same.
86   # Build a sequence of byte tests.
87   if hbyte == lbyte:
88     sfxVar = targetVar + "_sfx"
89     matched_sequence_compiler(cgo, lo, hi, n+1, hlen, sfxVar)
90     CCvar = "CC_%x" % (hbyte)
91     cgo.chardef2py(CanonicalCharSetDef(CCvar, [(lbyte, hbyte)]))
92     cgo.add_assignment(targetVar, cgo.expr2py(make_and(make_shift_forward(Var(CCvar), 1), Var(sfxVar))))
93     return
94   # We now have a range involving different bytes at position n.
95   following_suffix_mask = (1 << ((hlen - n) * 6)) - 1
96   # A separate test may be needed for the high byte sequence if
97   # there are constraints on following suffix bytes.
98   if hi & following_suffix_mask != following_suffix_mask:
99     hi_floor = hi &~following_suffix_mask
100     hiVar = targetVar + "_hi"
101     loVar = targetVar + "_lo"
102     matched_sequence_compiler(cgo, hi_floor, hi, n, hlen, hiVar)
103     matched_sequence_compiler(cgo, lo, hi_floor - 1, n, hlen, loVar)
104     cgo.add_assignment(targetVar, cgo.expr2py(make_or(Var(hiVar), Var(loVar))))
105     return
106   # A separate test may be needed for the low byte sequence if
107   # there are constraints on following suffix bytes.
108   if lo & following_suffix_mask != 0:
109     low_ceil = lo | following_suffix_mask
110     hiVar = targetVar + "_hi"
111     loVar = targetVar + "_lo"
112     matched_sequence_compiler(cgo, low_ceil + 1, hi, n, hlen, hiVar)
113     matched_sequence_compiler(cgo, lo, low_ceil, n, hlen, loVar)
114     cgo.add_assignment(targetVar, cgo.expr2py(make_or(Var(hiVar), Var(loVar))))
115     return 
116   #
117   # Now we have a range that permits all suffix combinations.
118   # We don't have to test suffixes UNDER THE ASSUMPTION that the utf-8
119   # has been validated.
120   CCvar = "CC_%x_%x" % (lbyte, hbyte)
121   cgo.chardef2py(CanonicalCharSetDef(CCvar, [(lbyte, hbyte)]))
122   cgo.add_assignment(targetVar, cgo.expr2py(Adv(Var(CCvar), hlen - n)))
123
124
125
126# Generate a simplest possible test for a Unicode codepoint range
127# such that each 1 bit marks a position within a UTF-8 initial
128# subsequence such that each legal continuation of that subsequence
129# is within the range.  Return the generated variable.
130def utf8_ifrange_compiler(cgo, lo, hi):
131   lo_len = utf8_length(lo)
132   hi_len = utf8_length(hi)
133   # If different length code unit sequences are involved, make
134   # a union of equilength subranges.
135   if hi_len > lo_len:
136     m = max_codepoint_of_length(hi_len - 1)
137     v_lo = utf8_ifrange_compiler(cgo, lo, m)
138     v_hi = utf8_ifrange_compiler(cgo, m+1, hi)
139     range_var = "test_%x_%x" % (lo, hi)
140     cgo.add_assignment(range_var, cgo.expr2py(make_or(Var(v_lo), Var(v_hi))))
141     return range_var
142   #
143   else:
144     return matched_ifsequence_compiler(cgo, lo, hi, 1, hi_len)
145
146
147def matched_ifsequence_compiler(cgo, lo, hi, n, hlen):
148   """ Helper function to generate the code necessary to match bytes
149       n through hlen (1-based indexing) of the range of utf-8 sequences
150       for codepoints lo through hi. """
151   hbyte = utf8_byte(hi, n)
152   lbyte = utf8_byte(lo, n)
153   if n == hlen:
154     targetVar = "bytetest_%x_%x" % (lbyte, hbyte)
155     cgo.chardef2py(CanonicalCharSetDef(targetVar, [(lbyte, hbyte)]))
156     return targetVar
157   #
158   # One or more bytes of the lower and upper bound may be the same.
159   # Build a sequence of byte tests.
160   if hbyte == lbyte:
161     sfxVar = matched_ifsequence_compiler(cgo, lo, hi, n+1, hlen)
162     targetVar = "bytetest_%x" % (lbyte)
163     cgo.chardef2py(CanonicalCharSetDef(targetVar, [(lbyte, hbyte)]))
164     var2 = targetVar+"_adv"
165     cgo.add_assignment(var2, cgo.expr2py(make_and(make_shift_forward(Var(targetVar), 1), Var(sfxVar))))
166     return var2
167   # We now have a range involving different bytes at position n.
168   following_suffix_mask = (1 << ((hlen - n) * 6)) - 1
169   # A separate test may be needed for the high byte sequence if
170   # there are constraints on following suffix bytes.
171   if hi & following_suffix_mask != following_suffix_mask:
172     hi_floor = hi &~following_suffix_mask
173     hiVar = matched_ifsequence_compiler(cgo, hi_floor, hi, n, hlen)
174     loVar = matched_ifsequence_compiler(cgo, lo, hi_floor - 1, n, hlen)
175     targetVar = "range_test_%x_%x_%i" % (lo, hi, n)
176     cgo.add_assignment(targetVar, cgo.expr2py(make_or(Var(hiVar), Var(loVar))))
177     return targetVar
178   # A separate test may be needed for the low byte sequence if
179   # there are constraints on following suffix bytes.
180   if lo & following_suffix_mask != 0:
181     low_ceil = lo | following_suffix_mask
182     hiVar = matched_ifsequence_compiler(cgo, low_ceil + 1, hi, n, hlen)
183     loVar = matched_ifsequence_compiler(cgo, lo, low_ceil, n, hlen)
184     targetVar = "range_test_%x_%x_%i" % (lo, hi, n)
185     cgo.add_assignment(targetVar, cgo.expr2py(make_or(Var(hiVar), Var(loVar))))
186     return targetVar
187   #
188   # Now we have a range that permits all suffix combinations.
189   # We don't have to test suffixes UNDER THE ASSUMPTION that the utf-8
190   # has been validated.
191   targetVar = "bytetest_%x_%x" % (lbyte, hbyte)
192   cgo.chardef2py(CanonicalCharSetDef(targetVar, [(lbyte, hbyte)]))
193   return targetVar
194
195
196
197
198def generate_utf8_leading_bytes_test(codepoint, bytecount, targetVar):
199  if bytecount == 0: return [make_assign(targetVar, "1")]
200  byte1 = utf8_byte(codepoint, 1)
201  stmts = [make_assign(targetVar, ByteClassCompiler(byte1))]
202  byteno = 1
203  while byteno < bytecount:
204    byteno += 1
205    sfx_byte = utf8_byte(codepoint, byteno)
206    stmts.append(make_assign(targetVar, make_and(make_shift_forward(targetVar, 1), ByteClassCompiler(sfx_byte))))
207  return stmts
208
209def generate_utf8_intermediate_bytes_test(codepoint, startbyte, endbyte, targetVar):
210  if startbyte == 1: return generate_utf8_leading_bytes_test(codepoint, endbyte, targetVar)
211  byteno = startbyte
212  while byteno < endbyte:
213    byteno += 1
214    sfx_byte = utf8_byte(codepoint, byteno)
215    stmts.append(make_assign(targetVar, make_and(make_shift_forward(targetVar, 1), ByteClassCompiler(sfx_byte))))
216  return stmts
217
218import re
219
220Unicode_point_regexp = re.compile("^([0-9A-F]{4,6})\s+;\s+([A-Z][a-z0-9])\s+#")
221Unicode_range_regexp = re.compile("^([0-9A-F]{4,6})[.][.]([0-9A-F]{4,6})\s+;\s+([A-Z][a-z0-9])\s+#")
222
223
224def parse_general():
225  category_size = {}
226  category_def = {}
227  f = open("DerivedGeneralCategory.txt")
228  lines = f.readlines()
229  for t in lines:
230    m = Unicode_point_regexp.match(t)
231    if m:
232      point = m.group(1)
233      category = m.group(2)
234      if not category_size.has_key(category):
235        category_size[category] = 0
236        category_def[category] = []
237      pval = int(point, 16)
238      category_def[category].append((pval, pval))
239      category_size[category] += 1     
240    m = Unicode_range_regexp.match(t)
241    if m:
242      point1 = m.group(1)
243      point2 = m.group(2)
244      category = m.group(3)
245      if not category_size.has_key(category):
246        category_size[category] = 0
247        category_def[category] = []
248      pval1 = int(point1, 16)
249      pval2 = int(point2, 16)
250      category_def[category].append((pval1, pval2))
251      category_size[category] += 1
252  return (category_size, category_def)
253  f.close()
254
255
256
257def generateCharClassDefsInIfHierarchy(cgo, enclosingRange, ifRangeList, charClassMap):
258#   inner_code = []
259   (outer_lo, outer_hi) = enclosingRange
260   enclosedRanges = rangeIntersect(ifRangeList, outer_lo, outer_hi)
261   missingRanges = rangeGaps(enclosedRanges, outer_lo, outer_hi)
262   for rg in missingRanges:
263     (rglo, rghi) = rg
264     generateCharClassSubDefs(cgo, rglo, rghi, charClassMap)
265   topRanges = outerRanges(enclosedRanges)
266   inner = innerRanges(enclosedRanges)
267   for rg in topRanges:
268     (rglo, rghi) = rg
269     inner_cgo = CC_compiler(UTF8(), "r%x_%x" % (rglo, rghi) + '_tmp%i', False, '')
270     inner_cgo.add_common_expressions(cgo)
271     generateCharClassDefsInIfHierarchy(inner_cgo, rg, inner, charClassMap)
272     if inner_cgo.generated_code != []:
273        range_var = utf8_ifrange_compiler(cgo, rglo, rghi)
274        cgo.add_if_stmt(Var(range_var), inner_cgo.generated_code)
275   return cgo.showcode()
276
277def generateCharClassSubDefs(cgo, lo, hi, charClassMap):
278   for k in charClassMap.keys():
279     subcc1 = rangeIntersect(charClassMap[k], lo, hi)
280     # Divide by UTF-8 length
281     for byte_range in [(0, 0x7F), (0x80, 0x7FF), (0x800, 0xFFFF), (0x10000, 0x10FFFF)]:
282        (lo1, hi1) = byte_range
283        subcc2 = rangeIntersect(subcc1, lo1, hi1)
284        ulen = utf8_length(lo1)
285        for subrange in subcc2:
286           (lo2, hi2) = subrange
287           CC_var = "CC_%s_%x_%x" % (k, lo2, hi2)
288           matched_sequence_compiler(cgo, lo2, hi2, 1, ulen, CC_var)
289           cgo.add_assignment("struct_%s.cc" % k, cgo.expr2py(make_or(Var("struct_%s.cc" % k), Var(CC_var))))
290
291def rangeIntersect(ccList, lo, hi):
292    return [(max(lo, p[0]), min(hi, p[1])) for p in ccList if p[0] <= hi and p[1] >= lo]
293
294def rangeGaps(ccList, lo, hi):
295    if lo >= hi: return []
296    if ccList == []: return [(lo, hi)]
297    (lo1, hi1) = ccList[0]
298    if hi1 < lo: return rangeGaps(ccList[1:], lo, hi)
299    if lo1 > lo: return [(lo, lo1 - 1)] + rangeGaps(ccList[1:], hi1+1, hi)
300    elif hi1 < hi: return rangeGaps(ccList[1:], hi1+1, hi)
301    else: return []
302
303def outerRanges(ccList):
304    if len(ccList) <= 1: return ccList
305    (lo1, hi1) = ccList[0]
306    (lo2, hi2) = ccList[1]
307    if hi2 <= hi1: return outerRanges([(lo1, hi1)] + ccList[2:])
308    else: return [(lo1, hi1)] + outerRanges(ccList[1:])
309
310def innerRanges(ccList):
311    if len(ccList) <= 1: return []
312    (lo1, hi1) = ccList[0]
313    (lo2, hi2) = ccList[1]
314    if hi2 <= hi1: return [(lo2, hi2)] + innerRanges([(lo1, hi1)] + ccList[2:])
315    else: return innerRanges(ccList[1:])
316
317
318
319def generateCharClassDefs(ifRangeList, charClassMap):
320   cgo = CC_compiler(UTF8(), 'tmp%i', False, '')
321   for k in charClassMap.keys():
322     cgo.add_assignment("struct_%s.cc" % k, '0')
323   generateCharClassDefsInIfHierarchy(cgo, (0, 0x10FFFF), ifRangeList, charClassMap)
324   return cgo.showcode()
325 
326
327#defaultIfRangeList = [(0,0x7FF), (0, 0x7F), (0x80, 0x3FF), (0x400,0x7FF), (0x800, 0xFFFF), (0x10000, 0x10FFFF)]
328
329#defaultIfRangeList = [(0x80,0x10FFFF), (0x80,0x7FF), (0x800,0xFFFF), (0x10000, 0x10FFFF)]
330
331defaultIfRangeList = [(0x80,0x10FFFF), (0x80,0x7FF), (0x100,0x2FF), 
332(0x300,0x36F), (0x370,0x3FF), (0x400,0x7FF), (0x400,0x4FF),  (0x500, 0x52F), (0x530, 0x58F), (0x590,0x5FF), (0x600,0x6FF), 
333(0x700,0x7FF), (0x700,0x74F), (0x750,0x77F), (0x750,0x77F), (0x780,0x7BF), (0x7C0,0x7FF),
334(0x800,0xFFFF), 
335(0x10000, 0x10FFFF)]
336
337Unicode_CC_struct = "class struct_%s:\n\tcc = 0\n\n"
338Unicode_CC_header = "def %s(basis_bits, struct_%s):\n"
339Unicode_dummy_main = "\n\ndef Main(basis_bits):\n    pass\n"
340
341def generateDefs1(general_category):
342  catmap = {}
343  catmap[general_category] = UnicodeCategoryMap[general_category]
344  struct = Unicode_CC_struct % (general_category)
345  header = "def %s(basis_bits, struct_%s):\n" % (general_category, general_category)
346  code = generateCharClassDefs(defaultIfRangeList, catmap)
347  return struct + header + "".join(code)
348
349
350def main():   
351
352    global options, UnicodeCategoryMap
353    # Option definition
354    option_parser = optparse.OptionParser(usage='python %prog [options] <output file>', version='0.1')
355 
356    option_parser.add_option('-c', '--category',
357                             dest='category',
358                             type='string',
359                             default='Cc',
360                             help='general category; default: Cc',
361                             )
362    options, args = option_parser.parse_args(sys.argv[1:])
363
364    (catlen, UnicodeCategoryMap) = parse_general()
365   
366    code = ""
367    if options.category == '.':
368        for k in UnicodeCategoryMap.keys():
369            code += generateDefs1(k)
370   
371    else:
372        if options.category not in UnicodeCategoryMap:
373            #define the characters in the list
374            print "Unknown general category %s" % options.category
375            exit
376        code = generateDefs1(options.category)
377
378    code += Unicode_dummy_main
379
380    if (len(args) == 1):
381        fh = open(args[0], "w")
382        fh.write(code)
383        fh.close()
384    elif len(args) == 0:
385        print code
386    else:
387        option_parser.print_usage()
388       
389
390if __name__ == "__main__": main()
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
Note: See TracBrowser for help on using the repository browser.