source: proto/charsetcompiler/unicode_category_compiler.py @ 3954

Last change on this file since 3954 was 3954, checked in by cameron, 5 years ago

Unicode category compiler main program: unicode_charset_compiler.py

File size: 9.7 KB
Line 
1#
2# Prototype for computing utf8 character classes
3# Assuming byte-class/byte-range compilers exist.
4#
5# Robert D. Cameron, June 2, 2013
6#
7# Licensed under Open Software License 3.0.
8
9from pablo_expr import *
10from CC_compiler import *
11from UTF_encoding import *
12from charset_def import *
13import optparse, sys
14
15#
16# Definitions for debugging/prototyping
def make_shift_forward(e, n):
    """Wrap expression `e` in an `Adv` node with shift amount `n`."""
    return Adv(e, n)
def make_if(p, s):
    """Return the lines of `s` bracketed by an "if p" / "endif p" pair.

    Every line taken from `s` is prefixed with two spaces; a new list is
    returned and `s` itself is left untouched.
    """
    wrapped = ["if %s\n" % p]
    wrapped.extend("  " + line for line in s)
    wrapped.append("endif %s\n" % p)
    return wrapped
19#
def utf8_length(codepoint):
    """Return the number of bytes (1 to 4) in the UTF-8 form of `codepoint`."""
    # Largest codepoint representable in 1, 2 and 3 bytes respectively.
    for length, upper_bound in enumerate((0x7F, 0x7FF, 0xFFFF), 1):
        if codepoint <= upper_bound:
            return length
    return 4
25
def utf8_byte(codepoint, n):
    """Return byte number `n` (1-based) of the UTF-8 encoding of `codepoint`."""
    seq_len = utf8_length(codepoint)
    if n != 1:
        # Trailing byte: 0x80 plus the six payload bits for position n.
        shift = 6 * (seq_len - n)
        return 0x80 + ((codepoint >> shift) & 0x3F)
    if seq_len == 1:
        # Single-byte sequences are the codepoint itself.
        return codepoint
    # Lead byte: a length marker plus the bits left above the trailing bytes.
    lead_marker = {2: 0xC0, 3: 0xE0, 4: 0xF0}[seq_len]
    return lead_marker + (codepoint >> (6 * (seq_len - 1)))
36
def max_codepoint_of_length(n):
    """Return the largest codepoint whose UTF-8 form is `n` bytes long.

    Any `n` other than 1, 2 or 3 yields 0x10FFFF.
    """
    for length, limit in ((1, 0x7F), (2, 0x7FF), (3, 0xFFFF)):
        if n == length:
            return limit
    return 0x10FFFF
42
43#
44# Given two codepoints lo, hi: return the number of
45# leading UTF-8 bytes that their respective UTF-8
46# representations have in common.
def common_utf8_leading_bytes(lo, hi):
    """Count the leading UTF-8 bytes shared by the encodings of `lo` and `hi`.

    Returns 0 when the two encodings differ in length or share no leading
    byte, and the full sequence length when lo == hi.
    """
    seq_len = utf8_length(lo)
    if seq_len != utf8_length(hi):
        return 0
    # Each pass drops the payload of one trailing byte (six bits); once the
    # remaining high bits agree, `shared` bytes are still in play.
    for shared in range(seq_len, 0, -1):
        if lo == hi:
            return shared
        lo >>= 6
        hi >>= 6
    return 0
58
def utf8_range_compiler(cgo, lo, hi, targetVar):
    """Emit, through `cgo`, a definition of `targetVar` for codepoints lo..hi.

    When hi needs more UTF-8 bytes than lo, the range is split at the largest
    codepoint of the next-shorter length; each half is compiled into its own
    variable (`targetVar` suffixed with "_<byte length>") and `targetVar` is
    assigned their union. Otherwise the work is handed straight to
    matched_sequence_compiler.
    """
    lo_len = utf8_length(lo)
    hi_len = utf8_length(hi)
    if hi_len <= lo_len:
        matched_sequence_compiler(cgo, lo, hi, 1, hi_len, targetVar)
        return
    # Different length code unit sequences are involved: build a union of
    # two subranges split just below the first codepoint of length hi_len.
    split = max_codepoint_of_length(hi_len - 1)
    shorter_var = "%s_%i" % (targetVar, lo_len)
    longer_var = "%s_%i" % (targetVar, hi_len)
    utf8_range_compiler(cgo, lo, split, shorter_var)
    utf8_range_compiler(cgo, split + 1, hi, longer_var)
    union = make_or(Var(shorter_var), Var(longer_var))
    cgo.add_assignment(targetVar, cgo.expr2py(union))
74
def matched_sequence_compiler(cgo, lo, hi, n, hlen, targetVar):
    """Generate the code necessary to match bytes n through hlen (1-based
    indexing) of the range of UTF-8 sequences for codepoints lo through hi.

    The resulting definition is emitted through `cgo` under the name
    `targetVar`. Helper definitions are emitted under names derived from
    `targetVar` ("_sfx", "_hi", "_lo") or from the byte values ("CC_...").
    utf8_range_compiler calls this with n == 1 and hlen set to the UTF-8
    length of hi.
    """
    hbyte = utf8_byte(hi, n)
    lbyte = utf8_byte(lo, n)
    if n == hlen:
        # Last byte of the sequence: a plain byte-range class is enough.
        cgo.chardef2py(CanonicalCharSetDef(targetVar, [(lbyte, hbyte)]))
        return
    remaining = hlen - n
    #
    # One or more bytes of the lower and upper bound may be the same.
    # Build a sequence of byte tests.
    if hbyte == lbyte:
        sfxVar = targetVar + "_sfx"
        matched_sequence_compiler(cgo, lo, hi, n + 1, hlen, sfxVar)
        CCvar = "CC_%x" % (hbyte)
        cgo.chardef2py(CanonicalCharSetDef(CCvar, [(lbyte, hbyte)]))
        # The suffix variable is aligned with byte hlen (the base case above
        # is a class on byte hlen, and the all-suffix case below advances by
        # hlen - n), so the class for byte n must be advanced by the same
        # hlen - n before the two are combined. Advancing by 1 only lines up
        # when n + 1 == hlen.
        shifted = make_shift_forward(Var(CCvar), remaining)
        cgo.add_assignment(targetVar, cgo.expr2py(make_and(shifted, Var(sfxVar))))
        return
    # We now have a range involving different bytes at position n.
    following_suffix_mask = (1 << (remaining * 6)) - 1
    # A separate test may be needed for the high byte sequence if
    # there are constraints on following suffix bytes.
    if (hi & following_suffix_mask) != following_suffix_mask:
        hi_floor = hi & ~following_suffix_mask
        hiVar = targetVar + "_hi"
        loVar = targetVar + "_lo"
        # hi_floor..hi shares byte n; lo..hi_floor-1 ends on a full suffix.
        matched_sequence_compiler(cgo, hi_floor, hi, n, hlen, hiVar)
        matched_sequence_compiler(cgo, lo, hi_floor - 1, n, hlen, loVar)
        cgo.add_assignment(targetVar, cgo.expr2py(make_or(Var(hiVar), Var(loVar))))
        return
    # A separate test may be needed for the low byte sequence if
    # there are constraints on following suffix bytes.
    if (lo & following_suffix_mask) != 0:
        low_ceil = lo | following_suffix_mask
        hiVar = targetVar + "_hi"
        loVar = targetVar + "_lo"
        # lo..low_ceil shares byte n; low_ceil+1..hi starts on a zero suffix.
        matched_sequence_compiler(cgo, low_ceil + 1, hi, n, hlen, hiVar)
        matched_sequence_compiler(cgo, lo, low_ceil, n, hlen, loVar)
        cgo.add_assignment(targetVar, cgo.expr2py(make_or(Var(hiVar), Var(loVar))))
        return
    #
    # Now we have a range that permits all suffix combinations.
    # We don't have to test suffixes UNDER THE ASSUMPTION that the utf-8
    # has been validated.
    CCvar = "CC_%x_%x" % (lbyte, hbyte)
    cgo.chardef2py(CanonicalCharSetDef(CCvar, [(lbyte, hbyte)]))
    cgo.add_assignment(targetVar, cgo.expr2py(Adv(Var(CCvar), remaining)))
123
124
def generate_utf8_leading_bytes_test(codepoint, bytecount, targetVar):
    """Return assignment statements testing the first `bytecount` UTF-8 bytes
    of `codepoint`, accumulating the result in `targetVar`.

    With bytecount == 0 the single statement assigns "1" to `targetVar`.
    Otherwise the first statement assigns the class of byte 1, and each later
    statement combines the shifted-forward `targetVar` with the class of the
    next byte.
    """
    if bytecount == 0:
        return [make_assign(targetVar, "1")]
    lead_byte = utf8_byte(codepoint, 1)
    stmts = [make_assign(targetVar, ByteClassCompiler(lead_byte))]
    for position in range(2, bytecount + 1):
        next_byte = utf8_byte(codepoint, position)
        extended = make_and(make_shift_forward(targetVar, 1), ByteClassCompiler(next_byte))
        stmts.append(make_assign(targetVar, extended))
    return stmts
135
def generate_utf8_intermediate_bytes_test(codepoint, startbyte, endbyte, targetVar):
    """Return assignment statements extending a test held in `targetVar`
    from byte `startbyte` up to byte `endbyte` of `codepoint`'s UTF-8 form.

    When startbyte == 1 this is the same as
    generate_utf8_leading_bytes_test(codepoint, endbyte, targetVar).
    Otherwise one statement is produced for each of the bytes
    startbyte+1 .. endbyte; the list is empty when startbyte >= endbyte.
    """
    if startbyte == 1:
        return generate_utf8_leading_bytes_test(codepoint, endbyte, targetVar)
    # The statement list has to exist before the loop appends to it (and
    # before it is returned when the loop body never runs).
    stmts = []
    byteno = startbyte
    while byteno < endbyte:
        byteno += 1
        sfx_byte = utf8_byte(codepoint, byteno)
        stmts.append(make_assign(targetVar, make_and(make_shift_forward(targetVar, 1), ByteClassCompiler(sfx_byte))))
    return stmts
144
145import re
146
# Line formats recognised in DerivedGeneralCategory.txt:
#   "XXXX ; Cc # ..."        - a single codepoint
#   "XXXX..YYYY ; Cc # ..."  - an inclusive codepoint range
# Codepoints are 4 to 6 upper-case hex digits; the category is an upper-case
# letter followed by a lower-case letter or digit. Raw strings keep the "\s"
# escapes from being interpreted (and warned about) as string escapes.
Unicode_point_regexp = re.compile(r"^([0-9A-F]{4,6})\s+;\s+([A-Z][a-z0-9])\s+#")
Unicode_range_regexp = re.compile(r"^([0-9A-F]{4,6})[.][.]([0-9A-F]{4,6})\s+;\s+([A-Z][a-z0-9])\s+#")
149
150
def parse_general(filename="DerivedGeneralCategory.txt"):
    """Parse a DerivedGeneralCategory.txt style file.

    Lines matching Unicode_point_regexp contribute a (cp, cp) pair and lines
    matching Unicode_range_regexp contribute a (first, last) pair to the
    category named on the line; every other line is ignored.

    Returns a tuple (category_size, category_def):
      category_size maps category name -> number of entries recorded for it
      category_def  maps category name -> list of (lo, hi) codepoint pairs,
                    in file order
    """
    category_size = {}
    category_def = {}
    # The with-block guarantees the file is closed, including on error.
    with open(filename) as f:
        for line in f:
            m = Unicode_point_regexp.match(line)
            if m:
                lo = hi = int(m.group(1), 16)
                category = m.group(2)
            else:
                m = Unicode_range_regexp.match(line)
                if not m:
                    continue
                lo = int(m.group(1), 16)
                hi = int(m.group(2), 16)
                category = m.group(3)
            if category not in category_size:
                category_size[category] = 0
                category_def[category] = []
            category_def[category].append((lo, hi))
            category_size[category] += 1
    return (category_size, category_def)
181
182
def generateCharClassDefsInIfHierarchy(cgo, enclosingRange, ifRangeList, charClassMap):
    """Emit character class definitions for `enclosingRange` into `cgo`,
    nesting the definitions for each range of `ifRangeList` inside an if
    statement guarded by a test for that range.

    enclosingRange is an inclusive (lo, hi) codepoint pair. ifRangeList is
    read front to back: a range listed after another range and starting
    inside it is treated as nested within it. Parts of the enclosing range
    not covered by any listed range get their definitions emitted directly
    via generateCharClassSubDefs.

    Returns cgo.showcode(). Raises Exception when a range starts inside the
    enclosing range but ends beyond it.
    """
    (outer_lo, outer_hi) = enclosingRange
    while ifRangeList:
        (lo, hi) = ifRangeList[0]
        # A range starting beyond the enclosing range belongs to a caller.
        if lo > outer_hi:
            break
        if hi > outer_hi:
            raise Exception("Bad range nested (%i, %i) within (%i, %i)\n" % (lo, hi, outer_lo, outer_hi))
        # We have more subranges of the enclosing range
        if lo > outer_lo:
            # An innermost nest, not further embedded.
            generateCharClassSubDefs(cgo, outer_lo, lo - 1, charClassMap)
        ifRangeList = ifRangeList[1:]
        range_var = "CC_%x_%x" % (lo, hi)
        utf8_range_compiler(cgo, lo, hi, range_var)
        inner_cgo = CC_compiler(UTF8(), range_var + '_tmp%i', False, '')
        generateCharClassDefsInIfHierarchy(inner_cgo, (lo, hi), ifRangeList, charClassMap)
        cgo.add_if_stmt(Var(range_var), inner_cgo.generated_code)
        # The recursive call works on its own view of the list, so the ranges
        # it handled (those starting within lo..hi) are still at the front of
        # ours. Drop them so they are not compiled a second time at this level.
        while ifRangeList and ifRangeList[0][0] <= hi:
            ifRangeList = ifRangeList[1:]
        outer_lo = hi + 1
    # Final innermost_nest
    if outer_lo <= outer_hi:
        generateCharClassSubDefs(cgo, outer_lo, outer_hi, charClassMap)
    return cgo.showcode()
205
def generateCharClassSubDefs(cgo, lo, hi, charClassMap):
    """For every class in `charClassMap`, emit into `cgo` the part of the
    class that falls within lo..hi and OR it into "struct_<name>.cc".
    """
    for name in charClassMap:
        accumulator = "struct_%s.cc" % name
        piece_var = "CC_%s_%x_%x" % (name, lo, hi)
        piece_ranges = rangeIntersect(charClassMap[name], lo, hi)
        cgo.chardef2py(CanonicalCharSetDef(piece_var, piece_ranges))
        merged = make_or(Var(accumulator), Var(piece_var))
        cgo.add_assignment(accumulator, cgo.expr2py(merged))
212
def rangeIntersect(ccList, lo, hi):
    """Clip each (first, last) pair of `ccList` to the interval lo..hi.

    Pairs lying entirely outside lo..hi are dropped; the rest are returned,
    in their original order, as a new list of clipped pairs.
    """
    clipped = []
    for p in ccList:
        if p[0] > hi or p[1] < lo:
            continue
        clipped.append((max(lo, p[0]), min(hi, p[1])))
    return clipped
215
def generateCharClassDefs(ifRangeList, charClassMap):
    """Generate definitions for every class in `charClassMap` over the whole
    range 0..0x10FFFF, using `ifRangeList` for the if-hierarchy.

    Each "struct_<name>.cc" is first assigned '0'; the result is whatever
    generateCharClassDefsInIfHierarchy returns.
    """
    cgo = CC_compiler(UTF8(), 'tmp%i', False, '')
    for name in charClassMap:
        cgo.add_assignment("struct_%s.cc" % name, '0')
    full_range = (0, 0x10FFFF)
    return generateCharClassDefsInIfHierarchy(cgo, full_range, ifRangeList, charClassMap)
221
222
# Default if-hierarchy: each range is followed by the ranges nested in it.
defaultIfRangeList = [
    (0, 0x7FF),
    (0, 0x7F),
    (0x80, 0x3FF),
    (0x400, 0x7FF),
    (0x800, 0xFFFF),
    (0x10000, 0x10FFFF),
]


# Text templates for the generated class and function header; both are
# filled in with the general category name.
Unicode_CC_struct = (
    "class struct_%s:\n"
    "\tcc = 0\n"
    "\n"
)
Unicode_CC_header = "def %s(basis_bits, struct_%s):\n"
228
def generateDefs1(general_category):
    """Return the generated source text for one general category.

    The text is the struct template, then the function header, then the code
    produced by generateCharClassDefs for that category alone. The category's
    ranges are looked up in the global UnicodeCategoryMap.
    """
    only_this_category = {general_category: UnicodeCategoryMap[general_category]}
    struct_text = Unicode_CC_struct % (general_category)
    header_text = Unicode_CC_header % (general_category, general_category)
    body = generateCharClassDefs(defaultIfRangeList, only_this_category)
    return struct_text + header_text + "".join(body)
236
237
def main():
    """Command-line entry point.

    Parses DerivedGeneralCategory.txt, generates the definitions for the
    category chosen with -c/--category (default Cc) and writes them to the
    output file named on the command line, or prints them when no file is
    given. More than one positional argument prints the usage message.

    Exits through sys.exit with an error message when the requested category
    does not appear in the parsed data.
    """
    global options, UnicodeCategoryMap
    # Option definition
    option_parser = optparse.OptionParser(usage='python %prog [options] <output file>', version='0.1')

    option_parser.add_option('-c', '--category',
                             dest='category',
                             type='string',
                             default='Cc',
                             help='general category; default: Cc',
                             )
    options, args = option_parser.parse_args(sys.argv[1:])

    # Only the category -> ranges map is needed here.
    UnicodeCategoryMap = parse_general()[1]

    if options.category not in UnicodeCategoryMap:
        # Stop here: generateDefs1 would fail on the missing map entry.
        sys.exit("Unknown general category %s" % options.category)
    code = generateDefs1(options.category)

    if len(args) == 1:
        with open(args[0], "w") as fh:
            fh.write(code)
    elif len(args) == 0:
        print(code)
    else:
        option_parser.print_usage()


if __name__ == "__main__":
    main()
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
Note: See TracBrowser for help on using the repository browser.