source: proto/charsetcompiler/uniscript.py @ 4457

Last change on this file since 4457 was 3973, checked in by cameron, 5 years ago

uniscript.py working script

File size: 14.4 KB
Line 
1#
2# Unicode category scripting.
3#
4
5from pablo_expr import *
6from CC_compiler import *
7from UTF_encoding import *
8from charset_def import *
9import optparse, sys
10import string
11
12#
13# Definitions for debugging/prototyping
def make_shift_forward(e, n):
    """Return the expression ``Adv(e, n)`` for expression *e* and count *n*.

    Thin prototyping alias; ``Adv`` is not defined in this file and is
    presumably supplied by one of the star imports at the top of the
    file -- TODO confirm which module.
    """
    return Adv(e, n)
def make_if(p, s):
    """Wrap the statement strings *s* in a textual ``if p`` / ``endif p`` pair.

    Returns a new list: the line ``"if <p>\\n"``, every element of *s*
    prefixed with two spaces, then the line ``"endif <p>\\n"``.
    """
    opening = "if %s\n" % p
    closing = "endif %s\n" % p
    return [opening] + ["  " + stmt for stmt in s] + [closing]
16#
def utf8_length(codepoint):
    """Return the number of bytes (1-4) in the UTF-8 encoding of *codepoint*.

    Anything above 0xFFFF is reported as 4 bytes; no upper-bound or
    surrogate validation is performed.
    """
    if codepoint <= 0x7F:
        return 1
    if codepoint <= 0x7FF:
        return 2
    if codepoint <= 0xFFFF:
        return 3
    return 4
22
def utf8_byte(codepoint, n):
    """Return byte number *n* (1-based) of the UTF-8 encoding of *codepoint*.

    Raises:
        ValueError: if *n* is not in ``1 .. utf8_length(codepoint)``.
    """
    length = utf8_length(codepoint)
    if n < 1 or n > length:
        raise ValueError("byte index %r out of range for a %d-byte UTF-8 sequence"
                         % (n, length))
    if n == 1:
        if length == 1:
            return codepoint
        # Lead byte: length marker bits plus the top payload bits that
        # remain after the (length - 1) six-bit continuation groups.
        lead_marker = {2: 0xC0, 3: 0xE0, 4: 0xF0}[length]
        return lead_marker + (codepoint >> (6 * (length - 1)))
    # Continuation byte: 10xxxxxx carrying the six-bit group for position n.
    payload = (codepoint >> (6 * (length - n))) & 0x3F
    return 0x80 + payload
33
def max_codepoint_of_length(n):
    """Return the largest codepoint whose UTF-8 encoding is *n* bytes long.

    Any *n* other than 1, 2 or 3 yields 0x10FFFF.
    """
    limits = {1: 0x7F, 2: 0x7FF, 3: 0xFFFF}
    return limits.get(n, 0x10FFFF)
39
40#
41# Given two codepoints lo, hi: return the number of
42# leading UTF-8 bytes that their respective UTF-8
43# representations have in common.
def common_utf8_leading_bytes(lo, hi):
    """Return how many leading UTF-8 bytes the encodings of *lo* and *hi* share.

    Returns 0 when the two encodings have different lengths or differ
    in their first byte.
    """
    length = utf8_length(lo)
    if length != utf8_length(hi):
        return 0
    # Each pass drops the trailing six-bit group of both codepoints; as soon
    # as the remainders are equal, that many leading bytes are identical.
    for shared in range(length, 0, -1):
        if lo == hi:
            return shared
        lo >>= 6
        hi >>= 6
    return 0
55
def utf8_range_compiler(cgo, lo, hi, targetVar):
    """Emit code into *cgo* that defines *targetVar* for codepoints lo..hi.

    When *lo* and *hi* have different UTF-8 lengths the range is split at
    the largest codepoint of length ``hi_len - 1``; both halves are
    compiled recursively into ``<targetVar>_<length>`` variables and
    *targetVar* is assigned their union.  Otherwise the work is delegated
    to ``matched_sequence_compiler``.

    *cgo* is the code-generator object (only ``add_assignment`` and
    ``expr2py`` are used here).  Nothing is returned.
    """
    lo_len = utf8_length(lo)
    hi_len = utf8_length(hi)
    if hi_len <= lo_len:
        # Equal-length encodings: match the byte sequence directly.
        matched_sequence_compiler(cgo, lo, hi, 1, hi_len, targetVar)
        return
    # Different lengths: build a union of two subranges split at the
    # largest codepoint that is one byte shorter than hi.
    split = max_codepoint_of_length(hi_len - 1)
    targetV_lo = "%s_%i" % (targetVar, lo_len)
    targetV_hi = "%s_%i" % (targetVar, hi_len)
    utf8_range_compiler(cgo, lo, split, targetV_lo)
    utf8_range_compiler(cgo, split + 1, hi, targetV_hi)
    union = make_or(Var(targetV_lo), Var(targetV_hi))
    cgo.add_assignment(targetVar, cgo.expr2py(union))
71
def matched_sequence_compiler(cgo, lo, hi, n, hlen, targetVar):
    """Emit code matching bytes n..hlen of the UTF-8 sequences for lo..hi.

    *lo* and *hi* must encode to the same length *hlen*; *n* is the
    1-based index of the first byte still to be tested.  The generated
    definition of *targetVar* marks the LAST byte (position hlen) of each
    matching sequence: the final-byte case uses the byte class unshifted
    and the unconstrained-suffix case advances by ``hlen - n``.

    Code is emitted through *cgo* (``chardef2py``, ``add_assignment``,
    ``expr2py``); nothing is returned.
    """
    hbyte = utf8_byte(hi, n)
    lbyte = utf8_byte(lo, n)

    if n == hlen:
        # Final byte: a plain byte-class test.
        cgo.chardef2py(CanonicalCharSetDef(targetVar, [(lbyte, hbyte)]))
        return

    if hbyte == lbyte:
        # Both bounds share byte n: test it, then the remaining suffix.
        sfxVar = targetVar + "_sfx"
        matched_sequence_compiler(cgo, lo, hi, n + 1, hlen, sfxVar)
        CCvar = "CC_%x" % (hbyte)
        cgo.chardef2py(CanonicalCharSetDef(CCvar, [(lbyte, hbyte)]))
        # sfxVar marks position hlen, so the class for byte n has to be
        # advanced by (hlen - n) positions to line up with it.  Advancing
        # by 1 is only correct when n == hlen - 1.
        aligned_prefix = make_shift_forward(Var(CCvar), hlen - n)
        cgo.add_assignment(targetVar,
                           cgo.expr2py(make_and(aligned_prefix, Var(sfxVar))))
        return

    # The bounds differ at byte n.  The mask covers every payload bit of
    # the bytes after position n.
    following_suffix_mask = (1 << ((hlen - n) * 6)) - 1

    upper = None
    if hi & following_suffix_mask != following_suffix_mask:
        # hi restricts its suffix bytes: peel off the subrange sharing
        # hi's byte n.
        hi_floor = hi & ~following_suffix_mask
        upper, lower = (hi_floor, hi), (lo, hi_floor - 1)
    elif lo & following_suffix_mask != 0:
        # lo restricts its suffix bytes: peel off the subrange sharing
        # lo's byte n.
        low_ceil = lo | following_suffix_mask
        upper, lower = (low_ceil + 1, hi), (lo, low_ceil)

    if upper is not None:
        hiVar = targetVar + "_hi"
        loVar = targetVar + "_lo"
        matched_sequence_compiler(cgo, upper[0], upper[1], n, hlen, hiVar)
        matched_sequence_compiler(cgo, lower[0], lower[1], n, hlen, loVar)
        cgo.add_assignment(targetVar,
                           cgo.expr2py(make_or(Var(hiVar), Var(loVar))))
        return

    # Every suffix combination is permitted.  Suffix bytes need not be
    # tested UNDER THE ASSUMPTION that the UTF-8 has been validated.
    CCvar = "CC_%x_%x" % (lbyte, hbyte)
    cgo.chardef2py(CanonicalCharSetDef(CCvar, [(lbyte, hbyte)]))
    cgo.add_assignment(targetVar, cgo.expr2py(Adv(Var(CCvar), hlen - n)))
120
121
122
123# Generate a simplest possible test for a Unicode codepoint range
124# such that each 1 bit marks a position within a UTF-8 initial
125# subsequence such that each legal continuation of that subsequence
126# is within the range.  Return the generated variable.
def utf8_ifrange_compiler(cgo, lo, hi):
    """Emit a test for the codepoint range lo..hi and return its variable name.

    The intent is a simplest-possible test in which each 1 bit marks a
    position within a UTF-8 initial subsequence all of whose legal
    continuations lie within the range.

    Ranges spanning different UTF-8 lengths are split at the largest
    codepoint of length ``hi_len - 1``; the two halves are compiled
    recursively and their union is assigned to ``test_<lo>_<hi>`` (hex).
    Equal-length ranges are delegated to ``matched_ifsequence_compiler``.
    """
    lo_len = utf8_length(lo)
    hi_len = utf8_length(hi)
    if hi_len <= lo_len:
        return matched_ifsequence_compiler(cgo, lo, hi, 1, hi_len)
    split = max_codepoint_of_length(hi_len - 1)
    v_lo = utf8_ifrange_compiler(cgo, lo, split)
    v_hi = utf8_ifrange_compiler(cgo, split + 1, hi)
    range_var = "test_%x_%x" % (lo, hi)
    cgo.add_assignment(range_var, cgo.expr2py(make_or(Var(v_lo), Var(v_hi))))
    return range_var
142
143
def matched_ifsequence_compiler(cgo, lo, hi, n, hlen):
    """Emit an if-test for bytes n..hlen of the UTF-8 sequences for lo..hi.

    *lo* and *hi* must encode to the same length *hlen*; *n* is the
    1-based index of the byte being examined.  Unlike
    ``matched_sequence_compiler`` the variable name is chosen here and
    returned, and a range that permits every suffix combination is tested
    on byte n alone (no advance to the final byte).

    Returns the name of the generated variable.

    NOTE(review): the equal-byte branch ANDs ``Adv(byte class, 1)`` with
    the suffix variable, which lines up only when the suffix variable
    marks position n + 1.  If the recursion takes the equal-byte branch
    again, the suffix variable appears to mark a later position --
    confirm before using this with ranges that share two or more leading
    bytes of a longer sequence.
    """
    hbyte = utf8_byte(hi, n)
    lbyte = utf8_byte(lo, n)

    if n == hlen:
        # Final byte: a plain byte-class test.
        targetVar = "bytetest_%x_%x" % (lbyte, hbyte)
        cgo.chardef2py(CanonicalCharSetDef(targetVar, [(lbyte, hbyte)]))
        return targetVar

    if hbyte == lbyte:
        # Both bounds share byte n: test it, then the remaining suffix.
        sfxVar = matched_ifsequence_compiler(cgo, lo, hi, n + 1, hlen)
        targetVar = "bytetest_%x" % (lbyte)
        cgo.chardef2py(CanonicalCharSetDef(targetVar, [(lbyte, hbyte)]))
        var2 = targetVar + "_adv"
        shifted = make_shift_forward(Var(targetVar), 1)
        cgo.add_assignment(var2, cgo.expr2py(make_and(shifted, Var(sfxVar))))
        return var2

    # The bounds differ at byte n.  The mask covers every payload bit of
    # the bytes after position n.
    following_suffix_mask = (1 << ((hlen - n) * 6)) - 1

    upper = None
    if hi & following_suffix_mask != following_suffix_mask:
        # hi restricts its suffix bytes: split off the subrange sharing
        # hi's byte n.
        hi_floor = hi & ~following_suffix_mask
        upper, lower = (hi_floor, hi), (lo, hi_floor - 1)
    elif lo & following_suffix_mask != 0:
        # lo restricts its suffix bytes: split off the subrange sharing
        # lo's byte n.
        low_ceil = lo | following_suffix_mask
        upper, lower = (low_ceil + 1, hi), (lo, low_ceil)

    if upper is not None:
        hiVar = matched_ifsequence_compiler(cgo, upper[0], upper[1], n, hlen)
        loVar = matched_ifsequence_compiler(cgo, lower[0], lower[1], n, hlen)
        targetVar = "range_test_%x_%x_%i" % (lo, hi, n)
        cgo.add_assignment(targetVar,
                           cgo.expr2py(make_or(Var(hiVar), Var(loVar))))
        return targetVar

    # Every suffix combination is permitted.  Suffix bytes need not be
    # tested UNDER THE ASSUMPTION that the UTF-8 has been validated.
    targetVar = "bytetest_%x_%x" % (lbyte, hbyte)
    cgo.chardef2py(CanonicalCharSetDef(targetVar, [(lbyte, hbyte)]))
    return targetVar
191
192
193
194
def generate_utf8_leading_bytes_test(codepoint, bytecount, targetVar):
    """Build statements testing the first *bytecount* UTF-8 bytes of *codepoint*.

    Returns a list of ``make_assign`` results.  With ``bytecount == 0``
    the single statement assigns ``"1"`` to *targetVar*.  Otherwise the
    first statement assigns the class of byte 1, and each following
    statement ANDs the previous value, shifted forward by one, with the
    class of the next byte.

    NOTE(review): *targetVar* is passed to ``make_shift_forward`` as given,
    not wrapped in ``Var`` -- confirm that is what ``Adv`` expects.
    """
    if bytecount == 0:
        return [make_assign(targetVar, "1")]
    byte1 = utf8_byte(codepoint, 1)
    stmts = [make_assign(targetVar, ByteClassCompiler(byte1))]
    for byteno in range(2, bytecount + 1):
        sfx_byte = utf8_byte(codepoint, byteno)
        step = make_and(make_shift_forward(targetVar, 1),
                        ByteClassCompiler(sfx_byte))
        stmts.append(make_assign(targetVar, step))
    return stmts
205
def generate_utf8_intermediate_bytes_test(codepoint, startbyte, endbyte, targetVar):
    """Build statements testing UTF-8 bytes startbyte+1..endbyte of *codepoint*.

    With ``startbyte == 1`` this defers to
    ``generate_utf8_leading_bytes_test(codepoint, endbyte, targetVar)``.
    Otherwise each returned statement ANDs *targetVar*, shifted forward by
    one, with the class of the next byte; *targetVar* is presumably
    expected to already hold the match up to *startbyte* -- verify against
    callers.  Returns an empty list when ``startbyte >= endbyte``.
    """
    if startbyte == 1:
        return generate_utf8_leading_bytes_test(codepoint, endbyte, targetVar)
    # The statement list was never initialised on this path, so any call
    # with startbyte != 1 raised NameError.
    stmts = []
    for byteno in range(startbyte + 1, endbyte + 1):
        sfx_byte = utf8_byte(codepoint, byteno)
        step = make_and(make_shift_forward(targetVar, 1),
                        ByteClassCompiler(sfx_byte))
        stmts.append(make_assign(targetVar, step))
    return stmts
214
215import re
216
# Line formats matched in DerivedGeneralCategory.txt:
#   "XXXX ; Cc # ..."        a single codepoint  -> groups (point, category)
#   "XXXX..YYYY ; Cc # ..."  a codepoint range   -> groups (first, last, category)
# Raw strings keep "\s" a regex escape rather than an invalid string escape.
Unicode_point_regexp = re.compile(r"^([0-9A-F]{4,6})\s+;\s+([A-Z][a-z0-9])\s+#")
Unicode_range_regexp = re.compile(r"^([0-9A-F]{4,6})[.][.]([0-9A-F]{4,6})\s+;\s+([A-Z][a-z0-9])\s+#")
219
220
def parse_general():
    """Parse ``DerivedGeneralCategory.txt`` from the current directory.

    Returns:
        ``(category_size, category_def)`` where ``category_def`` maps each
        category name to a list of inclusive ``(lo, hi)`` codepoint pairs
        in file order (single codepoints become ``(p, p)``), and
        ``category_size`` maps each category to the number of entries
        recorded for it.

    Lines matching neither ``Unicode_point_regexp`` nor
    ``Unicode_range_regexp`` are ignored.
    """
    category_size = {}
    category_def = {}
    # The context manager closes the file even if a line fails to parse;
    # the old explicit close() sat after the return and never ran.
    with open("DerivedGeneralCategory.txt") as f:
        for line in f:
            m = Unicode_point_regexp.match(line)
            if m:
                category = m.group(2)
                first = last = int(m.group(1), 16)
            else:
                m = Unicode_range_regexp.match(line)
                if not m:
                    continue
                category = m.group(3)
                first = int(m.group(1), 16)
                last = int(m.group(2), 16)
            category_def.setdefault(category, []).append((first, last))
            category_size[category] = category_size.get(category, 0) + 1
    return (category_size, category_def)
251
252
253
def generateCharClassDefsInIfHierarchy(cgo, enclosingRange, ifRangeList, charClassMap):
    """Emit character-class definitions for *enclosingRange*, nested in if-tests.

    Steps:
      1. Clip *ifRangeList* to *enclosingRange*.
      2. For every part of *enclosingRange* not covered by a clipped
         range, emit the definitions directly into *cgo*.
      3. For each outermost clipped range, recurse with a fresh
         ``CC_compiler`` (the ranges nested inside it become the next
         level's if-range list); when that produced any code, emit a
         range test into *cgo* and wrap the inner code in an if-statement
         guarded by it.

    Returns ``cgo.showcode()``.
    """
    (outer_lo, outer_hi) = enclosingRange
    enclosedRanges = rangeIntersect(ifRangeList, outer_lo, outer_hi)

    # Codepoints outside every if-range get unconditional definitions.
    for (gap_lo, gap_hi) in rangeGaps(enclosedRanges, outer_lo, outer_hi):
        generateCharClassSubDefs(cgo, gap_lo, gap_hi, charClassMap)

    topRanges = outerRanges(enclosedRanges)
    inner = innerRanges(enclosedRanges)
    for rg in topRanges:
        (rglo, rghi) = rg
        # Temporaries of the nested compiler are named after the range.
        tmp_pattern = "r%x_%x" % (rglo, rghi) + '_tmp%i'
        inner_cgo = CC_compiler(UTF8(), tmp_pattern, False, '')
        inner_cgo.add_common_expressions(cgo)
        generateCharClassDefsInIfHierarchy(inner_cgo, rg, inner, charClassMap)
        if inner_cgo.generated_code != []:
            range_var = utf8_ifrange_compiler(cgo, rglo, rghi)
            cgo.add_if_stmt(Var(range_var), inner_cgo.generated_code)
    return cgo.showcode()
273
def generateCharClassSubDefs(cgo, lo, hi, charClassMap):
    """Emit definitions for the part of every class that lies within lo..hi.

    For each class *k* in *charClassMap* the class's ranges are clipped to
    lo..hi and then to each UTF-8 length band, so that every subrange
    handed to ``matched_sequence_compiler`` has a single encoded length.
    Each subrange is compiled into ``CC_<k>_<lo>_<hi>`` (hex) and OR-ed
    into ``struct_<k>.cc``.
    """
    # One band per UTF-8 encoded length (1 to 4 bytes).
    length_bands = [(0, 0x7F), (0x80, 0x7FF), (0x800, 0xFFFF), (0x10000, 0x10FFFF)]
    for k in charClassMap:
        struct_field = "struct_%s.cc" % k
        clipped = rangeIntersect(charClassMap[k], lo, hi)
        for (band_lo, band_hi) in length_bands:
            ulen = utf8_length(band_lo)
            for (lo2, hi2) in rangeIntersect(clipped, band_lo, band_hi):
                CC_var = "CC_%s_%x_%x" % (k, lo2, hi2)
                matched_sequence_compiler(cgo, lo2, hi2, 1, ulen, CC_var)
                accumulated = make_or(Var(struct_field), Var(CC_var))
                cgo.add_assignment(struct_field, cgo.expr2py(accumulated))
287
def rangeIntersect(ccList, lo, hi):
    """Clip the inclusive ``(lo, hi)`` pairs in *ccList* to the window lo..hi.

    Pairs that do not overlap the window are dropped; the rest are
    returned, in their original order, trimmed to the window.
    """
    clipped = []
    for p in ccList:
        if p[0] <= hi and p[1] >= lo:
            clipped.append((max(lo, p[0]), min(hi, p[1])))
    return clipped
290
def rangeGaps(ccList, lo, hi):
    """Return the parts of the inclusive window lo..hi not covered by *ccList*.

    *ccList* holds inclusive ``(lo, hi)`` pairs ordered by start; pairs
    nested inside an earlier pair are tolerated.  The result is a list of
    inclusive ``(lo, hi)`` gaps in ascending order.  A gap of exactly one
    codepoint is reported as ``(p, p)``, and no gap extends beyond *hi*.
    """
    gaps = []
    for (lo1, hi1) in ccList:
        if lo > hi:
            # The window is fully covered.
            break
        if hi1 < lo:
            # Entirely before the uncovered part (e.g. a nested range).
            continue
        if lo1 > lo:
            gaps.append((lo, min(lo1 - 1, hi)))
        lo = hi1 + 1
    # Ranges are inclusive, so lo == hi is still a one-codepoint gap.
    if lo <= hi:
        gaps.append((lo, hi))
    return gaps
299
def outerRanges(ccList):
    """Return the outermost ranges of *ccList*, dropping nested ones.

    *ccList* holds inclusive ``(lo, hi)`` pairs ordered by start.  A pair
    whose upper bound does not exceed that of the most recent outer pair
    is treated as nested inside it and omitted.  A list of zero or one
    element is returned unchanged.
    """
    if len(ccList) <= 1:
        return ccList
    outer = []
    (cur_lo, cur_hi) = ccList[0]
    for (lo2, hi2) in ccList[1:]:
        if hi2 <= cur_hi:
            # Nested inside the current outer range.
            continue
        outer.append((cur_lo, cur_hi))
        (cur_lo, cur_hi) = (lo2, hi2)
    outer.append((cur_lo, cur_hi))
    return outer
306
def innerRanges(ccList):
    """Return the ranges of *ccList* that are nested inside an earlier range.

    Complement of ``outerRanges``: *ccList* holds inclusive ``(lo, hi)``
    pairs ordered by start, and a pair is reported when its upper bound
    does not exceed that of the most recent outer pair.  Order is kept.
    """
    nested = []
    if len(ccList) <= 1:
        return nested
    cur_hi = ccList[0][1]
    for (lo2, hi2) in ccList[1:]:
        if hi2 <= cur_hi:
            nested.append((lo2, hi2))
        else:
            # This pair starts a new outer range.
            cur_hi = hi2
    return nested
313
314
315
def generateCharClassDefs(ifRangeList, charClassMap):
    """Generate the definitions for every class in *charClassMap*.

    Creates a top-level ``CC_compiler``, initialises ``struct_<k>.cc`` to
    ``'0'`` for each class *k*, then emits the if-hierarchy for the whole
    codepoint space 0..0x10FFFF using *ifRangeList*.

    Returns ``cgo.showcode()``.
    """
    cgo = CC_compiler(UTF8(), 'tmp%i', False, '')
    for k in charClassMap:
        cgo.add_assignment("struct_%s.cc" % k, '0')
    whole_codespace = (0, 0x10FFFF)
    generateCharClassDefsInIfHierarchy(cgo, whole_codespace, ifRangeList, charClassMap)
    return cgo.showcode()
322 
323
324#defaultIfRangeList = [(0,0x7FF), (0, 0x7F), (0x80, 0x3FF), (0x400,0x7FF), (0x800, 0xFFFF), (0x10000, 0x10FFFF)]
325
326#defaultIfRangeList = [(0x80,0x10FFFF), (0x80,0x7FF), (0x800,0xFFFF), (0x10000, 0x10FFFF)]
327
# Default if-hierarchy: inclusive (lo, hi) codepoint ranges ordered by start,
# with each enclosing range listed before the ranges nested inside it.
# Each range appears once; a repeated entry is treated as nested inside its
# twin and yields a redundant if-test on the same range.
defaultIfRangeList = [
    (0x80, 0x10FFFF),
    (0x80, 0x7FF),
    (0x100, 0x2FF),
    (0x300, 0x36F),
    (0x370, 0x3FF),
    (0x400, 0x7FF),
    (0x400, 0x4FF),
    (0x500, 0x52F),
    (0x530, 0x58F),
    (0x590, 0x5FF),
    (0x600, 0x6FF),
    (0x700, 0x7FF),
    (0x700, 0x74F),
    (0x750, 0x77F),
    (0x780, 0x7BF),
    (0x7C0, 0x7FF),
    (0x800, 0xFFFF),
    (0x10000, 0x10FFFF),
]
333
# Text templates for the generated Python-style source.
# Unicode_CC_struct:  one %s -- the category name; declares the output struct.
# Unicode_CC_header:  two %s -- the category name twice; the function header.
# Unicode_dummy_main: a placeholder Main definition.
Unicode_CC_struct = "class struct_%s:\n\tcc = 0\n\n"
Unicode_CC_header = "def %s(basis_bits, struct_%s):\n"
Unicode_dummy_main = "\n\ndef Main(basis_bits):\n    pass\n"
337
def generateDefs1(general_category):
    """Return generated source text for one general category.

    The text is the struct declaration, the function header and the
    character-class definitions produced by ``generateCharClassDefs`` with
    ``defaultIfRangeList``.  The category's ranges are read from the global
    ``UnicodeCategoryMap``, which ``main`` fills in; a missing category
    raises ``KeyError``.
    """
    catmap = {general_category: UnicodeCategoryMap[general_category]}
    struct = Unicode_CC_struct % (general_category)
    # Use the shared header template rather than a second copy of the literal.
    header = Unicode_CC_header % (general_category, general_category)
    code = generateCharClassDefs(defaultIfRangeList, catmap)
    return struct + header + "".join(code)
345
346
# C++ wrapper emitted once per category.  Six positional %s slots, in order:
# category name (function suffix), category name (Struct_ type), then the
# lower-cased name four times (output variable, object whose do_block is
# called, output argument, returned variable).
wrapper_template = r"""
extern "C" {
    BitBlock wrapped_get_category_%s(Basis_bits &basis_bits, const char* name){

        Struct_%s %s_output;
        %s.do_block(basis_bits, %s_output);

        return %s_output.cc;
    }
}
"""
358
def generateWrapper():
    """Return the C++ wrapper source for every category in ``UnicodeCategoryMap``.

    One instance of ``wrapper_template`` is produced per key of the global
    ``UnicodeCategoryMap`` (filled in by ``main``), in the map's iteration
    order, and the pieces are concatenated.  An empty map yields ``""``.
    """
    pieces = []
    for k in UnicodeCategoryMap.keys():
        l = k.lower()
        pieces.append(wrapper_template % (k, k, l, l, l, l))
    return "".join(pieces)
366
367
368
369
370
def main():
    """Command-line entry point: generate the category wrapper code.

    Parses the command line, loads the category data with
    ``parse_general`` and builds the wrapper source.  With one positional
    argument the code is written to that file; with none it is written to
    standard output; with more than one the usage message is printed.

    Sets the module globals ``options`` and ``UnicodeCategoryMap``.
    """
    global options, UnicodeCategoryMap
    # Option definition
    option_parser = optparse.OptionParser(usage='python %prog [options] <output file>', version='0.1')

    option_parser.add_option('-c', '--category',
                             dest='category',
                             type='string',
                             default='Cc',
                             help='general category; default: Cc',
                             )
    options, args = option_parser.parse_args(sys.argv[1:])

    (catlen, UnicodeCategoryMap) = parse_general()

    code = generateWrapper()

    if len(args) == 1:
        # The context manager closes the file even if the write fails.
        with open(args[0], "w") as fh:
            fh.write(code)
    elif len(args) == 0:
        # Same output as the Python 2 print statement, valid on 2 and 3.
        sys.stdout.write(code + "\n")
    else:
        option_parser.print_usage()


if __name__ == "__main__":
    main()
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
Note: See TracBrowser for help on using the repository browser.