source: proto/charsetcompiler/unicode_category_compiler.py @ 3987

Last change on this file since 3987 was 3987, checked in by cameron, 5 years ago

Filter newlines for catgrep

File size: 16.9 KB
Line 
1#
2# Prototype for computing utf8 character classes
3# Assuming byte-class/byte-range compilers exist.
4#
5# Robert D. Cameron, June 2, 2013
6#
7# Licensed under Open Software License 3.0.
8
9from pablo_expr import *
10from CC_compiler import *
11from UTF_encoding import *
12from charset_def import *
13import optparse, sys
14
15#
16# Definitions for debugging/prototyping
17def make_shift_forward(e, n): return Adv(e, n)
18def make_if(p, s): return ["if %s\n" %p] + ["  " + x for x in s] + ["endif %s\n" %p]
19#
20def utf8_length(codepoint):
21   if codepoint <= 0x7F: return 1
22   elif codepoint <= 0x7FF: return 2
23   elif codepoint <= 0xFFFF: return 3
24   else: return 4
25
26def utf8_byte(codepoint, n):
27   len = utf8_length(codepoint)
28   if n == 1:
29     if len == 1: return codepoint
30     elif len == 2: return 0xC0 + (codepoint >> 6) 
31     elif len == 3: return 0xE0 + (codepoint >> 12) 
32     elif len == 4: return 0xF0 + (codepoint >> 18) 
33   else:
34     bits = (codepoint >> (6 * (len - n))) & 0x3F
35     return 0x80 + bits
36
37def max_codepoint_of_length(n):
38   if n == 1: return 0x7F
39   elif n == 2: return 0x7FF
40   elif n == 3: return 0xFFFF
41   else: return 0x10FFFF
42
43#
44# Given two codepoints lo, hi: return the number of
45# leading UTF-8 bytes that their respective UTF-8
46# representations have in common.
47def common_utf8_leading_bytes(lo, hi):
48   u8len_lo = utf8_length(lo)
49   u8len_hi = utf8_length(hi)
50   if u8len_lo != u8len_hi: return 0
51   remaining = u8len_lo
52   while remaining > 0:
53     if lo == hi: return remaining
54     lo >>= 6
55     hi >>= 6
56     remaining -= 1
57   return 0
58
59def utf8_range_compiler(cgo, lo, hi, targetVar):
60   lo_len = utf8_length(lo)
61   hi_len = utf8_length(hi)
62   # If different length code unit sequences are involved, make
63   # a union of equilength subranges.
64   if hi_len > lo_len:
65     m = max_codepoint_of_length(hi_len - 1)
66     targetV_lo = "%s_%i" % (targetVar, lo_len)   
67     targetV_hi = "%s_%i" % (targetVar, hi_len)
68     utf8_range_compiler(cgo, lo, m, targetV_lo)
69     utf8_range_compiler(cgo, m+1, hi, targetV_hi)
70     cgo.add_assignment(targetVar, cgo.expr2py(make_or(Var(targetV_lo), Var(targetV_hi))))
71   #
72   else:
73     matched_sequence_compiler(cgo, lo, hi, 1, hi_len, targetVar)
74
75
76def matched_sequence_compiler(cgo, lo, hi, hlen):
77   return matched_sequence_helper(cgo, lo, hi, TrueLiteral(), 1, hlen)
78
79def matched_sequence_helper(cgo, lo, hi, prefix, n, hlen):
80   """ Helper function to generate the code necessary to match bytes
81       n through hlen (1-based indexing) of the range of utf-8 sequences
82       for codepoints lo through hi. """
83   hbyte = utf8_byte(hi, n)
84   lbyte = utf8_byte(lo, n)
85   if n == hlen:
86     targetVar = "bytetest_%x_%x" % (lbyte, hbyte)
87     cgo.chardef2py(CanonicalCharSetDef(targetVar, [(lbyte, hbyte)]))
88     if n == 1: return targetVar
89     return cgo.expr_string_to_variable(cgo.expr2py(make_and(make_shift_forward(prefix, 1), Var(targetVar))))
90   #
91   # One or more bytes of the lower and upper bound may be the same.
92   # Build a sequence of byte tests.
93   if hbyte == lbyte:
94     targetVar = "bytetest_%x" % (lbyte)
95     cgo.chardef2py(CanonicalCharSetDef(targetVar, [(lbyte, hbyte)]))
96     return matched_sequence_helper(cgo, lo, hi, make_and(make_shift_forward(prefix, 1), Var(targetVar)), n+1, hlen)
97   # We now have a range involving different bytes at position n.
98   following_suffix_mask = (1 << ((hlen - n) * 6)) - 1
99   # A separate test may be needed for the high byte sequence if
100   # there are constraints on following suffix bytes.
101   if hi & following_suffix_mask != following_suffix_mask:
102     hi_floor = hi &~following_suffix_mask     
103     hiVar = matched_sequence_helper(cgo, hi_floor, hi, prefix, n, hlen)
104     loVar = matched_sequence_helper(cgo, lo, hi_floor - 1, prefix, n, hlen)
105     return cgo.expr_string_to_variable(cgo.expr2py(make_or(Var(loVar), Var(hiVar))))
106   # A separate test may be needed for the low byte sequence if
107   # there are constraints on following suffix bytes.
108   if lo & following_suffix_mask != 0:
109     low_ceil = lo | following_suffix_mask
110     hiVar = matched_sequence_helper(cgo, low_ceil + 1, hi, prefix, n, hlen)
111     loVar = matched_sequence_helper(cgo, lo, low_ceil, prefix, n, hlen)
112     return cgo.expr_string_to_variable(cgo.expr2py(make_or(Var(loVar), Var(hiVar))))
113   #
114   # Now we have a range that permits all suffix combinations.
115   # We don't have to test suffixes UNDER THE ASSUMPTION that the utf-8
116   # has been validated.
117   targetVar = "bytetest_%x_%x" % (lbyte, hbyte)
118   cgo.chardef2py(CanonicalCharSetDef(targetVar, [(lbyte, hbyte)]))
119   if n == 1: return targetVar
120   return matched_sequence_helper(cgo, lo, hi, make_and(make_shift_forward(prefix, 1), Var(targetVar)), n+1, hlen)
121
122
123def matched_ifsequence_compiler(cgo, lo, hi, hlen):
124   return matched_ifsequence_helper(cgo, lo, hi, TrueLiteral(), 1, hlen)
125
126def matched_ifsequence_helper(cgo, lo, hi, prefix, n, hlen):
127   """ Helper function to generate the code necessary to match bytes
128       n through hlen (1-based indexing) of the range of utf-8 sequences
129       for codepoints lo through hi. """
130   hbyte = utf8_byte(hi, n)
131   lbyte = utf8_byte(lo, n)
132   if n == hlen:
133     targetVar = "bytetest_%x_%x" % (lbyte, hbyte)
134     cgo.chardef2py(CanonicalCharSetDef(targetVar, [(lbyte, hbyte)]))
135     if n == 1: return targetVar
136     else: return cgo.expr_string_to_variable(cgo.expr2py(make_and(make_shift_forward(prefix, 1), Var(targetVar))))
137   #
138   # One or more bytes of the lower and upper bound may be the same.
139   # Build a sequence of byte tests.
140   if hbyte == lbyte:
141     targetVar = "bytetest_%x" % (lbyte)
142     cgo.chardef2py(CanonicalCharSetDef(targetVar, [(lbyte, hbyte)]))
143     return matched_ifsequence_helper(cgo, lo, hi, make_and(make_shift_forward(prefix, 1), Var(targetVar)), n+1, hlen)
144   # We now have a range involving different bytes at position n.
145   following_suffix_mask = (1 << ((hlen - n) * 6)) - 1
146   # A separate test may be needed for the high byte sequence if
147   # there are constraints on following suffix bytes.
148   if hi & following_suffix_mask != following_suffix_mask:
149     hi_floor = hi &~following_suffix_mask     
150     hiVar = matched_ifsequence_helper(cgo, hi_floor, hi, prefix, n, hlen)
151     loVar = matched_ifsequence_helper(cgo, lo, hi_floor - 1, prefix, n, hlen)
152     return cgo.expr_string_to_variable(cgo.expr2py(make_or(Var(loVar), Var(hiVar))))
153   # A separate test may be needed for the low byte sequence if
154   # there are constraints on following suffix bytes.
155   if lo & following_suffix_mask != 0:
156     low_ceil = lo | following_suffix_mask
157     hiVar = matched_ifsequence_helper(cgo, low_ceil + 1, hi, prefix, n, hlen)
158     loVar = matched_ifsequence_helper(cgo, lo, low_ceil, prefix, n, hlen)
159     return cgo.expr_string_to_variable(cgo.expr2py(make_or(Var(loVar), Var(hiVar))))
160   #
161   # Now we have a range that permits all suffix combinations.
162   # We don't have to test suffixes UNDER THE ASSUMPTION that the utf-8
163   # has been validated.
164   targetVar = "bytetest_%x_%x" % (lbyte, hbyte)
165   cgo.chardef2py(CanonicalCharSetDef(targetVar, [(lbyte, hbyte)]))
166   if n == 1: return targetVar
167   return cgo.expr_string_to_variable(cgo.expr2py(make_and(make_shift_forward(prefix, 1), Var(targetVar))))
168
169
170# Generate a simplest possible test for a Unicode codepoint range
171# such that each 1 bit marks a position within a UTF-8 initial
172# subsequence such that each legal continuation of that subsequence
173# is within the range.  Return the generated variable.
174def utf8_ifrange_compiler(cgo, lo, hi):
175   lo_len = utf8_length(lo)
176   hi_len = utf8_length(hi)
177   # If different length code unit sequences are involved, make
178   # a union of equilength subranges.
179   if hi_len > lo_len:
180     m = max_codepoint_of_length(hi_len - 1)
181     v_lo = utf8_ifrange_compiler(cgo, lo, m)
182     v_hi = utf8_ifrange_compiler(cgo, m+1, hi)
183     return cgo.expr_string_to_variable(cgo.expr2py(make_or(Var(v_lo), Var(v_hi))))
184   #
185   else:
186     return matched_ifsequence_compiler(cgo, lo, hi, hi_len)
187
188def generate_utf8_leading_bytes_test(codepoint, bytecount, targetVar):
189  if bytecount == 0: return [make_assign(targetVar, "1")]
190  byte1 = utf8_byte(codepoint, 1)
191  stmts = [make_assign(targetVar, ByteClassCompiler(byte1))]
192  byteno = 1
193  while byteno < bytecount:
194    byteno += 1
195    sfx_byte = utf8_byte(codepoint, byteno)
196    stmts.append(make_assign(targetVar, make_and(make_shift_forward(targetVar, 1), ByteClassCompiler(sfx_byte))))
197  return stmts
198
199def generate_utf8_intermediate_bytes_test(codepoint, startbyte, endbyte, targetVar):
200  if startbyte == 1: return generate_utf8_leading_bytes_test(codepoint, endbyte, targetVar)
201  byteno = startbyte
202  while byteno < endbyte:
203    byteno += 1
204    sfx_byte = utf8_byte(codepoint, byteno)
205    stmts.append(make_assign(targetVar, make_and(make_shift_forward(targetVar, 1), ByteClassCompiler(sfx_byte))))
206  return stmts
207
208import re
209
210Unicode_point_regexp = re.compile("^([0-9A-F]{4,6})\s+;\s+([A-Z][a-z0-9])\s+#")
211Unicode_range_regexp = re.compile("^([0-9A-F]{4,6})[.][.]([0-9A-F]{4,6})\s+;\s+([A-Z][a-z0-9])\s+#")
212
213
214def parse_general():
215  category_size = {}
216  category_def = {}
217  f = open("DerivedGeneralCategory.txt")
218  lines = f.readlines()
219  for t in lines:
220    m = Unicode_point_regexp.match(t)
221    if m:
222      point = m.group(1)
223      category = m.group(2)
224      if not category_size.has_key(category):
225        category_size[category] = 0
226        category_def[category] = []
227      pval = int(point, 16)
228      category_def[category].append((pval, pval))
229      category_size[category] += 1     
230    m = Unicode_range_regexp.match(t)
231    if m:
232      point1 = m.group(1)
233      point2 = m.group(2)
234      category = m.group(3)
235      if not category_size.has_key(category):
236        category_size[category] = 0
237        category_def[category] = []
238      pval1 = int(point1, 16)
239      pval2 = int(point2, 16)
240      category_def[category].append((pval1, pval2))
241      category_size[category] += 1
242  return (category_size, category_def)
243  f.close()
244
245
246
247def generateCharClassDefsInIfHierarchy(cgo, enclosingRange, ifRangeList, charClassMap):
248#   inner_code = []
249   (outer_lo, outer_hi) = enclosingRange
250   enclosedRanges = rangeIntersect(ifRangeList, outer_lo, outer_hi)
251   missingRanges = rangeGaps(enclosedRanges, outer_lo, outer_hi)
252   for rg in missingRanges:
253     (rglo, rghi) = rg
254     generateCharClassSubDefs(cgo, rglo, rghi, charClassMap)
255   topRanges = outerRanges(enclosedRanges)
256   inner = innerRanges(enclosedRanges)
257   for rg in topRanges:
258     (rglo, rghi) = rg
259     inner_cgo = CC_compiler(UTF8(), "r%x_%x" % (rglo, rghi) + '_tmp%i', False, '')
260     inner_cgo.add_common_expressions(cgo)
261     generateCharClassDefsInIfHierarchy(inner_cgo, rg, inner, charClassMap)
262     if inner_cgo.generated_code != []:
263        range_var = utf8_ifrange_compiler(cgo, rglo, rghi)
264        cgo.add_if_stmt(Var(range_var), inner_cgo.generated_code)
265   return cgo.showcode()
266
267def generateCharClassSubDefs(cgo, lo, hi, charClassMap):
268   for k in charClassMap.keys():
269     subcc1 = rangeIntersect(charClassMap[k], lo, hi)
270     # Divide by UTF-8 length
271     for byte_range in [(0, 0x7F), (0x80, 0x7FF), (0x800, 0xFFFF), (0x10000, 0x10FFFF)]:
272        (lo1, hi1) = byte_range
273        subcc2 = rangeIntersect(subcc1, lo1, hi1)
274        ulen = utf8_length(lo1)
275        for subrange in subcc2:
276           (lo2, hi2) = subrange
277           subrangeE = matched_sequence_compiler(cgo, lo2, hi2, ulen)
278           if options.grep:
279              target = "all_chars"
280           else:
281              target = "struct_%s.cc" % k
282           cgo.add_assignment(target, cgo.expr2py(make_or(Var(subrangeE), Var(target))))
283
284def rangeIntersect(ccList, lo, hi):
285    return [(max(lo, p[0]), min(hi, p[1])) for p in ccList if p[0] <= hi and p[1] >= lo]
286
287def rangeGaps(ccList, lo, hi):
288    if lo >= hi: return []
289    if ccList == []: return [(lo, hi)]
290    (lo1, hi1) = ccList[0]
291    if hi1 < lo: return rangeGaps(ccList[1:], lo, hi)
292    if lo1 > lo: return [(lo, lo1 - 1)] + rangeGaps(ccList[1:], hi1+1, hi)
293    elif hi1 < hi: return rangeGaps(ccList[1:], hi1+1, hi)
294    else: return []
295
296def outerRanges(ccList):
297    if len(ccList) <= 1: return ccList
298    (lo1, hi1) = ccList[0]
299    (lo2, hi2) = ccList[1]
300    if hi2 <= hi1: return outerRanges([(lo1, hi1)] + ccList[2:])
301    else: return [(lo1, hi1)] + outerRanges(ccList[1:])
302
303def innerRanges(ccList):
304    if len(ccList) <= 1: return []
305    (lo1, hi1) = ccList[0]
306    (lo2, hi2) = ccList[1]
307    if hi2 <= hi1: return [(lo2, hi2)] + innerRanges([(lo1, hi1)] + ccList[2:])
308    else: return innerRanges(ccList[1:])
309
310
311
312def generateCharClassDefs(ifRangeList, charClassMap):
313   cgo = CC_compiler(UTF8(), 'tmp%i', False, '')
314   for k in charClassMap.keys():
315     if options.grep:
316         cgo.add_assignment("all_chars", '0')
317     else:
318         cgo.add_assignment("struct_%s.cc" % k, '0')
319   generateCharClassDefsInIfHierarchy(cgo, (0, 0x10FFFF), ifRangeList, charClassMap)
320   return cgo.showcode()
321 
322
323#defaultIfRangeList = [(0,0x7FF), (0, 0x7F), (0x80, 0x3FF), (0x400,0x7FF), (0x800, 0xFFFF), (0x10000, 0x10FFFF)]
324
325#defaultIfRangeList = [(0x80,0x10FFFF), (0x80,0x7FF), (0x800,0xFFFF), (0x10000, 0x10FFFF)]
326
327defaultIfRangeList = [(0x80,0x10FFFF), (0x80,0x7FF), (0x100,0x2FF), 
328(0x300,0x36F), (0x370,0x3FF), (0x400,0x7FF), (0x400,0x4FF),  (0x500, 0x52F), (0x530, 0x58F), (0x590,0x5FF), (0x600,0x6FF), 
329(0x700,0x7FF), (0x700,0x74F), (0x750,0x77F), (0x750,0x77F), (0x780,0x7BF), (0x7C0,0x7FF),
330(0x800,0xFFFF), 
331(0x10000, 0x10FFFF)]
332
333
334
335Unicode_CC_struct = "class struct_%s:\n\tcc = 0\n\n"
336Unicode_CC_header = "def %s(basis_bits, struct_%s):\n"
337Unicode_dummy_main = "\n\ndef Main(basis_bits):\n    pass\n"
338
339def generateDefs1(general_category):
340  catmap = {}
341  catmap[general_category] = UnicodeCategoryMap[general_category]
342  struct = Unicode_CC_struct % (general_category)
343  header = "def %s(basis_bits, struct_%s):\n" % (general_category, general_category)
344  if options.grep:
345        struct = r"""
346class Basis_bits():
347        bit_0 = 0
348        bit_1 = 0
349        bit_2 = 0
350        bit_3 = 0
351        bit_4 = 0
352        bit_5 = 0
353        bit_6 = 0
354        bit_7 = 0 
355 
356class Lex():
357        LF = (0)
358 
359class Output():
360        matches = 0
361
362def ParseLines(basis_bits, lex):
363        temp1 = (basis_bits.bit_0 | basis_bits.bit_1)
364        temp2 = (basis_bits.bit_2 | basis_bits.bit_3)
365        temp3 = (temp1 | temp2)
366        temp4 = (basis_bits.bit_4 &~ basis_bits.bit_5)
367        temp5 = (basis_bits.bit_6 &~ basis_bits.bit_7)
368        temp6 = (temp4 & temp5)
369        lex.LF = (temp6 &~ temp3)
370
371"""
372        header = "def Demo(basis_bits, lex, output):\n"
373        main = "\n\ndef Main(basis_bits, lex, output):\n    ParseLines(basis_bits, lex)\n    Demo(basis_bits, lex, output)\n"
374  else:
375        struct = Unicode_CC_struct % (general_category)
376        header = "def %s(basis_bits, struct_%s):\n" % (general_category, general_category)
377        main = Unicode_dummy_main
378  if options.flat:
379      code = generateCharClassDefs([], catmap)
380  elif options.simple:
381      code = generateCharClassDefs([(0x80, 0x7FF), (0x800,0xFFFF), (0x10000, 0x10FFF)], catmap)
382  else:
383      code = generateCharClassDefs(defaultIfRangeList, catmap)
384  if options.grep:
385      code += r"""
386        output.matches = 0
387        all_matches = pablo.Advance(all_chars & ~lex.LF)
388        if all_matches:
389                # Find the last match on each line and filter out all others
390                output.matches = pablo.MatchStar(all_matches, ~lex.LF) & lex.LF
391"""
392  return struct + header + "".join(code) + main
393
394
395def main():   
396
397    global options, UnicodeCategoryMap
398    # Option definition
399    option_parser = optparse.OptionParser(usage='python %prog [options] <output file>', version='0.1')
400 
401    option_parser.add_option('-c', '--category',
402                             dest='category',
403                             type='string',
404                             default='Cc',
405                             help='general category; default: Cc',
406                             )
407    option_parser.add_option('-g', '--grep',
408                             dest='grep',
409                             action='store_true',
410                             default=False,
411                             help='Use grep template',
412                             ) 
413    option_parser.add_option('-f', '--flat',
414                             dest='flat',
415                             action='store_true',
416                             default=False,
417                             help='Flatten the calculations into a single basic block',
418                             ) 
419    option_parser.add_option('-s', '--simple',
420                             dest='simple',
421                             action='store_true',
422                             default=False,
423                             help='Use a simple if-structure on UTF-8 length',
424                             ) 
425    options, args = option_parser.parse_args(sys.argv[1:])
426
427    (catlen, UnicodeCategoryMap) = parse_general()
428   
429    code = ""
430    if options.category == '.':
431        for k in UnicodeCategoryMap.keys():
432            code += generateDefs1(k)
433   
434    else:
435        if options.category not in UnicodeCategoryMap:
436            #define the characters in the list
437            print "Unknown general category %s" % options.category
438            exit
439        code = generateDefs1(options.category)
440
441    if (len(args) == 1):
442        fh = open(args[0], "w")
443        fh.write(code)
444        fh.close()
445    elif len(args) == 0:
446        print code
447    else:
448        option_parser.print_usage()
449       
450
451if __name__ == "__main__": main()
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
Note: See TracBrowser for help on using the repository browser.