source: proto/charsetcompiler/unicode_category_compiler.py @ 4185

Last change on this file since 4185 was 4100, checked in by cameron, 5 years ago

Avoid generating redundant sequence_x_y definitions

File size: 22.7 KB
Line 
1#
2# Prototype for computing utf8 character classes
3# Assuming byte-class/byte-range compilers exist.
4#
5# Robert D. Cameron, June 2, 2013
6#
7# Licensed under Open Software License 3.0.
8
9from pablo_expr import *
10from CC_compiler import *
11from UTF_encoding import *
12from charset_def import *
13from UCD.general_category import *
14import optparse, sys
15
16#
17# Definitions for debugging/prototyping
def make_if(p, s):
   """Debug helper: wrap the statement lines s in an if/endif pair testing p."""
   body = ["  " + line for line in s]
   return ["if %s\n" % p] + body + ["endif %s\n" % p]
19#
def utf8_length(codepoint):
   """Return the number of bytes in the UTF-8 encoding of codepoint."""
   for length, limit in enumerate([0x7F, 0x7FF, 0xFFFF], 1):
      if codepoint <= limit:
         return length
   return 4
25
def utf8_byte(codepoint, n):
   """Return byte n (1-based) of the UTF-8 encoding of codepoint."""
   lgth = utf8_length(codepoint)
   if n != 1:
      # Continuation bytes carry successive 6-bit groups, high to low.
      return 0x80 | ((codepoint >> (6 * (lgth - n))) & 0x3F)
   # Leading byte: ASCII as-is, otherwise the length-marking prefix
   # ORed with the top bits of the codepoint.
   if lgth == 1:
      return codepoint
   prefix = {2: 0xC0, 3: 0xE0, 4: 0xF0}[lgth]
   return prefix | (codepoint >> (6 * (lgth - 1)))
36
def max_codepoint_of_length(n):
   """Return the largest codepoint whose UTF-8 encoding uses exactly n bytes."""
   return {1: 0x7F, 2: 0x7FF, 3: 0xFFFF}.get(n, 0x10FFFF)
42
def max_codepoint_with_initial_byte(byte):
   """Largest codepoint whose valid UTF-8 encoding begins with this byte."""
   if byte <= 0x7F:
      return 0x7F
   if byte <= 0xDF:
      # Two-byte lead: all-ones continuation bits.
      return ((byte & 0x1F) << 6) | 0x3F
   if byte == 0xED:
      # 0xED stops just below the surrogate block D800..DFFF.
      return 0xD7FF
   if byte <= 0xEF:
      return ((byte & 0x0F) << 12) | 0xFFF
   if byte == 0xF4:
      # 0xF4 is capped at the Unicode maximum.
      return 0x10FFFF
   return ((byte & 0x07) << 18) | 0x3FFFF
50
def min_codepoint_with_initial_byte(byte):
   """Smallest codepoint whose valid (non-overlong) UTF-8 encoding
   begins with this byte."""
   if byte <= 0x7F: return 0
   elif byte <= 0xDF: return ((byte & 0x1F) <<6)
   elif byte == 0xE0:
     # Bug fix: the shortest valid sequence led by 0xE0 is E0 A0 80 =
     # U+0800 (the generic formula below would give the overlong 0).
     # The previous value 0x1000 is the minimum for lead byte 0xE1.
     return 0x800
   elif byte <= 0xEF: return ((byte & 0x0F) <<12)
   elif byte == 0xF0:
     # F0 90 80 80 = U+10000; the generic formula would be overlong.
     return 0x10000
   else: return ((byte & 0x07) <<18)
58
59#
60# Given two codepoints lo, hi: return the number of
61# leading UTF-8 bytes that their respective UTF-8
62# representations have in common.
def common_utf8_leading_bytes(lo, hi):
   """Return the count of leading bytes shared by the UTF-8 encodings of
   codepoints lo and hi (0 when the encodings differ in length)."""
   if utf8_length(lo) != utf8_length(hi):
      return 0
   # Repeatedly drop 6-bit continuation groups; the first time the
   # remaining prefixes coincide, that many leading bytes are shared.
   shared = utf8_length(lo)
   while shared > 0:
      if lo == hi:
         return shared
      lo >>= 6
      hi >>= 6
      shared -= 1
   return 0
74
75
def matched_ifsequence_compiler(cgo, lo, hi, hlen):
   # Entry point: compile tests for all hlen bytes of the UTF-8 sequences
   # of codepoints lo..hi, starting at byte 1 with an always-true prefix
   # (no preceding bytes to match).  Returns the generated variable name.
   return matched_ifsequence_helper(cgo, lo, hi, TrueLiteral(), 1, hlen)
78
def matched_ifsequence_helper(cgo, lo, hi, prefix, n, hlen):
   """ Helper function to generate the code necessary to match bytes
       n through hlen (1-based indexing) of the range of utf-8 sequences
       for codepoints lo through hi.  prefix is the expression matching
       bytes 1 through n-1; returns the name of the variable holding the
       full test. """
   hbyte = utf8_byte(hi, n)
   lbyte = utf8_byte(lo, n)
   if n == hlen:
     # Last byte: a single byte-range test completes the match.
     targetVar = "bytetest_%x_%x" % (lbyte, hbyte)
     cgo.chardef2py(CanonicalCharSetDef(targetVar, [(lbyte, hbyte)]))
     if n == 1: return targetVar
     else: return cgo.expr_string_to_variable(cgo.expr2py(make_and(make_shift_forward(prefix, 1), Var(targetVar))))
   #
   # One or more bytes of the lower and upper bound may be the same.
   # Build a sequence of byte tests.
   if hbyte == lbyte:
     targetVar = "bytetest_%x" % (lbyte)
     cgo.chardef2py(CanonicalCharSetDef(targetVar, [(lbyte, hbyte)]))
     return matched_ifsequence_helper(cgo, lo, hi, make_and(make_shift_forward(prefix, 1), Var(targetVar)), n+1, hlen)
   # We now have a range involving different bytes at position n.
   # Mask covering every bit contributed by bytes n+1 .. hlen.
   following_suffix_mask = (1 << ((hlen - n) * 6)) - 1
   # A separate test may be needed for the high byte sequence if
   # there are constraints on following suffix bytes.
   if hi & following_suffix_mask != following_suffix_mask:
     hi_floor = hi &~following_suffix_mask
     hiVar = matched_ifsequence_helper(cgo, hi_floor, hi, prefix, n, hlen)
     loVar = matched_ifsequence_helper(cgo, lo, hi_floor - 1, prefix, n, hlen)
     return cgo.expr_string_to_variable(cgo.expr2py(make_or(Var(loVar), Var(hiVar))))
   # A separate test may be needed for the low byte sequence if
   # there are constraints on following suffix bytes.
   if lo & following_suffix_mask != 0:
     low_ceil = lo | following_suffix_mask
     hiVar = matched_ifsequence_helper(cgo, low_ceil + 1, hi, prefix, n, hlen)
     loVar = matched_ifsequence_helper(cgo, lo, low_ceil, prefix, n, hlen)
     return cgo.expr_string_to_variable(cgo.expr2py(make_or(Var(loVar), Var(hiVar))))
   #
   # Now we have a range that permits all suffix combinations.
   # We don't have to test suffixes UNDER THE ASSUMPTION that the utf-8
   # has been validated.
   targetVar = "bytetest_%x_%x" % (lbyte, hbyte)
   cgo.chardef2py(CanonicalCharSetDef(targetVar, [(lbyte, hbyte)]))
   if n == 1: return targetVar
   return cgo.expr_string_to_variable(cgo.expr2py(make_and(make_shift_forward(prefix, 1), Var(targetVar))))
121
122
123# Generate a simplest possible test for a Unicode codepoint range
124# such that each 1 bit marks a position within a UTF-8 initial
125# subsequence such that each legal continuation of that subsequence
126# is within the range.  Return the generated variable.
def utf8_ifrange_compiler(cgo, lo, hi):
   """Generate a simplest-possible test for the codepoint range [lo, hi]:
   each 1 bit marks a position within a UTF-8 initial subsequence whose
   legal continuations all fall within the range.  Returns the name of
   the generated variable."""
   lo_len = utf8_length(lo)
   hi_len = utf8_length(hi)
   if hi_len <= lo_len:
      # All codepoints encode with the same number of bytes.
      return matched_ifsequence_compiler(cgo, lo, hi, hi_len)
   # Mixed encoding lengths: split at the largest codepoint of the
   # next-shorter length and OR the two halves together.
   split = max_codepoint_of_length(hi_len - 1)
   left = utf8_ifrange_compiler(cgo, lo, split)
   right = utf8_ifrange_compiler(cgo, split + 1, hi)
   return cgo.expr_string_to_variable(cgo.expr2py(make_or(Var(left), Var(right))))
140
141#
142# The test may be made up of up to three parts:
143# (a) a multibyte low-boundary test,
144# (b) a multibyte high-boundary test, and
145# (c) a range test.
146# It is possible that the low- and high- boundary tests have
147# a common multibyte prefix.
def utf8_iftest_compiler(cgo, lo, hi):
   # Compile the if-guard test for the codepoint range [lo, hi].  The
   # test combines up to three parts:
   # (a) a multibyte test for codepoints sharing the low bound's lead byte,
   # (b) a multibyte test for codepoints sharing the high bound's lead byte,
   # (c) a single lead-byte range test for everything in between.
   # Returns the name of the variable holding the result.
   lo_byte = utf8_byte(lo, 1)
   hi_byte = utf8_byte(hi, 1)
   if lo_byte == hi_byte:
      # Whole range shares one lead byte: one full sequence test suffices.
      targetVar = "cp_range_%x_%x" % (lo, hi)
      utf8_sequence_generator([(lo, hi)], 1, targetVar, cgo)
      return targetVar
   if lo > 0 and utf8_byte(lo - 1, 1) == lo_byte:
      # lo is not the first codepoint of its lead byte: peel off part (a).
      lo1 = max_codepoint_with_initial_byte(lo_byte)
      targetVar = "cp_range_%x_%x" % (lo, lo1)
      utf8_sequence_generator([(lo, lo1)], 1, targetVar, cgo)
      test_expr1 = Var(targetVar)
      lo_byte = utf8_byte(lo1 + 1, 1)
   else:
      test_expr1 = FalseLiteral()
      # Widen the lead-byte test down to 0xC0: C0/C1 never occur as lead
      # bytes in valid UTF-8, so including them is harmless.
      if lo == 0x80: lo_byte = 0xC0
   if hi < 0x10FFFF and utf8_byte(hi + 1, 1) == hi_byte:
      # hi is not the last codepoint of its lead byte: peel off part (b).
      hi1 = min_codepoint_with_initial_byte(hi_byte)
      targetVar = "cp_range_%x_%x" % (hi1, hi)
      utf8_sequence_generator([(hi1, hi)], 1, targetVar, cgo)
      test_expr2 = Var(targetVar)
      hi_byte = utf8_byte(hi1 - 1, 1)
   else:
      test_expr2 = FalseLiteral()
      # Widen up to 0xFF: F5..FF never occur in valid UTF-8.
      if hi == 0x10FFFF: hi_byte = 0xFF
   if lo_byte > hi_byte: return cgo.expr_string_to_variable(cgo.expr2py(make_or(test_expr1, test_expr2)))
   if lo_byte == hi_byte:
      byteVar = "byte_%x" % lo_byte
   else:
      byteVar = "byte_range_%x_%x" % (lo_byte, hi_byte)
   cgo.chardef_canonical(CanonicalCharSetDef(byteVar, [(lo_byte, hi_byte)]))
   return cgo.expr_string_to_variable(cgo.expr2py(make_or(Var(byteVar), make_or(test_expr1, test_expr2))))
180
181
def generateCharClassDefsInIfHierarchy(cgo, enclosingRange, ifRangeList, charClassMap):
   # Emit character-class definitions for every class in charClassMap,
   # restricted to enclosingRange, nesting generated code inside
   # if-statements according to ifRangeList.
   #   cgo            - CC_compiler receiving the generated code
   #   enclosingRange - (lo, hi) codepoint bounds for this level
   #   ifRangeList    - candidate if-guard ranges (sorted, possibly nested)
   #   charClassMap   - maps class name -> list of codepoint ranges
   # Returns cgo.showcode() for the accumulated code.
   (outer_lo, outer_hi) = enclosingRange
   enclosedRanges = rangeIntersect(ifRangeList, outer_lo, outer_hi)
   missingRanges = rangeGaps(enclosedRanges, outer_lo, outer_hi)
   # Codepoints covered by no if-range are defined unconditionally here.
   for rg in missingRanges:
     (rglo, rghi) = rg
     generateCharClassSubDefs(cgo, rglo, rghi, charClassMap)
   topRanges = outerRanges(enclosedRanges)
   inner = innerRanges(enclosedRanges)
   for rg in topRanges:
     (rglo, rghi) = rg
     # Skip if-ranges that intersect none of the character classes.
     empty_range = True
     for k in charClassMap.keys():
        if rangeIntersect(charClassMap[k], rglo, rghi) != []:
           empty_range = False
           break
     if not empty_range:
       range_var = utf8_iftest_compiler(cgo, rglo, rghi)
       # Compile the body into a child compiler so it can be wrapped in
       # an if-statement as a unit (or dropped when it is empty).
       inner_cgo = CC_compiler(UTF8(), "r%x_%x" % (rglo, rghi) + '_tmp%i', False, '')
       inner_cgo.add_common_expressions(cgo)
       generateCharClassDefsInIfHierarchy(inner_cgo, rg, inner, charClassMap)
       if inner_cgo.generated_code != []:
         cgo.add_if_stmt(Var(range_var), inner_cgo.generated_code)
   return cgo.showcode()
207
def generateCharClassSubDefs(cgo, lo, hi, charClassMap):
   # Generate unconditional definitions for each character class,
   # clipped to codepoints lo..hi.  Reads the module-global `options`:
   # in grep mode every class accumulates into a single all_chars stream.
   for k in charClassMap.keys():
     if options.grep:
        targetVar = "all_chars"
     else:
        targetVar = "struct_%s.cc" % k
     subcc1 = rangeIntersect(charClassMap[k], lo, hi)
     # Divide by UTF-8 length, separating out E0, ED, F0 and F4 ranges
     # so each piece has uniform lead-byte structure.
     for byte_range in [(0, 0x7F), (0x80, 0x7FF), (0x800, 0xFFF), (0x1000, 0xD7FF), (0xE000, 0xFFFF), (0x10000, 0x3FFFF), (0x40000, 0xFFFFF), (0x100000, 0x10FFFF)]:
        (lo1, hi1) = byte_range
        subcc2 = rangeIntersect(subcc1, lo1, hi1)
        utf8_sequence_generator(subcc2, 1, targetVar, cgo)
220
def rangeIntersect(ccList, lo, hi):
    """Clip each range in ccList to [lo, hi], dropping ranges wholly outside."""
    clipped = []
    for (a, b) in ccList:
        if a <= hi and b >= lo:
            clipped.append((max(lo, a), min(hi, b)))
    return clipped
223
def rangeGaps(ccList, lo, hi):
    """Return the sub-ranges of [lo, hi] not covered by the sorted range
    list ccList.  Note: a span with lo >= hi yields no gap (so a
    single-point remainder lo == hi is dropped, matching the original)."""
    gaps = []
    for (a, b) in ccList:
        if lo >= hi:
            return gaps
        if b < lo:
            # Range entirely below the uncovered span: irrelevant.
            continue
        if a > lo:
            gaps.append((lo, a - 1))
            lo = b + 1
        elif b < hi:
            lo = b + 1
        else:
            # Range covers everything that remains.
            return gaps
    if lo < hi:
        gaps.append((lo, hi))
    return gaps
232
def outerRanges(ccList):
    """Keep only the outermost (non-nested) ranges of a sorted range list."""
    if not ccList:
        return []
    kept = [ccList[0]]
    for rg in ccList[1:]:
        # A range ending within the last kept range is nested: drop it.
        if rg[1] > kept[-1][1]:
            kept.append(rg)
    return kept
239
def innerRanges(ccList):
    """Collect the ranges of a sorted range list that are nested inside
    a preceding enclosing range."""
    if not ccList:
        return []
    inner = []
    outer_hi = ccList[0][1]
    for (a, b) in ccList[1:]:
        if b <= outer_hi:
            inner.append((a, b))
        else:
            # Not nested: this range becomes the new enclosing range.
            outer_hi = b
    return inner
246
247
248
def generateCharClassDefs(ifRangeList, charClassMap):
   # Top-level generator: create a fresh compiler, zero-initialize each
   # class's target stream, then emit all definitions inside the
   # if-hierarchy described by ifRangeList.  Reads the module-global
   # `options`; returns the generated code via cgo.showcode().
   cgo = CC_compiler(UTF8(), 'tmp%i', False, '')
   for k in charClassMap.keys():
     if options.grep:
         cgo.add_assignment("all_chars", '0')
     else:
         cgo.add_assignment("struct_%s.cc" % k, '0')
   generateCharClassDefsInIfHierarchy(cgo, (0, 0x10FFFF), ifRangeList, charClassMap)
   return cgo.showcode()
258 
259
260#defaultIfRangeList = [(0,0x7FF), (0, 0x7F), (0x80, 0x3FF), (0x400,0x7FF), (0x800, 0xFFFF), (0x10000, 0x10FFFF)]
261
262#defaultIfRangeList = [(0x80,0x10FFFF), (0x80,0x7FF), (0x800,0xFFFF), (0x10000, 0x10FFFF)]
263
264
# Default hierarchy of if-guard ranges: an outer non-ASCII test, then
# per-UTF-8-length bands, then script/block subranges.  Nested entries
# become nested if-statements (see outerRanges/innerRanges).
# NOTE(review): (0x1000, 0x1FFF) appears twice, and several two-byte
# entries overlap; duplicates are redundant but appear harmless here.
defaultIfRangeList = [
#Non-ASCII
(0x80,0x10FFFF),
#Two-byte sequences
(0x80,0x7FF),
(0x100, 0x3FF),
#0100..017F; Latin Extended-A
#0180..024F; Latin Extended-B
#0250..02AF; IPA Extensions
#02B0..02FF; Spacing Modifier Letters
(0x100, 0x2FF), (0x100, 0x24F), (0x100, 0x17F), (0x180, 0x24F), (0x250, 0x2AF), (0x2B0, 0x2FF),
#0300..036F; Combining Diacritical Marks
#0370..03FF; Greek and Coptic
(0x300, 0x36F), (0x370, 0x3FF),
#0400..04FF; Cyrillic
#0500..052F; Cyrillic Supplement
#0530..058F; Armenian
#0590..05FF; Hebrew
#0600..06FF; Arabic
(0x400, 0x5FF), (0x400, 0x4FF), (0x500, 0x058F), (0x500, 0x52F), (0x530, 0x58F), (0x590, 0x5FF), (0x600, 0x6FF),
#0700..074F; Syriac
#0750..077F; Arabic Supplement
#0780..07BF; Thaana
#07C0..07FF; NKo
(0x700, 0x77F), (0x700, 0x74F), (0x750, 0x77F), (0x780, 0x7FF), (0x780, 0x7BF), (0x7C0, 0x7FF),
#Three-byte sequences
(0x800, 0xFFFF),
(0x800, 0x4DFF),
(0x800, 0x1FFF),
(0x800, 0x0FFF),
(0x1000, 0x1FFF),
#0800..083F; Samaritan
#0840..085F; Mandaic
#08A0..08FF; Arabic Extended-A
#0900..097F; Devanagari
#0980..09FF; Bengali
#0A00..0A7F; Gurmukhi
#0A80..0AFF; Gujarati
#0B00..0B7F; Oriya
#0B80..0BFF; Tamil
#0C00..0C7F; Telugu
#0C80..0CFF; Kannada
#0D00..0D7F; Malayalam
#0D80..0DFF; Sinhala
#0E00..0E7F; Thai
#0E80..0EFF; Lao
#0F00..0FFF; Tibetan
(0x1000, 0x1FFF),
#1000..109F; Myanmar
#10A0..10FF; Georgian
#1100..11FF; Hangul Jamo
#1200..137F; Ethiopic
#1380..139F; Ethiopic Supplement
#13A0..13FF; Cherokee
#1400..167F; Unified Canadian Aboriginal Syllabics
#1680..169F; Ogham
#16A0..16FF; Runic
#1700..171F; Tagalog
#1720..173F; Hanunoo
#1740..175F; Buhid
#1760..177F; Tagbanwa
#1780..17FF; Khmer
#1800..18AF; Mongolian
#18B0..18FF; Unified Canadian Aboriginal Syllabics Extended
#1900..194F; Limbu
#1950..197F; Tai Le
#1980..19DF; New Tai Lue
#19E0..19FF; Khmer Symbols
#1A00..1A1F; Buginese
#1A20..1AAF; Tai Tham
#1AB0..1AFF; Combining Diacritical Marks Extended
#1B00..1B7F; Balinese
#1B80..1BBF; Sundanese
#1BC0..1BFF; Batak
#1C00..1C4F; Lepcha
#1C50..1C7F; Ol Chiki
#1CC0..1CCF; Sundanese Supplement
#1CD0..1CFF; Vedic Extensions
#1D00..1D7F; Phonetic Extensions
#1D80..1DBF; Phonetic Extensions Supplement
#1DC0..1DFF; Combining Diacritical Marks Supplement
#1E00..1EFF; Latin Extended Additional
#1F00..1FFF; Greek Extended
(0x2000, 0x4DFF),(0x2000, 0x2FFF),
(0x3000, 0x4DFF),
(0x4E00,0x9FFF),
#4E00..9FFF; CJK Unified Ideographs
(0xA000,0xFFFF),

(0x10000, 0x10FFFF)]
355
356
# Template for the one-field class holding a category's character-class stream.
Unicode_CC_struct = "class struct_%s:\n\tcc = 0\n\n"
# Template for a generated function header (category name substituted twice).
Unicode_CC_header = "def %s(basis_bits, struct_%s):\n"
# Placeholder Main() used when the grep template is not selected.
Unicode_dummy_main = "\n\ndef Main(basis_bits):\n    pass\n"
360
def generateDefs1(general_category):
  # Generate the source text for one general category: a struct (or the
  # grep scaffolding), the function header, and the compiled
  # character-class definitions.  Reads the module-globals `options`
  # (grep/flat/simple flags) and `UnicodeCategoryMap` set up by main().
  # (Dead initial struct/header assignments removed: both branches of
  # the if/else below always assign them.)
  catmap = {}
  catmap[general_category] = UnicodeCategoryMap[general_category]
  if options.grep:
        # grep mode: fixed scaffolding classes plus a ParseLines function
        # computing the line-feed stream.
        struct = r"""
class Basis_bits():
        bit_0 = 0
        bit_1 = 0
        bit_2 = 0
        bit_3 = 0
        bit_4 = 0
        bit_5 = 0
        bit_6 = 0
        bit_7 = 0 
 
class Lex():
        LF = (0)
 
class Output():
        matches = 0

def ParseLines(basis_bits, lex):
        temp1 = (basis_bits.bit_0 | basis_bits.bit_1)
        temp2 = (basis_bits.bit_2 | basis_bits.bit_3)
        temp3 = (temp1 | temp2)
        temp4 = (basis_bits.bit_4 &~ basis_bits.bit_5)
        temp5 = (basis_bits.bit_6 &~ basis_bits.bit_7)
        temp6 = (temp4 & temp5)
        lex.LF = (temp6 &~ temp3)

"""
        header = "def Demo(basis_bits, lex, output):\n"
  else:
        struct = Unicode_CC_struct % (general_category)
        header = "def %s(basis_bits, struct_%s):\n" % (general_category, general_category)
  if options.flat:
      code = generateCharClassDefs([], catmap)
  elif options.simple:
      # Bug fix: the four-byte upper bound was mistyped as 0x10FFF;
      # it extends to 0x10FFFF as in defaultIfRangeList.
      code = generateCharClassDefs([(0x80, 0x7FF), (0x800,0xFFFF), (0x10000, 0x10FFFF)], catmap)
  else:
      code = generateCharClassDefs(defaultIfRangeList, catmap)
  if options.grep:
      # Append the match-filtering epilogue for grep mode.
      code += r"""
        output.matches = 0
        all_matches = pablo.Advance(all_chars & ~lex.LF)
        if all_matches:
                # Find the last match on each line and filter out all others
                output.matches = pablo.MatchStar(all_matches, ~lex.LF) & lex.LF
"""
  return struct + header + "".join(code)
413
def generate_main():
  """Return the source text of the Main() driver for the selected mode."""
  if not options.grep:
    return Unicode_dummy_main
  # grep mode: Main wires the line parser to the generated Demo function.
  return ("\n\ndef Main(basis_bits, lex, output):\n"
          "    ParseLines(basis_bits, lex)\n"
          "    Demo(basis_bits, lex, output)\n")
420
421#
422# Partition a list of ranges into a minimum set of utf8 groups
423# UTF-8 prefix groups, where a group is
424# (a) a range of codepoints with UTF-8 prefixes of the same length
425#     such that every codepoint in the range is within the group, or
426# (b) a sublist all having the same UTF-8 initial
427#     byte
def partition_by_UTF8_group(range_list, byte_no):
    # Partition range_list into a minimum set of groups for code
    # generation at byte position byte_no.  Each group is either:
    # (a) a single range spanning every legal continuation of its prefix
    #     (a "full suffix" range), or
    # (b) a sublist of ranges all having the same byte at byte_no.
    # Returns a list of groups, each a list of (lo, hi) codepoint ranges.
    if range_list == []: return []
    (lo, hi) = range_list[0]
    u8len_lo = utf8_length(lo)
    u8len_hi = utf8_length(hi)
    # Split ranges that straddle an encoding-length boundary first.
    if u8len_lo != u8len_hi:
        mid = max_codepoint_of_length(u8len_lo)
        return partition_by_UTF8_group([(lo, mid), (mid+1, hi)] + range_list[1:], byte_no)
    lobyte1 = utf8_byte(lo, byte_no)
    hibyte1 = utf8_byte(hi, byte_no)
    if lobyte1 != hibyte1:
        # Peel sub-ranges until what remains spans all suffix values
        # (every following byte ranging over 0x80..0xBF).
        if not is_low_codepoint_after_byte(lo, byte_no):
            lo1 = lo | ((1 << (6 * (u8len_lo - byte_no))) - 1)
            return [[(lo, lo1)]] + partition_by_UTF8_group([(lo1+1, hi)] + range_list[1:], byte_no)
        elif not is_high_codepoint_after_byte(hi, byte_no):
            hi1 = hi &~ ((1 << (6 * (u8len_lo - byte_no))) - 1)
            return [[(lo, hi1-1)]] + partition_by_UTF8_group([(hi1, hi)] + range_list[1:], byte_no)
        else:
            # we have a prefix group of type (a)
            return [[(lo, hi)]] + partition_by_UTF8_group(range_list[1:], byte_no)
    # Same byte at byte_no: merge with following ranges sharing that byte.
    group1 = [(lo, hi)]
    subpartitions = partition_by_UTF8_group(range_list[1:], byte_no)
    if subpartitions == []: return [group1]
    elif utf8_byte(subpartitions[0][0][0], byte_no) == lobyte1:
        return [group1 + subpartitions[0]] + subpartitions[1:]
    else:
        return [group1] + subpartitions
457
458#
def is_low_codepoint_after_byte(codepoint, byte):
    """True when every UTF-8 byte of codepoint beyond position `byte`
    is the minimal continuation byte 0x80."""
    return all(utf8_byte(codepoint, pos + 1) == 0x80
               for pos in range(byte, utf8_length(codepoint)))
463
def is_high_codepoint_after_byte(codepoint, byte):
    """True when every UTF-8 byte of codepoint beyond position `byte`
    is the maximal continuation byte 0xBF."""
    return all(utf8_byte(codepoint, pos + 1) == 0xBF
               for pos in range(byte, utf8_length(codepoint)))
468
469# Ensure the sequence of preceding bytes is defined, up to, but
470# not including the given byte_no
def ensure_preceding_prefix_defined(codepoint, byte_no, cgo):
   # Ensure the chain of byte_*/sequence_* variables matching bytes
   # 1 .. byte_no-1 of codepoint's UTF-8 encoding has been emitted, so
   # later code may refer to utf8_prefix_var(codepoint, byte_no-1).
   # The *_canonical calls are presumably idempotent (they appear meant
   # to avoid redundant sequence_x_y definitions) — confirm in CC_compiler.
   for i in range(1, byte_no):
      byte_i = utf8_byte(codepoint, i)
      byteVar = "byte_%x" % byte_i
      cgo.chardef_canonical(CanonicalCharSetDef(byteVar, [(byte_i, byte_i)]))
      if i > 1:
         # Extend the prefix: advance the shorter prefix one position
         # and AND in this byte's test.
         pfx1 = utf8_prefix_var(codepoint, i-1)
         pfx1_adv = pfx1 + "_adv"
         cgo.add_canonical_assignment(pfx1_adv, cgo.expr2py(make_shift_forward(Var(pfx1), 1)))
         pfx2 = utf8_prefix_var(codepoint, i)
         cgo.add_canonical_assignment(pfx2, cgo.expr2py(make_and(Var(pfx1_adv), Var(byteVar))))
482
483
484#
485# Generate remaining code to match UTF-8 code sequences within
486# the codepoint set u8_partition, assuming that the code matching the
487# sequences up to byte number byte_no have been generated.
488#
def utf8_sequence_generator(u8_partition, byte_no, targetVar, cgo):
   # Emit code that ORs into targetVar the matches for the UTF-8
   # sequences of all codepoint ranges in u8_partition, assuming the
   # code matching bytes before byte_no has already been generated.
   if len(u8_partition) == 0: return
   (lo, hi) = u8_partition[0]
   if utf8_length(lo) == byte_no:
      # We have a single byte remaining to match for all codepoints
      # in this partition.  Use the byte class compiler to generate
      # matches for these codepoints.
      ensure_preceding_prefix_defined(lo, byte_no, cgo)
      byte_pair_list = byte_definitions(u8_partition, byte_no)
      if len(byte_pair_list) == 1:
          (lobyte, hibyte) = byte_pair_list[0]
          if lo == hi:
              final_byte_var = "byte_%x" % lobyte
          else:
              final_byte_var = "byte_range_%x_%x" % (lobyte, hibyte)
          cgo.chardef_canonical(CanonicalCharSetDef(final_byte_var, byte_pair_list))
      else:
          # NOTE(review): u8_partition[-1][0] is the *low* end of the last
          # range, so the generated name reads lo..last_lo.  This affects
          # only the variable name — confirm whether [1] was intended.
          hi = u8_partition[-1][0]
          final_byte_var = "%s_range_%x_%x_%i" % (targetVar[-2:], lo, hi, byte_no)
          cgo.chardef2py(CanonicalCharSetDef(final_byte_var, byte_pair_list))
      test_expr = Var(final_byte_var)
      if byte_no > 1: 
         # Require the preceding prefix, advanced one position.
         pfx1 = utf8_prefix_var(lo, byte_no-1)
         pfx1_adv = pfx1 + "_adv"
         cgo.add_canonical_assignment(pfx1_adv, cgo.expr2py(make_shift_forward(Var(pfx1), 1)))
         test_expr = make_and(Var(pfx1_adv), test_expr)
      cgo.add_assignment(targetVar, cgo.expr2py(make_or(Var(targetVar), test_expr)))
   else:
     # More than one byte remains: split into same-byte groups and
     # full-suffix lead-byte ranges, then recurse or chain suffix tests.
     partitions = partition_by_UTF8_group(u8_partition, byte_no)
     for p in partitions:
       (lo, hi) = p[0]
       lbyte = utf8_byte(lo, byte_no)
       hbyte = utf8_byte(hi, byte_no)
       ensure_preceding_prefix_defined(lo, byte_no, cgo)
       if lbyte == hbyte:
         # Group type (b): all ranges share this byte.  Extend the
         # prefix chain and recurse on the next byte position.
         byteVar = "byte_%x" % lbyte
         cgo.chardef_canonical(CanonicalCharSetDef(byteVar, [(lbyte, lbyte)]))
         if byte_no > 1:
           last_prefix = utf8_prefix_var(lo, byte_no - 1)
           this_prefix = utf8_prefix_var(lo, byte_no)
           cgo.add_canonical_assignment(this_prefix, cgo.expr2py(make_and(make_shift_forward(Var(last_prefix), 1), Var(byteVar))))
         if byte_no < utf8_length(lo): utf8_sequence_generator(p, byte_no+1, targetVar, cgo)
       else:
         # Group type (a): a lead-byte range whose continuations are
         # unconstrained.  Assuming validated UTF-8 input, each following
         # position only needs the generic 0x80..0xBF suffix test.
         byteVar = "byte_range_%x_%x" % (lbyte, hbyte)
         cgo.chardef_canonical(CanonicalCharSetDef(byteVar, [(lbyte, hbyte)]))
         if byte_no > 1:
           last_prefix = utf8_prefix_var(lo, byte_no - 1)
           this_prefix = last_prefix + "_" + byteVar
           cgo.add_canonical_assignment(this_prefix, cgo.expr2py(make_and(make_shift_forward(Var(last_prefix), 1), Var(byteVar))))
         else: this_prefix = byteVar
         suffixVar = "byte_range_%x_%x" % (0x80, 0xBF)
         cgo.chardef_canonical(CanonicalCharSetDef(suffixVar, [(0x80, 0xBF)]))
         last_prefix = this_prefix
         while byte_no < utf8_length(lo):
           byte_no += 1
           this_prefix = last_prefix + "_sfx"
           cgo.add_assignment(this_prefix, cgo.expr2py(make_and(make_shift_forward(Var(last_prefix), 1), Var(suffixVar))))
           last_prefix = this_prefix
         cgo.add_assignment(targetVar, cgo.expr2py(make_or(Var(targetVar), Var(last_prefix))))
549
550
551
def utf8_prefix_var(codepoint, prefix_bytes):
   """Name of the stream variable matching the first prefix_bytes bytes
   of codepoint's UTF-8 encoding.  Raises for prefix_bytes == 0."""
   if prefix_bytes == 0:
      raise Exception ("utf8_prefix_var(%x, %i)" % (codepoint, prefix_bytes))
   if prefix_bytes == 1:
      return "byte_%x" % utf8_byte(codepoint, 1)
   parts = ["sequence"]
   for k in range(1, prefix_bytes + 1):
      parts.append("%x" % utf8_byte(codepoint, k))
   return "_".join(parts)
559
560
def byte_definitions(range_list, n):
   """Project each codepoint range onto the (low, high) pair of its
   n-th UTF-8 bytes."""
   pairs = []
   for (rlo, rhi) in range_list:
      pairs.append((utf8_byte(rlo, n), utf8_byte(rhi, n)))
   return pairs
566
def main():
    # Command-line driver: parse options, generate code for the selected
    # general category (or all of them with -c .), and write the result
    # to the output file argument or stdout.
    global options, UnicodeCategoryMap
    # Option definition
    option_parser = optparse.OptionParser(usage='python %prog [options] <output file>', version='0.1')

    option_parser.add_option('-c', '--category',
                             dest='category',
                             type='string',
                             default='Cc',
                             help='general category; default: Cc',
                             )
    option_parser.add_option('-g', '--grep',
                             dest='grep',
                             action='store_true',
                             default=False,
                             help='Use grep template',
                             )
    option_parser.add_option('-f', '--flat',
                             dest='flat',
                             action='store_true',
                             default=False,
                             help='Flatten the calculations into a single basic block',
                             )
    option_parser.add_option('-s', '--simple',
                             dest='simple',
                             action='store_true',
                             default=False,
                             help='Use a simple if-structure on UTF-8 length',
                             )
    options, args = option_parser.parse_args(sys.argv[1:])

    (catlen, UnicodeCategoryMap) = parse_general()

    code = ""
    if options.category == '.':
        # Generate definitions for every known general category.
        for k in UnicodeCategoryMap.keys():
            code += generateDefs1(k)
    else:
        if options.category not in UnicodeCategoryMap:
            print ("Unknown general category %s" % options.category)
            # Bug fix: the bare name `exit` was a no-op expression and
            # control fell through to a KeyError; terminate explicitly.
            sys.exit(1)
        code = generateDefs1(options.category)

    code += generate_main()

    if (len(args) == 1):
        fh = open(args[0], "w")
        fh.write(code)
        fh.close()
    elif len(args) == 0:
        print (code)
    else:
        option_parser.print_usage()
624       
625
626
627
628
# Script entry point.
if __name__ == "__main__": main()
630
631
632
633
634
635
636
Note: See TracBrowser for help on using the repository browser.