Changeset 3930


Ignore:
Timestamp:
Jul 28, 2014, 5:58:33 AM (5 years ago)
Author:
cameron
Message:

Initial generation of if-nest hierarchy for Unicode character class defs

File:
1 edited

Legend:

Unmodified
Added
Removed
  • proto/RE/doc/UTF8_class.py

    r3433 r3930  
    11#
    2 # Prototype for computing UTF8 character classes
     2# Prototype for computing utf8 character classes
    33# Assuming byte-class/byte-range compilers exist.
    44#
     
    def make_and(e1, e2):
        """Return the text of a parenthesised '&' of two expression strings."""
        operands = (e1, e2)
        return "(%s & %s)" % operands
    def make_shift_forward(e, n):
        """Return the text of expression ``e`` shifted by ``n`` using '>>'."""
        template = "(%s >> %i)"
        return template % (e, n)
    16 
     16def make_assign(v, e): return v + " := " + e + "\n"
     17def make_if(p, s): return ["if %s\n" %p] + s + ["endif %s\n" %p]
    1718#
    18 def UTF8_length(codepoint):
     19def utf8_length(codepoint):
    1920   if codepoint <= 0x7F: return 1
    2021   elif codepoint <= 0x7FF: return 2
     
    2223   else: return 4
    2324
    24 def UTF8_byte(codepoint, n):
    25    len = UTF8_length(codepoint)
     25def utf8_byte(codepoint, n):
     26   len = utf8_length(codepoint)
    2627   if n == 1:
    2728     if len == 1: return codepoint
     
    3940   else: return 0x10FFFF
    4041
     42#
     43# Given two codepoints lo, hi: return the number of
     44# leading UTF-8 bytes that their respective UTF-8
     45# representations have in common.
     46def common_utf8_leading_bytes(lo, hi):
     47   u8len_lo = utf8_length(lo)
     48   u8len_hi = utf8_length(hi)
     49   if u8len_lo != u8len_hi: return 0
     50   remaining = u8len_lo
     51   while remaining > 0:
     52     if lo == hi: return remaining
     53     lo >>= 6
     54     hi >>= 6
     55     remaining -= 1
     56   return 0
     57
    4158
    4259def matched_sequence_compiler(lo, hi, n, hlen):
    4360   """ Helper function to generate the code necessary to match bytes
    44        n through hlen (1-based indexing) of the range of UTF-8 sequences
     61       n through hlen (1-based indexing) of the range of utf-8 sequences
    4562       for codepoints lo through hi. """
    46    hbyte = UTF8_byte(hi, n)
    47    lbyte = UTF8_byte(lo, n)
     63   hbyte = utf8_byte(hi, n)
     64   lbyte = utf8_byte(lo, n)
    4865   if n == hlen: return ByteRangeCompiler(lbyte, hbyte)
    4966   #
     
    6986   #
    7087   # Now we have a range that permits all suffix combinations.
    71    # We don't have to test suffixes UNDER THE ASSUMPTION that the UTF-8
     88   # We don't have to test suffixes UNDER THE ASSUMPTION that the utf-8
    7289   # has been validated.
    7390   return make_shift_forward(ByteRangeCompiler(lbyte, hbyte), hlen - n)
    7491
    75 def UTF8_range_compiler(lo, hi):
    76    hlen = UTF8_length(hi)
     92def utf8_range_compiler(lo, hi):
     93   hlen = utf8_length(hi)
    7794   # If different length code unit sequences are involved, make
    7895   # a union of equilength subranges.
    79    if hlen > UTF8_length(lo):
     96   if hlen > utf8_length(lo):
    8097     m = max_codepoint_of_length(hlen - 1)
    81      return make_or(UTF8_range_compiler(lo, m), UTF8_range_compiler(m+1, hi))
     98     return make_or(utf8_range_compiler(lo, m), utf8_range_compiler(m+1, hi))
    8299   #
    83100   return matched_sequence_compiler(lo, hi, 1, hlen)
    84101
    85102
     103def generate_utf8_leading_bytes_test(codepoint, bytecount, targetVar):
     104  if bytecount == 0: return [make_assign(targetVar, "1")]
     105  byte1 = utf8_byte(codepoint, 1)
     106  stmts = [make_assign(targetVar, ByteClassCompiler(byte1))]
     107  byteno = 1
     108  while byteno < bytecount:
     109    byteno += 1
     110    sfx_byte = utf8_byte(codepoint, byteno)
     111    stmts.append(make_assign(targetVar, make_and(make_shift_forward(targetVar, 1), ByteClassCompiler(sfx_byte))))
     112  return stmts
     113
     114def generate_utf8_intermediate_bytes_test(codepoint, startbyte, endbyte, targetVar):
     115  if startbyte == 1: return generate_utf8_leading_bytes_test(codepoint, endbyte, targetVar)
     116  byteno = startbyte
     117  while byteno < endbyte:
     118    byteno += 1
     119    sfx_byte = utf8_byte(codepoint, byteno)
     120    stmts.append(make_assign(targetVar, make_and(make_shift_forward(targetVar, 1), ByteClassCompiler(sfx_byte))))
     121  return stmts
     122
     123import re
     124
     125Unicode_point_regexp = re.compile("^([0-9A-F]{4,6})\s+;\s+([A-Z][a-z0-9])\s+#")
     126Unicode_range_regexp = re.compile("^([0-9A-F]{4,6})[.][.]([0-9A-F]{4,6})\s+;\s+([A-Z][a-z0-9])\s+#")
     127
     128
     129def parse_general():
     130  category_size = {}
     131  category_def = {}
     132  f = open("DerivedGeneralCategory.txt")
     133  lines = f.readlines()
     134  for t in lines:
     135    m = Unicode_point_regexp.match(t)
     136    if m:
     137      point = m.group(1)
     138      category = m.group(2)
     139      if not category_size.has_key(category):
     140        category_size[category] = 0
     141        category_def[category] = []
     142      pval = int(point, 16)
     143      category_def[category].append((pval, pval))
     144      category_size[category] += 1     
     145    m = Unicode_range_regexp.match(t)
     146    if m:
     147      point1 = m.group(1)
     148      point2 = m.group(2)
     149      category = m.group(3)
     150      if not category_size.has_key(category):
     151        category_size[category] = 0
     152        category_def[category] = []
     153      pval1 = int(point1, 16)
     154      pval2 = int(point2, 16)
     155      category_def[category].append((pval1, pval2))
     156      category_size[category] += 1
     157  return (category_size, category_def)
     158  f.close()
     159
     160
     161def generateCharClassDefsInIfHierarchy(enclosingRange, ifRangeList, charClassMap):
     162   generated_code = []
     163   (outer_lo, outer_hi) = enclosingRange
     164   while ifRangeList!=[]:
     165     (lo, hi) = ifRangeList[0]
     166     if lo >= outer_hi: break
     167     if hi > outer_hi: raise Exception("Bad range nested (%i, %i) within (%i, %i)\n" % (lo, hi, outer_lo, outer_hi))
     168     # We have more subranges of the enclosing range
     169     if lo > outer_lo:
     170       # An innermost nest, not further embedded.
     171       generated_code += generateCharClassSubDefs(outer_lo, lo-1, charClassMap)
     172     ifRangeList = ifRangeList[1:]
     173     range_var = "CC_%x_%x" % (lo, hi)
     174     generated_code.append(make_assign(range_var, utf8_range_compiler(lo, hi)))
     175     inner = generateCharClassDefsInIfHierarchy((lo, hi), ifRangeList, charClassMap)
     176     generated_code += make_if(range_var, inner)
     177     outer_lo = hi + 1
     178   # Final innermost_nest
     179   if outer_lo <= outer_hi:
     180     generated_code += generateCharClassSubDefs(outer_lo, outer_hi, charClassMap)
     181   return generated_code
     182
     183
     184def generateCharClassSubDefs(lo, hi, charClassMap):
     185   generatedCode = []
     186   for k in charClassMap.keys():
     187     subcc1 = rangeIntersect(charClassMap[k], lo, hi)
     188     CC_var = "CC_" + k
     189     for pair in subcc1: 
     190       CC_range_var = "%s_%x_%x" % (k, pair[0], pair[1])
     191       generatedCode.append(make_assign(CC_range_var, utf8_range_compiler(pair[0], pair[1])))
     192       generatedCode.append(make_assign(CC_var, make_or(CC_var, CC_range_var)))
     193   return generatedCode
     194
     195def compileByteClass(lo, hi, var):
     196   return make_assign(var, "ByteClass(0x%x, 0x%x)" % (lo, hi))
     197
     198def rangeIntersect(ccList, lo, hi):
     199    return [(max(lo, p[0]), min(hi, p[1])) for p in ccList if p[0] <= hi and p[1] >= lo]
     200
     201def generateCharClassDefs(ifRangeList, charClassMap):
     202   generated_code = []
     203   for k in charClassMap.keys():
     204     generated_code.append(make_assign(k, "0"))
     205   generated_code += generateCharClassDefsInIfHierarchy((0, 0x10FFFF), ifRangeList, charClassMap)
     206   return generated_code
     207
     208(catlen, catdef) = parse_general()
     209
     210map1 = {}
     211map1['Cc'] = catdef['Cc']
     212ifr = [(0,0x7FF), (0, 0x7F), (0x80, 0x3FF), (0x400,0x7FF), (0x800, 0xFFFF), (0x10000, 0x10FFFF)]
     213
Note: See TracChangeset for help on using the changeset viewer.