Changeset 3978 for proto


Ignore:
Timestamp:
Aug 8, 2014, 8:23:46 PM (5 years ago)
Author:
cameron
Message:

Bug fix for Advance chains; if-hierarchy options; grep option

Location:
proto/charsetcompiler
Files:
1 added
4 edited

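Legend:

Each diff line below begins with its line number(s), which the page extraction has fused directly onto the line's text.
Unmodified: old and new line numbers run together (e.g. "273273" is old line 273, new line 273; "276277" is old line 276, new line 277)
Added: new line number only
Removed: old line number only
A row containing only whitespace between numbered rows marks unchanged lines that the viewer omitted.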
  • proto/charsetcompiler/CC_compiler.py

    r3964 r3978  
    273273            elif isinstance(expr, Adv):
    274274               e = self.expr_string_to_variable(self.expr2py(expr.operand))
    275                return 'Advance(%s, %i)' % (e, expr.offset)
     275               if expr.offset == 1: return 'Advance(%s)' % (e)
     276               else: return 'Advance(%s, %i)' % (e, expr.offset)
    276277            else: raise Exception("Bad expression: %s" % repr(expr))
    277278
  • proto/charsetcompiler/Makefile

    r3962 r3978  
    33PABLO_SRC=Nd.pablo
    44PABLO_TEMPLATE=category_template.h
     5PABLO_COMPILER=../Compiler/pablomain.py
     6GREP_SRC=Ndgrep.pablo
     7GREP_TEMPLATE=grep_template.cpp
     8GREPOUTFILE = src/catgrep.cpp
    59PABLO_COMPILER=../Compiler/pablomain.py
    610PABLO_ADD_DEBUG = # -a -b
     
    913        python $(PABLO_COMPILER) $(PABLO_SRC) -t $(PABLO_TEMPLATE) -o $(OUTFILE) $(PABLO_ADD_DEBUG)
    1014
     15catgrep:
     16        python $(PABLO_COMPILER) $(GREP_SRC) -t $(GREP_TEMPLATE) -o $(GREPOUTFILE) $(PABLO_ADD_DEBUG)
  • proto/charsetcompiler/pablo_expr.py

    r3947 r3978  
    6767        self.operand = expr
    6868        self.offset = n
    69     def __str__(self): return 'Advance(%s, %i)' % (self.operand.__str__(), self.offset)
     69    def __str__(self):
     70       if self.offset == 1: return 'Advance(%s)' % (self.operand.__str__())
     71       else: return 'Advance(%s, %i)' % (self.operand.__str__(), self.offset)
    7072    def toAST(self): return ast.Call(ast.Attribute(ast.Name('pablo', ast.Load()), 'Advance', ast.Load()), [self.operand.toAST(), ast.Num(self.offset)])
    7173
  • proto/charsetcompiler/unicode_category_compiler.py

    r3968 r3978  
    7373     matched_sequence_compiler(cgo, lo, hi, 1, hi_len, targetVar)
    7474
    75 def matched_sequence_compiler(cgo, lo, hi, n, hlen, targetVar):
     75
     76def matched_sequence_compiler(cgo, lo, hi, hlen):
     77   return matched_sequence_helper(cgo, lo, hi, TrueLiteral(), 1, hlen)
     78
     79def matched_sequence_helper(cgo, lo, hi, prefix, n, hlen):
    7680   """ Helper function to generate the code necessary to match bytes
    7781       n through hlen (1-based indexing) of the range of utf-8 sequences
     
    8084   lbyte = utf8_byte(lo, n)
    8185   if n == hlen:
     86     targetVar = "bytetest_%x_%x" % (lbyte, hbyte)
    8287     cgo.chardef2py(CanonicalCharSetDef(targetVar, [(lbyte, hbyte)]))
    83      return
     88     if n == 1: return targetVar
     89     return cgo.expr_string_to_variable(cgo.expr2py(make_and(make_shift_forward(prefix, 1), Var(targetVar))))
    8490   #
    8591   # One or more bytes of the lower and upper bound may be the same.
    8692   # Build a sequence of byte tests.
    8793   if hbyte == lbyte:
    88      sfxVar = targetVar + "_sfx"
    89      matched_sequence_compiler(cgo, lo, hi, n+1, hlen, sfxVar)
    90      CCvar = "CC_%x" % (hbyte)
    91      cgo.chardef2py(CanonicalCharSetDef(CCvar, [(lbyte, hbyte)]))
    92      cgo.add_assignment(targetVar, cgo.expr2py(make_and(make_shift_forward(Var(CCvar), 1), Var(sfxVar))))
    93      return
     94     targetVar = "bytetest_%x" % (lbyte)
     95     cgo.chardef2py(CanonicalCharSetDef(targetVar, [(lbyte, hbyte)]))
     96     return matched_sequence_helper(cgo, lo, hi, make_and(make_shift_forward(prefix, 1), Var(targetVar)), n+1, hlen)
    9497   # We now have a range involving different bytes at position n.
    9598   following_suffix_mask = (1 << ((hlen - n) * 6)) - 1
     
    97100   # there are constraints on following suffix bytes.
    98101   if hi & following_suffix_mask != following_suffix_mask:
    99      hi_floor = hi &~following_suffix_mask
    100      hiVar = targetVar + "_hi"
    101      loVar = targetVar + "_lo"
    102      matched_sequence_compiler(cgo, hi_floor, hi, n, hlen, hiVar)
    103      matched_sequence_compiler(cgo, lo, hi_floor - 1, n, hlen, loVar)
    104      cgo.add_assignment(targetVar, cgo.expr2py(make_or(Var(hiVar), Var(loVar))))
    105      return
     102     hi_floor = hi &~following_suffix_mask     
     103     hiVar = matched_sequence_helper(cgo, hi_floor, hi, prefix, n, hlen)
     104     loVar = matched_sequence_helper(cgo, lo, hi_floor - 1, prefix, n, hlen)
     105     return cgo.expr_string_to_variable(cgo.expr2py(make_or(Var(loVar), Var(hiVar))))
    106106   # A separate test may be needed for the low byte sequence if
    107107   # there are constraints on following suffix bytes.
    108108   if lo & following_suffix_mask != 0:
    109109     low_ceil = lo | following_suffix_mask
    110      hiVar = targetVar + "_hi"
    111      loVar = targetVar + "_lo"
    112      matched_sequence_compiler(cgo, low_ceil + 1, hi, n, hlen, hiVar)
    113      matched_sequence_compiler(cgo, lo, low_ceil, n, hlen, loVar)
    114      cgo.add_assignment(targetVar, cgo.expr2py(make_or(Var(hiVar), Var(loVar))))
    115      return
     110     hiVar = matched_sequence_helper(cgo, hi_floor, hi, prefix, n, hlen)
     111     loVar = matched_sequence_helper(cgo, lo, low_ceil, prefix, n, hlen)
     112     return cgo.expr_string_to_variable(cgo.expr2py(make_or(Var(loVar), Var(hiVar))))
    116113   #
    117114   # Now we have a range that permits all suffix combinations.
    118115   # We don't have to test suffixes UNDER THE ASSUMPTION that the utf-8
    119116   # has been validated.
    120    CCvar = "CC_%x_%x" % (lbyte, hbyte)
    121    cgo.chardef2py(CanonicalCharSetDef(CCvar, [(lbyte, hbyte)]))
    122    cgo.add_assignment(targetVar, cgo.expr2py(Adv(Var(CCvar), hlen - n)))
    123 
     117   targetVar = "bytetest_%x_%x" % (lbyte, hbyte)
     118   cgo.chardef2py(CanonicalCharSetDef(targetVar, [(lbyte, hbyte)]))
     119   if n == 1: return targetVar
     120   return matched_sequence_helper(cgo, lo, hi, make_and(make_shift_forward(prefix, 1), Var(targetVar)), n+1, hlen)
     121
     122
     123def matched_ifsequence_compiler(cgo, lo, hi, hlen):
     124   return matched_ifsequence_helper(cgo, lo, hi, TrueLiteral(), 1, hlen)
     125
     126def matched_ifsequence_helper(cgo, lo, hi, prefix, n, hlen):
     127   """ Helper function to generate the code necessary to match bytes
     128       n through hlen (1-based indexing) of the range of utf-8 sequences
     129       for codepoints lo through hi. """
     130   hbyte = utf8_byte(hi, n)
     131   lbyte = utf8_byte(lo, n)
     132   if n == hlen:
     133     targetVar = "bytetest_%x_%x" % (lbyte, hbyte)
     134     cgo.chardef2py(CanonicalCharSetDef(targetVar, [(lbyte, hbyte)]))
     135     if n == 1: return targetVar
     136     else: return cgo.expr_string_to_variable(cgo.expr2py(make_and(make_shift_forward(prefix, 1), Var(targetVar))))
     137   #
     138   # One or more bytes of the lower and upper bound may be the same.
     139   # Build a sequence of byte tests.
     140   if hbyte == lbyte:
     141     targetVar = "bytetest_%x" % (lbyte)
     142     cgo.chardef2py(CanonicalCharSetDef(targetVar, [(lbyte, hbyte)]))
     143     return matched_ifsequence_helper(cgo, lo, hi, make_and(make_shift_forward(prefix, 1), Var(targetVar)), n+1, hlen)
     144   # We now have a range involving different bytes at position n.
     145   following_suffix_mask = (1 << ((hlen - n) * 6)) - 1
     146   # A separate test may be needed for the high byte sequence if
     147   # there are constraints on following suffix bytes.
     148   if hi & following_suffix_mask != following_suffix_mask:
     149     hi_floor = hi &~following_suffix_mask     
     150     hiVar = matched_ifsequence_helper(cgo, hi_floor, hi, prefix, n, hlen)
     151     loVar = matched_ifsequence_helper(cgo, lo, hi_floor - 1, prefix, n, hlen)
     152     return cgo.expr_string_to_variable(cgo.expr2py(make_or(Var(loVar), Var(hiVar))))
     153   # A separate test may be needed for the low byte sequence if
     154   # there are constraints on following suffix bytes.
     155   if lo & following_suffix_mask != 0:
     156     low_ceil = lo | following_suffix_mask
     157     hiVar = matched_ifsequence_helper(cgo, low_ceil + 1, hi, prefix, n, hlen)
     158     loVar = matched_ifsequence_helper(cgo, lo, low_ceil, prefix, n, hlen)
     159     return cgo.expr_string_to_variable(cgo.expr2py(make_or(Var(loVar), Var(hiVar))))
     160   #
     161   # Now we have a range that permits all suffix combinations.
     162   # We don't have to test suffixes UNDER THE ASSUMPTION that the utf-8
     163   # has been validated.
     164   targetVar = "bytetest_%x_%x" % (lbyte, hbyte)
     165   cgo.chardef2py(CanonicalCharSetDef(targetVar, [(lbyte, hbyte)]))
     166   if n == 1: return targetVar
     167   return cgo.expr_string_to_variable(cgo.expr2py(make_and(make_shift_forward(prefix, 1), Var(targetVar))))
    124168
    125169
     
    137181     v_lo = utf8_ifrange_compiler(cgo, lo, m)
    138182     v_hi = utf8_ifrange_compiler(cgo, m+1, hi)
    139      range_var = "test_%x_%x" % (lo, hi)
    140      cgo.add_assignment(range_var, cgo.expr2py(make_or(Var(v_lo), Var(v_hi))))
    141      return range_var
     183     return cgo.expr_string_to_variable(cgo.expr2py(make_or(Var(v_lo), Var(v_hi))))
    142184   #
    143185   else:
    144      return matched_ifsequence_compiler(cgo, lo, hi, 1, hi_len)
    145 
    146 
    147 def matched_ifsequence_compiler(cgo, lo, hi, n, hlen):
    148    """ Helper function to generate the code necessary to match bytes
    149        n through hlen (1-based indexing) of the range of utf-8 sequences
    150        for codepoints lo through hi. """
    151    hbyte = utf8_byte(hi, n)
    152    lbyte = utf8_byte(lo, n)
    153    if n == hlen:
    154      targetVar = "bytetest_%x_%x" % (lbyte, hbyte)
    155      cgo.chardef2py(CanonicalCharSetDef(targetVar, [(lbyte, hbyte)]))
    156      return targetVar
    157    #
    158    # One or more bytes of the lower and upper bound may be the same.
    159    # Build a sequence of byte tests.
    160    if hbyte == lbyte:
    161      sfxVar = matched_ifsequence_compiler(cgo, lo, hi, n+1, hlen)
    162      targetVar = "bytetest_%x" % (lbyte)
    163      cgo.chardef2py(CanonicalCharSetDef(targetVar, [(lbyte, hbyte)]))
    164      var2 = targetVar+"_adv"
    165      cgo.add_assignment(var2, cgo.expr2py(make_and(make_shift_forward(Var(targetVar), 1), Var(sfxVar))))
    166      return var2
    167    # We now have a range involving different bytes at position n.
    168    following_suffix_mask = (1 << ((hlen - n) * 6)) - 1
    169    # A separate test may be needed for the high byte sequence if
    170    # there are constraints on following suffix bytes.
    171    if hi & following_suffix_mask != following_suffix_mask:
    172      hi_floor = hi &~following_suffix_mask
    173      hiVar = matched_ifsequence_compiler(cgo, hi_floor, hi, n, hlen)
    174      loVar = matched_ifsequence_compiler(cgo, lo, hi_floor - 1, n, hlen)
    175      targetVar = "range_test_%x_%x_%i" % (lo, hi, n)
    176      cgo.add_assignment(targetVar, cgo.expr2py(make_or(Var(hiVar), Var(loVar))))
    177      return targetVar
    178    # A separate test may be needed for the low byte sequence if
    179    # there are constraints on following suffix bytes.
    180    if lo & following_suffix_mask != 0:
    181      low_ceil = lo | following_suffix_mask
    182      hiVar = matched_ifsequence_compiler(cgo, low_ceil + 1, hi, n, hlen)
    183      loVar = matched_ifsequence_compiler(cgo, lo, low_ceil, n, hlen)
    184      targetVar = "range_test_%x_%x_%i" % (lo, hi, n)
    185      cgo.add_assignment(targetVar, cgo.expr2py(make_or(Var(hiVar), Var(loVar))))
    186      return targetVar
    187    #
    188    # Now we have a range that permits all suffix combinations.
    189    # We don't have to test suffixes UNDER THE ASSUMPTION that the utf-8
    190    # has been validated.
    191    targetVar = "bytetest_%x_%x" % (lbyte, hbyte)
    192    cgo.chardef2py(CanonicalCharSetDef(targetVar, [(lbyte, hbyte)]))
    193    return targetVar
    194 
    195 
    196 
     186     return matched_ifsequence_compiler(cgo, lo, hi, hi_len)
    197187
    198188def generate_utf8_leading_bytes_test(codepoint, bytecount, targetVar):
     
    285275        for subrange in subcc2:
    286276           (lo2, hi2) = subrange
    287            CC_var = "CC_%s_%x_%x" % (k, lo2, hi2)
    288            matched_sequence_compiler(cgo, lo2, hi2, 1, ulen, CC_var)
    289            cgo.add_assignment("struct_%s.cc" % k, cgo.expr2py(make_or(Var("struct_%s.cc" % k), Var(CC_var))))
     277           subrangeE = matched_sequence_compiler(cgo, lo2, hi2, ulen)
     278           if options.grep:
     279              target = "output.matches"
     280           else:
     281              target = "struct_%s.cc" % k
     282           cgo.add_assignment(target, cgo.expr2py(make_or(Var(subrangeE), Var(target))))
    290283
    291284def rangeIntersect(ccList, lo, hi):
     
    320313   cgo = CC_compiler(UTF8(), 'tmp%i', False, '')
    321314   for k in charClassMap.keys():
    322      cgo.add_assignment("struct_%s.cc" % k, '0')
     315     if options.grep:
     316         cgo.add_assignment("output.matches", '0')
     317     else:
     318         cgo.add_assignment("struct_%s.cc" % k, '0')
    323319   generateCharClassDefsInIfHierarchy(cgo, (0, 0x10FFFF), ifRangeList, charClassMap)
    324320   return cgo.showcode()
     
    335331(0x10000, 0x10FFFF)]
    336332
     333
     334
    337335Unicode_CC_struct = "class struct_%s:\n\tcc = 0\n\n"
    338336Unicode_CC_header = "def %s(basis_bits, struct_%s):\n"
     
    344342  struct = Unicode_CC_struct % (general_category)
    345343  header = "def %s(basis_bits, struct_%s):\n" % (general_category, general_category)
    346   code = generateCharClassDefs(defaultIfRangeList, catmap)
    347   return struct + header + "".join(code)
     344  if options.grep:
     345        struct = r"""
     346class Basis_bits():
     347        bit_0 = 0
     348        bit_1 = 0
     349        bit_2 = 0
     350        bit_3 = 0
     351        bit_4 = 0
     352        bit_5 = 0
     353        bit_6 = 0
     354        bit_7 = 0 
     355 
     356class Lex():
     357        LF = (0)
     358 
     359class Output():
     360        matches = 0
     361
     362def ParseLines(basis_bits, lex):
     363        temp1 = (basis_bits.bit_0 | basis_bits.bit_1)
     364        temp2 = (basis_bits.bit_2 | basis_bits.bit_3)
     365        temp3 = (temp1 | temp2)
     366        temp4 = (basis_bits.bit_4 &~ basis_bits.bit_5)
     367        temp5 = (basis_bits.bit_6 &~ basis_bits.bit_7)
     368        temp6 = (temp4 & temp5)
     369        LF = (temp6 &~ temp3)
     370
     371"""
     372        header = "def Demo(basis_bits, lex, output):\n"
     373        main = "\n\ndef Main(basis_bits, lex, output):\n    ParseLines(basis_bits, lex)\n    Demo(basis_bits, lex, output)\n"
     374  else:
     375        struct = Unicode_CC_struct % (general_category)
     376        header = "def %s(basis_bits, struct_%s):\n" % (general_category, general_category)
     377        main = Unicode_dummy_main
     378  if options.flat:
     379      code = generateCharClassDefs([], catmap)
     380  elif options.simple:
     381      code = generateCharClassDefs([(0x80, 0x7FF), (0x800,0xFFFF), (0x10000, 0x10FFF)], catmap)
     382  else:
     383      code = generateCharClassDefs(defaultIfRangeList, catmap)
     384  return struct + header + "".join(code) + main
    348385
    349386
     
    360397                             help='general category; default: Cc',
    361398                             )
     399    option_parser.add_option('-g', '--grep',
     400                             dest='grep',
     401                             action='store_true',
     402                             default=False,
     403                             help='Use grep template',
     404                             ) 
     405    option_parser.add_option('-f', '--flat',
     406                             dest='flat',
     407                             action='store_true',
     408                             default=False,
     409                             help='Flatten the calculations into a single basic block',
     410                             ) 
     411    option_parser.add_option('-s', '--simple',
     412                             dest='simple',
     413                             action='store_true',
     414                             default=False,
     415                             help='Use a simple if-structure on UTF-8 length',
     416                             ) 
    362417    options, args = option_parser.parse_args(sys.argv[1:])
    363418
     
    376431        code = generateDefs1(options.category)
    377432
    378     code += Unicode_dummy_main
    379 
    380433    if (len(args) == 1):
    381434        fh = open(args[0], "w")
Note: See TracChangeset for help on using the changeset viewer.