Changeset 3349


Ignore:
Timestamp:
Jun 24, 2013, 5:57:34 AM (6 years ago)
Author:
cameron
Message:

Add UTF-16 and UTF-32 modes for charset_compiler.py

Location:
proto/charsetcompiler
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • proto/charsetcompiler/charset_compiler.py

    r2996 r3349  
    22#  Character Class Compiler
    33#
    4 #  Version 0.8 - Feb. 17, 2012
    5 #     Add command-line option for specifying basis_pattern
    6 #  Copyright (c) 2007-12, Robert D. Cameron
    7 #  All rights reserved.
    8 #
    9 #  TO DO Notes
    10 #
    11 #  1.  Perhaps the character set definition should be extended to
    12 #      allow other set operations.   For example, the 'Restricted' set
    13 #      of XML might be defined as ['\x00-\x1f'] - ['\x09', '\x0a', '\x0d']
    14 #      This would require fewer operations to compute.
    15 #
    16 #  2.  The range logic generator should be modified to group
    17 #      bit variables the same way as the individual character logic
    18 #      generator (i.e., combining bits 0 and 1, 2 and 3, 4 and 5 and
    19 #      6 and 7 first.
    20 #
    21 #  3.  Extend for 16-bit and full Unicode character values.
    22 #
     4#  Version 0.9 - June 24, 2013
     5#
     6#  Copyright (c) 2007-13, Robert D. Cameron
     7#  Licensed to the public under Open Software License 3.0
     8#
     9#  Initial UTF-16 and UTF-32 support added by Dale Denis, June 2013.
     10#
     11#  TO DO
     12#    - add autosensing of 16/32 bit characters from input files
     13#    - optimization of range logic for 16-bit char sets.
    2314#
    2415#--------------------------------------------------------------------------
     
    4132
    4233import charset_input_parser
     34
     35class UTF_Encoding_Type:
     36    def __init__(self, name, bits, mask):
     37        self.name = name
     38        self.bits = bits
     39        self.mask = mask
     40        self.basis_pattern = []
     41    def __str__(self): return self.name
     42
     43
     44Encoding = UTF_Encoding_Type
     45
    4346
    4447#
     
    5659        self.varname = varname
    5760    def show(self): return 'Var("' + self.varname + '")'
     61    def __str__(self): return 'Var("' + self.varname + '")'
    5862
    5963class TrueLiteral(BoolExpr):
     
    6165        self.value = True
    6266    def show(self): return 'T'
     67    def __str__(self): return 'T'
    6368
    6469class FalseLiteral(BoolExpr):
     
    7176        self.operand = expr
    7277    def show(self): return 'Not(%s)' % (self.operand.show())
     78    def __str__(self): return 'Not(%s)' % (self.operand.show())
    7379
    7480class And(BoolExpr):
     
    7783        self.operand2 = expr2
    7884    def show(self): return 'And(%s, %s)' % (self.operand1.show(), self.operand2.show())
     85    def __str__(self): return 'And(%s, %s)' % (self.operand1.show(), self.operand2.show())
    7986
    8087class Or(BoolExpr):
     
    96103        self.false_branch = expr3
    97104    def show(self): return 'Sel(%s, %s, %s)' % (self.sel.show(), self.true_branch.show(), self.false_branch.show())
     105    def __str__(self): return 'Sel(%s, %s, %s)' % (self.sel.show(), self.true_branch.show(), self.false_branch.show())
    98106
    99107
     
    235243        else: return False
    236244
    237 
    238245#
    239246#
     
    242249
    243250def bit_var(n):
     251
     252#    return 'bit[%i]' % n
     253
    244254    global options
    245 #    return 'bit[%i]' % n
    246     return options.basis_pattern % n
     255    global Encoding
     256
     257    if len(Encoding.basis_pattern) == 1:
     258        return Encoding.basis_pattern[0] % n
     259   
     260    if Encoding.name == "UTF-16":
     261        if options.little_endian == True:
     262            if n >= 8:
     263                return Encoding.basis_pattern[0] % (n - 8)
     264            else:
     265                return Encoding.basis_pattern[1] % n
     266        else:
     267            if n <= 7:
     268                return Encoding.basis_pattern[0] % n
     269            else:
     270                return Encoding.basis_pattern[1] % (n - 8)
     271
     272    if Encoding.name == "UTF-32":
     273        if options.little_endian == True:
     274            if n >= 21:
     275                return "unused_bit%i" % (n - 21)
     276            elif n < 21 and n >= 16:
     277                return Encoding.basis_pattern[0] % (n - 16)
     278            elif n < 16 and n >= 8:
     279                return Encoding.basis_pattern[1] % (n - 8)
     280            elif n < 8:
     281                return Encoding.basis_pattern[2] % n
     282        else:
     283            if n <= 10:
     284                return "unused_bit%i" % n
     285            elif n > 10 and n <= 15:
     286                return Encoding.basis_pattern[0] % (n - 8)
     287            elif n > 15 and n <= 23:
     288                return Encoding.basis_pattern[1] % (n - 16)
     289            elif n > 23:
     290                return Encoding.basis_pattern[2] % (n - 24)
     291
     292
    247293def make_bitv(n):
    248     return Var(bit_var(7-n))
    249 
    250 
     294       
     295    global options
     296
     297    if options.little_endian == True:
     298        return Var(bit_var(n))
     299    else:
     300        return Var(bit_var((Encoding.bits - 1) -n))
     301       
    251302def make_bit_test(pattern, bit_count):
    252303    if bit_count == 0: return TrueLiteral()
     
    255306    for i in range(0, bit_count):
    256307        if (pattern & test_bit) == 0:
    257             bit_terms.append(make_not(make_bitv(7-i)))
    258         else: bit_terms.append(make_bitv(7-i))
     308            bit_terms.append(make_not(make_bitv((Encoding.bits - 1)-i)))   
     309        else: bit_terms.append(make_bitv((Encoding.bits - 1)-i))           
    259310        test_bit >>= 1
    260311    while len(bit_terms) > 1:
     
    282333      selected_bits &= ~test_bit
    283334      bit_no += 1
     335     
    284336    while len(bit_terms) > 1:
    285337        new_terms = []
     
    289341            new_terms.append(bit_terms[-1])
    290342        bit_terms = new_terms
     343   
    291344    return bit_terms[0]
    292345   
     
    294347def char_test_expr(ch):
    295348    #return make_bit_test(ord(ch), 8)
    296     return bit_pattern_expr(ord(ch), 0xFF)
     349    return bit_pattern_expr(ord(ch), Encoding.mask) 
    297350
    298351def GE_Range(N, n):
     352
    299353    if N == 0: return TrueLiteral()
    300354    elif N % 2 == 0 and (n >> (N - 2)) == 0:
     
    330384BadRange = Exception()
    331385
    332 
    333386def Make_Range(n1, n2):  # require n2 >= n1
    334387    diff_bits = n1 ^ n2
     
    337390        diff_count += 1
    338391        diff_bits >>= 1
    339     if n2 < n1 or diff_count > 8: raise BadRange()
     392    if n2 < n1 or diff_count > Encoding.bits: raise BadRange() 
    340393    mask = 2**(diff_count) - 1
    341394    #common = make_bit_test(n1 >> diff_count, 8 - diff_count)
    342     common = bit_pattern_expr(n1 & ~mask, 0xFF^mask)
     395    common = bit_pattern_expr(n1 & ~mask, Encoding.mask^mask)   
    343396    if diff_count == 0: return common
    344397    mask = 2**(diff_count-1) - 1
    345398    lo_test = GE_Range(diff_count-1, n1 & mask)
    346399    hi_test = LE_Range(diff_count-1, n2 & mask)
     400
    347401    return make_and(common, make_sel(make_bitv(diff_count-1), hi_test, lo_test))
    348402
     
    355409        if charset_item[1] == '-' and ord(charset_item[0]) <= ord(charset_item[2]):
    356410             return Make_Range(ord(charset_item[0]), ord(charset_item[2]))
    357     print charset_item
    358411    raise BadCharSetItem
    359412
     
    361414    if chardef.items == []: return FalseLiteral()
    362415    e1 = char_or_range_expr(chardef.items[0])
    363     for i in range(1, len(chardef.items)):
     416    for i in range(1, len(chardef.items)):   
    364417        e1 = make_or(e1, char_or_range_expr(chardef.items[i]))
    365418    if chardef.complemented: return make_not(e1)
     
    376429        self.generated_code = []
    377430        self.common_expression_map = {}
    378         for sym in predeclared: self.common_expression_map[sym] = sym
     431        for sym in predeclared: self.common_expression_map[sym] = sym             
    379432        self.typedecl = typedecl
    380433    def add_assignment(self, varname, expr):
     
    383436        self.generated_code.append('\t%s%s = %s\n' % (self.typedecl, varname, expr))
    384437    def expr_string_to_variable(self, expr_string):
    385         if self.common_expression_map.has_key(expr_string): 
     438        if self.common_expression_map.has_key(expr_string):
    386439            return self.common_expression_map[expr_string]
    387440        else:
    388             self.gensym_counter += 1
    389             sym = self.gensym_template % self.gensym_counter
    390             self.add_assignment(sym, expr_string)
     441            self.gensym_counter += 1                           
     442            sym = self.gensym_template % self.gensym_counter 
     443            self.add_assignment(sym, expr_string) 
    391444            return sym
     445
    392446    def showcode(self):
    393447        s = ''
     
    436490
    437491def chardeflist2simd(chardeflist):
    438     cgo = CodeGenObject([bit_var(i) for i in range(0,8)])
     492    cgo = CodeGenObject([bit_var(i) for i in range(0,Encoding.bits)])
    439493    for d in chardeflist:
    440494        chardef2simd(cgo, d)
     
    480534def chardef2py(genobj, chardef):
    481535    genobj.add_assignment(chardef.name, expr2py(genobj, charset_expr(chardef)))
    482 
    483 
     536   
    484537def py_chardefmap(chardeflist):
    485538    defs = ["'%s' : %s" % (d.name,d.name) for d in chardeflist]
     
    487540
    488541def chardeflist2py(chardeflist):
    489     cgo = CodeGenObject([bit_var(i) for i in range(0,8)],'')
     542    cgo = CodeGenObject([bit_var(i) for i in range(0, Encoding.bits)],'')
    490543    for d in chardeflist:
    491544        chardef2py(cgo, d)
    492545    return cgo.showcode()# + "  return "+ py_chardefmap(chardeflist) + "\n"
    493546
    494 
    495 
    496547def main():
     548
     549    global Encoding   
    497550
    498551    global options
    499552    # Option definition
    500553    option_parser = optparse.OptionParser(usage='python %prog [options] <input file>', version='0.8')
    501 
     554 
     555    option_parser.add_option('-u', '--character_encoding',
     556                             dest='character_encoding',
     557                             type='string',
     558                             default='UTF-8',
     559                             help='character encoding; default: UTF-8',
     560                             ) 
    502561    option_parser.add_option('-b', '--basis_pattern',
    503562                             dest='basis_pattern',
     
    506565                             help='pattern for basis bit streams; default: basis_bits.bit_%i',
    507566                             )
     567    option_parser.add_option('-l', '--little_endian',
     568                             dest='little_endian',
     569                             action='store_true',
     570                             default=False,
     571                             help='sets bit numbering of the output to little-endian',
     572                             )
    508573    option_parser.add_option('-g', '--gensym_pattern',
    509574                             dest='gensym_pattern',
     
    532597    options, args = option_parser.parse_args(sys.argv[1:])
    533598
     599    error = False
     600
     601    # Set the encoding.
     602    if options.character_encoding == "UTF-32":
     603        Encoding = UTF_Encoding_Type(options.character_encoding, 32, 0xFFFFFFFF)
     604    elif options.character_encoding == "UTF-16":
     605        Encoding = UTF_Encoding_Type(options.character_encoding, 16, 0xFFFF)
     606    elif options.character_encoding == "UTF-8":
     607        Encoding = UTF_Encoding_Type(options.character_encoding, 8, 0xFF)
     608    else:
     609        print "ERROR: Invalid encoding format."
     610        error = True;
     611
     612    # If we have a valid encoding format then set the basis pattern.
     613    if error == False:
     614        Encoding.basis_pattern = string.split(options.basis_pattern, ",")
     615        if len(Encoding.basis_pattern) == 1:
     616            # If we have the default basis pattern string then adjust it
     617            # for UTF-16 or UTF-32.  If the encoding is UTF-8 then we will
     618            # leave it as is.
     619            if "basis_bits.bit_%i" in Encoding.basis_pattern[0]:
     620                if "UTF-16" in Encoding.name:
     621                    Encoding.basis_pattern[0] = "u16_bit%i"
     622                elif "UTF-32" in Encoding.name:
     623                    Encoding.basis_pattern[0] = "u32_bit%i"
     624        elif len(Encoding.basis_pattern) == 2:
     625            if "UTF-16" not in Encoding.name:
     626                print "ERROR: Invalid encoding for basis pattern variables."
     627                error = True
     628        elif len(Encoding.basis_pattern) == 3:
     629            if "UTF-32" not in Encoding.name:
     630                print "ERROR: Invalid encoding for basis pattern variables."
     631                error = True
     632        else:
     633            print "ERROR: Invalid number of basis pattern variables."
     634            error = True
     635               
     636           
    534637    # Positional arguments
    535     if len(args) == 1:
     638    if (len(args) == 1) and (error == False):
    536639        # if the specified argument is not in the DefinitionSet, then assume that it's a filename
    537640        if args[0] not in DefinitionSet:
     
    540643        else: defs = DefinitionSet[args[1]]
    541644        if options.use_EBCDIC:
    542             defs = EBCDIC.ascii2ebcdic_chardeflist(defs)       
     645            defs = EBCDIC.ascii2ebcdic_chardeflist(defs)
    543646        stmts = chardeflist2py(defs)
    544647        if options.Pablo_skeleton or options.test_skeleton:
     
    572675          print "\ndef main():\n\tDo_defs(%s)\n" % params
    573676        else: print stmts 
    574 
    575677    else:
    576678        option_parser.print_usage()
     679       
    577680
    578681if __name__ == "__main__": main()
  • proto/charsetcompiler/charset_input_parser.py

    r2232 r3349  
    141141        # read per line
    142142        while (string != ""):
    143                 string = string.decode('string_escape')
     143                string = string.decode('unicode_escape')
    144144
    145145                # '#' indicates comment
Note: See TracChangeset for help on using the changeset viewer.