Changeset 679


Ignore:
Timestamp:
Oct 29, 2010, 2:15:15 PM (8 years ago)
Author:
ksherdy
Message:

Add UTF8 definition for use with both JSON and XML.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • proto/charsetcompiler/charset_compiler.py

    r678 r679  
    154 154 DefinitionSet['LI_with_MarkupPass'] = DefinitionSet['LexicalItems_with_Digit'] + [CharSetDef('AmpHashSlash', ['&', '#', '/'])]
    155 155
    156     #JSON Definitino Sets
    157     DefinitionSet['JSON_UTF8'] =            [
        156 #
        157 # Byte classifications in UTF-8 validation.
        158 DefinitionSet['UTF8'] =         [
    158 159                                 CharSetDef('u8.unibyte', ['\x00-\x7F']),
    159 160                                 CharSetDef('u8.prefix', ['\xC0-\xFF']),
     
    173 174                                 ]
    174 175
    175     DefinitionSet['JSON_Control'] =         [
        176 DefinitionSet['JSON_Control'] = [
    176 177                                 #Control characters
    177 178                                 CharSetDef('ctrl.x00_x1F', ['\x00-\x1F']),
     
    182 183                                 ]
    183 184
    184     DefinitionSet['JSON_Lexical'] =         [
        185 DefinitionSet['JSON_Lexical'] = [
    185 186                                 #Object
    186 187                                 CharDef('lex.LCurlyBrace','{'),
     
    218 219                                 ]
    219 220
    220     DefinitionSet['JSON'] = DefinitionSet['JSON_UTF8'] + DefinitionSet['JSON_Control'] + DefinitionSet['JSON_Lexical']
        221 DefinitionSet['JSON'] = DefinitionSet['UTF8'] + DefinitionSet['JSON_Control'] + DefinitionSet['JSON_Lexical']
    221 222
    222 223 #
     
    257 258            CharSetDef('lex.NameScan', ['_', '-', '.', '0-:', 'A-Z', 'a-z', '\x80-\xFF'])]
    258 259
    259     #
    260     # Byte classifications in UTF-8 validation.
    261     UTF8_defs = [CharSetDef('u8.unibyte', ['\x00-\x7F']),
    262                  CharSetDef('u8.prefix', ['\xC0-\xFF']),
    263                  CharSetDef('u8.prefix2', ['\xC0-\xDF']),
    264                  CharSetDef('u8.prefix3', ['\xE0-\xEF']),
    265                  CharSetDef('u8.prefix4', ['\xF0-\xFF']),
    266                  CharSetDef('u8.suffix', ['\x80-\xBF']),
    267                  CharSetDef('u8.badprefix', ['\xC0-\xC1', '\xF5-\xFF']),
    268                  CharDef('u8.xE0', '\xE0'),
    269                  CharDef('u8.xED', '\xED'),
    270                  CharDef('u8.xF0', '\xF0'),
    271                  CharDef('u8.xF4', '\xF4'),
    272                  CharSetDef('u8.xA0_xBF', ['\xA0-\xBF']),
    273                  CharSetDef('u8.x80_x9F', ['\x80-\x9F']),
    274                  CharSetDef('u8.x90_xBF', ['\x90-\xBF']),
    275                  CharSetDef('u8.x80_x8F', ['\x80-\x8F'])]
    276
    277
    278 260 #
    279 261 UTF8_BOM_bytes = [CharDef('u8.xEF', '\xEF'), CharDef('u8.xBF', '\xBF'), CharDef('u8.xBE', '\xBE')]
    280 262
    281 263 DefinitionSet['parabix2'] = (xml_marks + namelex + DefinitionSet['WS_Control_10']
    282                                  + DefinitionSet['Digit_and_Hex'] + UTF8_defs + UTF8_BOM_bytes)
    283
    284     DefinitionSet['UTF8'] = UTF8_defs
        264                              + DefinitionSet['Digit_and_Hex'] + DefinitionSet['UTF8'] + UTF8_BOM_bytes)
    285 265
    286 266 DefinitionSet['CSV'] = [CharDef('BackSlash', '\\'),
Note: See TracChangeset for help on using the changeset viewer.