- Timestamp:
- Oct 3, 2017, 2:18:24 PM (17 months ago)
- Location:
- icGREP/icgrep-devel/UCD-scripts
- Files:
-
- 3 edited
Legend:
- Unmodified
- Added
- Removed
-
icGREP/icgrep-devel/UCD-scripts/UCD_parser.py
r5661 r5662 76 76 return (property_enum_name_list, property_object_map) 77 77 78 79 # 80 # Property Default Value Specifications 81 # 82 # The UCD uses special comment lines ("@missing specifications") to declare default 83 # values for properties. Examples showing the two common formats are: 84 # (1) Blocks.txt # @missing: 0000..10FFFF; No_Block 85 # (2) PropertyValueAliases.txt # @missing: 0000..10FFFF; Case_Folding; <code point> 86 # The general format gives a range of codepoints (generally 0000..10FFFF), 87 # an optional property name (if the file containing the specification defines 88 # many different properties), and the default value. 89 # 90 # There are some important default values for different property types: 91 # <code point>: This is a default value for certain String properties, 92 # indicating the default for a codepoint under the given property 93 # is to map to itself. 94 # <none>: This is a default for certain String properties indicating that 95 # the default value for a code point is the empty string. 96 # <script>: The default value for the ScriptExtensions property is the 97 # value of the Script property. 98 # NaN The default value for numeric properties is the NaN (not a number) value. 99 # 100 101 # Given a line known to contain such a @missing specification, 102 # parse_missing_spec(data_line) returns a (cp_lo, cp_hi, fields) triple. 103 # Generally, cp_lo = 0 and cp_hi = 0x10FFFF 104 # The list of fields contains one or two entries: an optional 105 # property name and the default value specified for the range. 106 # @missing specifications generally omit the property name when 107 # the file being processed is defined for a single property only. 
108 # 109 UCD_missing_check = re.compile("^#\s*@missing:.*") 110 UCD_missing_regexp = re.compile("^#\s*@missing:\s*([0-9A-F]{4,6})[.][.]([0-9A-F]{4,6})\s*;\s*([^#]*)(?:#|$)") 111 112 def parse_missing_spec(data_line): 113 m = UCD_missing_regexp.match(data_line) 114 if not m: raise Exception("UCD missing spec parsing error: " + data_line) 115 cp_lo = int(m.group(1), 16) 116 cp_hi = int(m.group(2), 16) 117 # We may have to restructure in the event that missing specs do not cover the full Unicode range. 118 if cp_lo != 0 or cp_hi != 0x10FFFF: raise Exception("Unexpected range error in missing spec: " + data_line) 119 field_data = m.group(3) 120 fields = field_data.split(';') 121 fields = [f.lstrip().rstrip() for f in fields] 122 return (cp_lo, cp_hi, fields) 123 124 # 125 # Missing specifications and other types of UCD data records often produce 126 # a list of one or two fields which indicate a property and a value. 127 # 128 # parse_property_and_value(fields, property_lookup_map) checks that 129 # first of the given fields is indeed a property identifier identified 130 # in the given lookup map, and returns a pair consisting of the 131 # unique property code for the property, plus a corresponding value 132 # (or None, if only one field was given). 
133 # 134 def parse_property_and_value(fields, property_lookup_map): 135 if len(fields) > 2: raise Exception("Too many fields") 136 if len(fields) == 0: raise Exception("Expecting at least 1 field") 137 canon = canonicalize(fields[0]) 138 if not canon in property_lookup_map: raise Exception("Unexpected name: " + name_str) 139 pcode = property_lookup_map[canon] 140 if len(fields) == 1: return (pcode, None) 141 else: return (pcode, fields[1]) 142 78 143 # 79 144 # UCD Property File Format 2: property value aliases … … 92 157 # non-enumerated types 93 158 159 94 160 def initializePropertyValues(property_object_map, property_lookup_map): 95 161 UCD_property_value_missing_regexp = re.compile("^#\s*@missing:\s*([0-9A-F]{4,6})[.][.]([0-9A-F]{4,6})\s*;\s*([-A-Za-z_0-9.]+)\s*;\s*([-A-Za-z_0-9.<> ]+)\s*([^#]*)") … … 100 166 for t in lines: 101 167 if UCD_skip.match(t): 102 m = UCD_property_value_missing_regexp.match(t) 103 if m: 104 if m.group(1) != '0000' or m.group(2) != '10FFFF': raise Exception("Bad missing spec: " + s) 105 cname = canonicalize(m.group(3)) 106 if not cname in property_lookup_map: raise Exception("Bad missing property: " + s) 107 property_object_map[property_lookup_map[cname]].setDefaultValue(m.group(4)) 168 if UCD_missing_check.match(t): 169 (cp_lo, cp_hi, fields) = parse_missing_spec(t) 170 (property_code, default_value) = parse_property_and_value(fields, property_lookup_map) 171 property_object_map[property_code].setDefaultValue(default_value) 108 172 continue # skip comment and blank lines 109 173 m = UCD_property_value_alias_regexp.match(t) … … 147 211 UCD_range_regexp = re.compile("^([0-9A-F]{4,6})[.][.]([0-9A-F]{4,6})([^#]*)(?:#|$)") 148 212 213 # 214 # parse_data_record is a generic parser for most of the UCD data files. 
215 # Given a data_line beginning with a codepoint or codepoint range, 216 # this function returns a (cp_lo, cp_hi, fields) triple giving the 217 # low and high codepoints of the range (these values may be equal in 218 # the case of a single codepoint), as well as a list of fields. 219 # The semicolon separators are removed as well as leading or trailing 220 # whitespace for each field value. 221 149 222 def parse_data_record(data_line): 150 223 m = UCD_point_regexp.match(data_line) … … 169 242 return (cp_lo, cp_hi, fields) 170 243 171 UCD_missing_regexp = re.compile("^#\s*@missing:\s*([0-9A-F]{4,6})[.][.]([0-9A-F]{4,6})\s*;\s*([^#]*)(?:#|$)") 172 173 def parse_missing_spec(data_line): 174 m = UCD_missing_regexp.match(data_line) 175 if not m: raise Exception("UCD missing spec parsing error: " + data_line) 176 cp_lo = int(m.group(1), 16) 177 cp_hi = int(m.group(2), 16) 178 field_data = m.group(3) 179 fields = field_data.split(';') 180 fields = [f.lstrip().rstrip() for f in fields] 181 return (cp_lo, cp_hi, fields) 182 183 def parse_property_and_value(fields, property_lookup_map): 184 if len(fields) > 2: raise Exception("Too many fields") 185 if len(fields) == 0: raise Exception("Expecting at least 1 field") 186 canon = canonicalize(fields[0]) 187 if not canon in property_lookup_map: raise Exception("Unexpected name: " + name_str) 188 pcode = property_lookup_map[canon] 189 if len(fields) == 1: return (pcode, None) 190 else: return (pcode, fields[1]) 191 244 245 # parse_multisection_property_data parses such a file and populates 246 # the property objects for each property through successive calls to 247 # the corresponding addDataRecord method. 248 # 192 249 def parse_multisection_property_data(pfile, property_object_map, property_lookup_map): 193 250 f = open(UCD_config.UCD_src_dir + "/" + pfile) … … 214 271 return props 215 272 273 274 # 275 # Some UCD files are defined for a single property. 
276 # parse_property_data deals with such a file, given the property 277 # object to populate and the file root. 278 # 279 216 280 def parse_property_data(property_object, pfile): 217 281 f = open(UCD_config.UCD_src_dir + "/" + pfile) … … 232 296 property_object.finalizeProperty() 233 297 298 299 # 300 # Some UCD files are organized to support multiple properties with one 301 # property per column. 302 # parse_multicolumn_property_data deals with such files given a list of 303 # property codes. 304 # 305 234 306 def parse_multicolumn_property_data(pfile, property_object_map, property_lookup_map, prop_code_list): 235 307 f = open(UCD_config.UCD_src_dir + "/" + pfile) … … 248 320 property_object_map[p].finalizeProperty() 249 321 250 def parse_ScriptExtensions_txt(script_property_object):251 filename_root = 'ScriptExtensions'252 parse_property_data(script_property_object, filename_root + '.txt')253 254 322 UnicodeData_txt_regexp = re.compile("^([0-9A-F]{4,6});([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);(.*)$") 255 323 256 NonName Range_regexp = re.compile("<([^>]*)>")324 NonName_regexp = re.compile("<([^>]*)>") 257 325 NameRange_regexp = re.compile("<([^,]*), (First|Last)>") 258 259 def parse_UnicodeData_txt():260 data_records = []261 range_records = []262 name_range_starts = {}263 f = open(UCD_config.UCD_src_dir + "/UnicodeData.txt")264 lines = f.readlines()265 for t in lines:266 if UCD_skip.match(t):267 continue # skip comment and blank lines268 m = UnicodeData_txt_regexp.match(t)269 if not m: raise Exception("Unknown syntax: %s" % t)270 (cp, name, gc) = (m.group(1), m.group(2), m.group(3))271 (ccc, bidic, decomp, bidim) = (m.group(4), m.group(5), m.group(6), m.group(10))272 (decval, digitval, numval) = (m.group(7), m.group(8), m.group(9))273 # Unicode 1 name and ISO comment are obolete274 (uc, lc, tc) = (m.group(13), m.group(14), m.group(15))275 nonNameMatch = NonNameRange_regexp.match(name)276 if 
nonNameMatch:277 rangeMatch = NameRange_regexp.match(name)278 if rangeMatch:279 rangeName = rangeMatch.group(1)280 print(rangeName, rangeMatch.group(2))281 if rangeMatch.group(2) == 'First': name_range_starts[rangeName] = cp282 if rangeMatch.group(2) == 'Last':283 if not rangeName in name_range_starts: raise Exception("UnicodeData range end encountered without prior range start: %s" % t)284 range_records.append((name_range_starts[rangeName], cp, rangeName, gc, ccc, bidic, decomp, decval, digitval, numval, bidim, uc, lc, tc))285 continue286 data_records.append((cp, name, gc, ccc, bidic, decomp, decval, digitval, numval, bidim, uc, lc, tc))287 return (data_records, range_records)288 326 289 327 # Parse a decomposition mapping field in one of two forms: … … 291 329 # (b) canonical mappings: {codepoint} 292 330 compatibility_regexp = re.compile("^<([^>]*)>\s*([0-9A-F ]*)$") 293 codepoints_regexp = re.compile("^[0-9A-F]{4,6}(?: +[0-9A-F]{4,6})*$")294 331 def parse_decomposition(s): 295 332 m = compatibility_regexp.match(s) … … 300 337 decomp_type = "Canonical" 301 338 mapping = s 302 m = codepoints_regexp.match(mapping) 303 if not m: raise Exception("Bad codepoint string syntax in parse_decomposition: %s" % mapping) 304 cps = [int(x, 16) for x in mapping.split(" ")] 305 return (decomp_type, cps) 306 339 return (decomp_type, mapping) 340 341 def parse_UnicodeData_txt(property_object_map): 342 data_records = [] 343 range_records = [] 344 name_range_starts = {} 345 f = open(UCD_config.UCD_src_dir + "/UnicodeData.txt") 346 lines = f.readlines() 347 for t in lines: 348 if UCD_skip.match(t): 349 continue # skip comment and blank lines 350 m = UnicodeData_txt_regexp.match(t) 351 if not m: raise Exception("Unknown syntax: %s" % t) 352 (cp, name, gc) = (int(m.group(1), 16), m.group(2), m.group(3)) 353 (ccc, bidic, decomp, bidim) = (m.group(4), m.group(5), m.group(6), m.group(10)) 354 (decval, digitval, numval) = (m.group(7), m.group(8), m.group(9)) 355 # Unicode 1 name and ISO 
comment are obolete 356 (uc, lc, tc) = (m.group(13), m.group(14), m.group(15)) 357 rangeMatch = NameRange_regexp.match(name) 358 if rangeMatch: 359 rangeName = rangeMatch.group(1) 360 print(rangeName, rangeMatch.group(2)) 361 if rangeMatch.group(2) == 'First': name_range_starts[rangeName] = cp 362 if rangeMatch.group(2) == 'Last': 363 if not rangeName in name_range_starts: raise Exception("UnicodeData range end encountered without prior range start: %s" % t) 364 range_records.append((name_range_starts[rangeName], cp, rangeName, gc, ccc, bidic, decomp, decval, digitval, numval, bidim, uc, lc, tc)) 365 continue 366 if not NonName_regexp.match(name): 367 property_object_map['na'].addDataRecord(cp, cp, name) 368 if not decomp == '': 369 (decomp_type, mapping) = parse_decomposition(decomp) 370 property_object_map['dm'].addDataRecord(cp, cp, mapping) 371 if not uc == '': 372 property_object_map['suc'].addDataRecord(cp, cp, uc) 373 if tc == '': 374 property_object_map['stc'].addDataRecord(cp, cp, uc) 375 if not lc == '': 376 property_object_map['slc'].addDataRecord(cp, cp, lc) 377 if not tc == '': 378 property_object_map['stc'].addDataRecord(cp, cp, tc) 379 property_object_map['na'].finalizeProperty() 380 property_object_map['dm'].finalizeProperty() 381 property_object_map['slc'].finalizeProperty() 382 property_object_map['suc'].finalizeProperty() 383 property_object_map['stc'].finalizeProperty() 384 -
icGREP/icgrep-devel/UCD-scripts/UCD_properties.py
r5661 r5662 44 44 45 45 46 def emit_string_property(f, property_code, null_set, reflexive_set, string_values): 47 f.write(" namespace %s_ns {\n" % property_code.upper()) 48 f.write(" /** Code Point Ranges for %s mapping to <none> \n " % property_code) 49 f.write(cformat.multiline_fill(['[%04x, %04x]' % (lo, hi) for (lo, hi) in uset_to_range_list(null_set)], ',', 8)) 50 f.write("**/\n") 51 f.write(" const UnicodeSet null_codepoint_set \n") 52 f.write(null_set.showC(12) + ";\n") 53 f.write(" /** Code Point Ranges for %s mapping to <codepoint> \n " % property_code) 54 f.write(cformat.multiline_fill(['[%04x, %04x]' % (lo, hi) for (lo, hi) in uset_to_range_list(reflexive_set)], ',', 8)) 55 f.write("**/\n") 56 f.write(" const UnicodeSet reflexive_set \n") 57 f.write(reflexive_set.showC(12) + ";\n") 58 f.write(" const unsigned buffer_length = %s;\n" % string_values.len()) 59 f.write(" const char * string_buffer = u8 R\"__(%s)__\";\n") 60 f.write(" static StringPropertyObject property_object{%s, null_codepoint_set, reflexive_set, string_buffer, buffer_length};\n }\n" % property_code) 46 def emit_string_property(f, property_code, null_set, reflexive_set, cp_value_map): 47 s = string.Template(r""" namespace ${prop_enum_up}_ns { 48 /** Code Point Ranges for ${prop_enum} mapping to <none> 49 ${null_set_ranges}**/ 50 51 const UnicodeSet null_codepoint_set 52 ${null_set_value}; 53 54 /** Code Point Ranges for ${prop_enum} mapping to <codepoint> 55 ${reflexive_set_ranges}**/ 56 const UnicodeSet reflexive_set 57 ${reflexive_set_value}; 58 59 const unsigned buffer_length = ${buffer_length}; 60 const static char __attribute__ ((aligned (32))) string_buffer[${allocation_length}] = u8R"__(${string_buffer})__"; 61 62 const static std::vector<codepoint_t> defined_cps = { 63 ${explicitly_defined_cps}}; 64 static StringPropertyObject property_object(${prop_enum}, 65 null_codepoint_set, 66 reflexive_set, 67 static_cast<const char *>(string_buffer), 68 buffer_length, 69 defined_cps); 70 } 
71 """) 72 cps = sorted(cp_value_map.keys()) 73 string_buffer = "" 74 for cp in cps: 75 string_buffer += cp_value_map[cp] + "\n" 76 buffer_length = len(string_buffer.encode("utf-8")) 77 f.write(s.substitute(prop_enum = property_code, 78 prop_enum_up = property_code.upper(), 79 string_buffer = string_buffer, 80 buffer_length = buffer_length, 81 allocation_length = (buffer_length + 255) & -256, 82 null_set_ranges = cformat.multiline_fill(['[%04x, %04x]' % (lo, hi) for (lo, hi) in uset_to_range_list(null_set)], ',', 8), 83 null_set_value = null_set.showC(12), 84 reflexive_set_ranges = cformat.multiline_fill(['[%04x, %04x]' % (lo, hi) for (lo, hi) in uset_to_range_list(reflexive_set)], ',', 8), 85 reflexive_set_value = reflexive_set.showC(12), 86 explicitly_defined_cp_count = len(cps), 87 explicitly_defined_cps = cformat.multiline_fill(['0x%04x' % cp for cp in cps], ',', 8) 88 )) 61 89 62 90 … … 98 126 99 127 def load_property_name_info(self): 100 #(self.property_enum_name_list, self.full_name_map, self.property_lookup_map, self.property_kind_map) = parse_PropertyAlias_txt()101 128 (self.property_enum_name_list, self.property_object_map) = parse_PropertyAlias_txt() 102 129 self.property_lookup_map = getPropertyLookupMap(self.property_object_map) … … 169 196 emit_enumerated_property(f, property_code, independent_prop_values, prop_values, property_object.value_map) 170 197 print("%s: %s bytes" % (property_object.getPropertyFullName(), sum([property_object.value_map[v].bytes() for v in property_object.value_map.keys()]))) 171 #elif isinstance(property_object, StringPropertyObject):172 # emit_string_property(f, property_code, property_object.value_map)198 elif isinstance(property_object, StringPropertyObject): 199 emit_string_property(f, property_code, property_object.null_str_set, property_object.reflexive_set, property_object.cp_value_map) 173 200 174 201 def generate_property_value_file(self, filename_root, property_code): … … 215 242 
self.property_data_headers.append(basename) 216 243 244 def generate_UnicodeData_h(self): 245 basename = 'UnicodeData' 246 parse_UnicodeData_txt(self.property_object_map) 247 f = cformat.open_header_file_for_write(basename) 248 cformat.write_imports(f, ['"PropertyAliases.h"', '"PropertyObjects.h"', '"PropertyValueAliases.h"', '"unicode_set.h"']) 249 prop_code_list = ['na', 'dm', 'suc', 'slc', 'stc'] 250 f.write("\nnamespace UCD {\n") 251 for p in prop_code_list: 252 self.emit_property(f, p) 253 property_object = self.property_object_map[p] 254 self.supported_props.append(p) 255 f.write("}\n\n") 256 cformat.close_header_file(f) 257 self.property_data_headers.append(basename) 258 217 259 def generate_ScriptExtensions_h(self): 218 260 filename_root = 'ScriptExtensions' … … 220 262 extension_object = self.property_object_map['scx'] 221 263 extension_object.setBaseProperty(self.property_object_map['sc']) 222 parse_ ScriptExtensions_txt(extension_object)264 parse_property_data(extension_object, filename_root+'.txt') 223 265 basename = os.path.basename(filename_root) 224 266 f = cformat.open_header_file_for_write(basename) … … 289 331 # Next parse all property value names and their aliases. Generate the data. 290 332 ucd.load_property_value_info() 333 334 ucd.generate_UnicodeData_h() 291 335 # 292 336 # The Age property -
icGREP/icgrep-devel/UCD-scripts/UCD_property_objects.py
r5659 r5662 118 118 if not enum_code in self.name_list_order: self.name_list_order.append(enum_code) 119 119 120 def emit():121 f.write("\nnamespace UCD {\n")122 f.write(" namespace %s_ns {\n" % self.property_code.upper())123 #f.write(" const unsigned independent_prop_values = %s;\n" % self.independent_prop_values)124 for v in self.property_values:125 f.write(" /** Code Point Ranges for %s\n " % v)126 f.write(cformat.multiline_fill(['[%04x, %04x]' % (lo, hi) for (lo, hi) in uset_to_range_list(value_map[v])], ',', 4))127 f.write("**/\n")128 f.write(" const UnicodeSet %s_Set \n" % v.lower())129 f.write(self.value_map[v].showC(8) + ";\n")130 print("%s: %s bytes" % (basename, sum([self.value_map[v].bytes() for v in self.value_map.keys()])))131 set_list = ['&%s_Set' % v.lower() for v in self.property_values]132 f.write(" static EnumeratedPropertyObject property_object\n")133 f.write(" {%s,\n" % self.property_code)134 f.write(" %s_ns::independent_prop_values,\n" % self.property_code.upper())135 f.write(" %s_ns::enum_names,\n" % self.property_code.upper())136 f.write(" %s_ns::value_names,\n" % self.property_code.upper())137 f.write(" %s_ns::aliases_only_map,\n" % self.property_code.upper())138 f.write(" {")139 f.write(cformat.multiline_fill(set_list, ',', 8))140 f.write("\n }};\n }\n}\n")141 142 120 class BinaryPropertyObject(PropertyObject): 143 121 def __init__(self): … … 197 175 self.value_map[k] = uset_union(self.value_map[k], base_set) 198 176 177 codepoint_String_regexp = re.compile("^[A-F0-9]{4,6}(?: [A-F0-9]{4,6})*$") 199 178 class StringPropertyObject(PropertyObject): 200 179 def __init__(self): 201 180 PropertyObject.__init__(self) 202 self.str_value_map = {} 203 181 self.cp_value_map = {} 182 self.null_str_set = empty_uset() 183 self.reflexive_set = empty_uset() 184 204 185 def getPropertyKind(self): 205 186 if self.property_code in ['scf', 'slc', 'suc', 'stc']: … … 209 190 210 191 def addDataRecord(self, cp_lo, cp_hi, stringValue): 211 if not 
self.property_code in ['na', 'JSN', 'na1', 'isc'] and stringValue != '': 212 s = "" 213 for cp in [int(x, 16) for x in stringValue.split(' ')]: 214 s+= chr(cp) 215 stringValue = s 216 for cp in range(cp_lo, cp_hi+1): 217 self.str_value_map[cp] = stringValue 192 if stringValue == '': 193 self.null_str_set = uset_union(self.null_str_set, range_uset(cp_lo, cp_hi)) 194 else: 195 if codepoint_String_regexp.match(stringValue): 196 s = "" 197 for cp in [int(x, 16) for x in stringValue.split(' ')]: 198 s += chr(cp) 199 stringValue = s 200 for cp in range(cp_lo, cp_hi+1): 201 if len(stringValue) == 1 and ord(stringValue[0]) == cp: 202 print("Found reflexive entry for %s: %s" % (self.property_code, stringValue)) 203 self.reflexive_set = uset_union(self.reflexive_set, singleton_uset(ord(stringValue[0]))) 204 else: 205 self.cp_value_map[cp] = stringValue 206 207 def finalizeProperty(self): 208 explicitly_defined_cps = empty_uset() 209 for cp in self.cp_value_map.keys(): 210 explicitly_defined_cps = uset_union(explicitly_defined_cps, singleton_uset(cp)) 211 # set <script> default 212 if self.default_value == "<code point>": 213 self.reflexive_set = uset_union(self.reflexive_set, uset_complement(uset_union(explicitly_defined_cps, self.null_str_set))) 214 else: 215 self.null_str_set = uset_union(self.null_str_set, uset_complement(uset_union(explicitly_defined_cps, self.reflexive_set))) 218 216 219 217 def getPropertyLookupMap(property_object_map):
Note: See TracChangeset
for help on using the changeset viewer.