source: icGREP/icgrep-devel/UCD-scripts/UnicodeNameData.py @ 5671

Last change on this file since 5671 was 5655, checked in by cameron, 23 months ago

UCD_Config.h, add Indic and CompositionExclusion properties

File size: 1.9 KB
RevLine 
#
# UnicodeNameData.py
#
# Robert D. Cameron
# March 2, 2016
#
# Licensed under Open Software License 3.0.
#
#
import os.path
import re
import string

import cformat
import UCD_config
from UCD_parser import *
13
14
# C++ source template for the generated UnicodeNameData.cpp.
#
# It is filled with the % operator using three values, in order:
#   1. the value assigned to Unamesize,
#   2. the number of extra bytes added to the Unamedata array bound,
#   3. the text placed inside the C++ raw string literal R"___(...)___".
#
# The generated getUnicodeNameDataSize() returns Unamesize-1.
UnicodeNameData_cpp_template = r"""
#include "UnicodeNameData.h"
const int Unamesize = %s;
char __attribute__ ((aligned (32))) Unamedata[Unamesize + %s] = R"___(%s)___";

char * getUnicodeNameDataPtr() {
  return Unamedata;
}
int getUnicodeNameDataSize() {
  return Unamesize-1;
}
"""
27
# Matches a name field that begins with an angle-bracketed label such as
# "<control>"; such fields are treated as "not actually a name".
NonName_regexp = re.compile(r"<[^>]*>")
[4948]29
def genUnicodeNameData():
    """Generate UnicodeNameData.cpp from the parsed UnicodeData.txt records.

    Each record returned by parse_UnicodeData_txt() contributes one line of
    the form "<cp>;<name>\\n" to the embedded name data, except records whose
    name field starts with an angle-bracketed label (NonName_regexp), which
    are skipped.  The result is substituted into UnicodeNameData_cpp_template
    and written to UCD_config.UCD_output_dir + '/UnicodeNameData.cpp'.
    """
    (parsed_data, ranges) = parse_UnicodeData_txt()

    # Collect the lines in a list and join once instead of repeated "+=".
    name_lines = []
    for record in parsed_data:
        # Only the first two fields (codepoint, name) are used here.
        cp, name = record[:2]
        if NonName_regexp.match(name):
            continue   # Skip codepoints whose name field is not actually a name.
        name_lines.append(cp + ";" + name + "\n")

    # Range records are currently not expanded into individual names.
    # NOTE(review): if this is re-enabled, range(lo, hi) excludes hi; confirm
    # whether hi_cp is meant to be inclusive before using it.
    # for range_record in ranges:
    #     (lo_cp, hi_cp, range_name, gc, ccc, bidic, decomp, decval, digitval, numval, bidim, uc, lc, tc) = range_record
    #     print(lo_cp, hi_cp, range_name)
    #     if range_name[:13] == "CJK Ideograph":
    #         for cp in range(int(lo_cp,16), int(hi_cp,16)):
    #             name_data_string += "%04X;CJK UNIFIED IDEOGRAPH-%04X\n" % (cp, cp)
    #     elif range_name[:16] == "Tangut Ideograph":
    #         for cp in range(int(lo_cp,16), int(hi_cp,16)):
    #             name_data_string += "%04X;TANGUT IDEOGRAPH-%04X\n" % (cp, cp)
    #     elif range_name[:5] == "Nushu":
    #         for cp in range(int(lo_cp,16), int(hi_cp,16)):
    #             name_data_string += "%04X;NUSHU CHARACTER-%04X\n" % (cp, cp)

    name_data_string = "".join(name_lines)
    name_data_len = len(name_data_string)

    # Unamesize is the data length plus one; the extra array bound
    # 255 - (len % 256) makes the total array size a multiple of 256.
    cpp_text = UnicodeNameData_cpp_template % (
        name_data_len + 1, 255 - (name_data_len % 256), name_data_string)

    # "with" guarantees the file is closed even if the write fails.
    with open(UCD_config.UCD_output_dir + '/UnicodeNameData.cpp', 'w') as f:
        f.write(cpp_text)
54
# Script entry point: regenerate UnicodeNameData.cpp when run directly.
if __name__ == "__main__":
    genUnicodeNameData()
Note: See TracBrowser for help on using the repository browser.