1 | # |
---|
2 | # generate_UCD_tests.py - |
---|
3 | # generating <greptest> XML test cases (grepcase entries) for various Unicode properties |
---|
4 | # |
---|
5 | # Robert D. Cameron |
---|
6 | # January 31, 2015 |
---|
7 | # |
---|
8 | # Licensed under Open Software License 3.0. |
---|
9 | # |
---|
10 | # |
---|
11 | import re, string, os.path, UCD.cformat |
---|
12 | from random import randint |
---|
13 | from UCD.unicode_set import * |
---|
14 | from UCD.UCD_parser import * |
---|
15 | from if_hierarchy import * |
---|
16 | from string import Template |
---|
17 | |
---|
class UCD_test_generator():
    """Generate <grepcase> test entries from parsed Unicode property data.

    Typical use: call load_all() once, then call
    generate_level_1_property_terms() and/or
    generate_random_property_expressions() to obtain lists of
    '<grepcase .../>' strings.  Each string pairs a regular expression
    with the number of code points of ``all_good_set`` that it is
    expected to match.
    """

    # Wrapper shared by every generated test case: (regexp, expected count).
    _GREPCASE = r"""<grepcase regexp="%s" datafile="All_good" grepcount="%i"/>"""

    def __init__(self):
        # property code -> {property value -> code point set}
        self.enum_value_map = {}
        # binary property name -> code point set
        self.binary_value_map = {}
        # Code points that the expected counts are computed against:
        # U+0020..U+10FFFF without the surrogate gap (U+D800..U+DFFF),
        # U+0085 and U+2028..U+2029.
        # NOTE(review): presumably this mirrors the contents of the
        # "All_good" data file named in the generated test cases -- confirm.
        good = uset_union(range_uset(0x20, 0xD7FF), range_uset(0xE000, 0x10FFFF))
        good = uset_difference(good, singleton_uset(0x85))
        good = uset_difference(good, range_uset(0x2028, 0x2029))
        self.all_good_set = good

    def load_property_name_info(self):
        """Store the four results of parse_PropertyAlias_txt() on self."""
        (self.property_enum_name_list,
         self.full_name_map,
         self.property_lookup_map,
         self.property_kind_map) = parse_PropertyAlias_txt()

    def load_property_value_info(self):
        """Store the results of parse_PropertyValueAlias_txt() on self.

        Requires load_property_name_info() to have been called, because
        ``self.property_lookup_map`` is passed to the parser.
        """
        (self.property_value_list,
         self.property_value_enum_integer,
         self.property_value_full_name_map,
         self.property_value_lookup_map,
         self.missing_specs) = parse_PropertyValueAlias_txt(self.property_lookup_map)

    def load_enumerated_property_data(self, filename_root, property_code):
        """Parse ``filename_root + '.txt'`` and record its value map.

        The resulting map is stored as ``self.enum_value_map[property_code]``.
        """
        vlist = self.property_value_list[property_code]
        canon_map = self.property_value_lookup_map[property_code]
        # Only the value map is needed; the first result is discarded.
        (_, value_map) = parse_UCD_enumerated_property_map(
            property_code, vlist, canon_map, filename_root + '.txt')
        self.enum_value_map[property_code] = value_map

    def load_ScriptExtensions_data(self):
        """Parse the Script_Extensions data and record it under 'scx'.

        Script_Extensions shares its value names with the Script ('sc')
        property, so the 'sc' value list and lookup map are passed in.
        """
        vlist = self.property_value_list['sc']
        (_, value_map) = parse_ScriptExtensions_txt(vlist, self.property_value_lookup_map['sc'])
        self.enum_value_map['scx'] = value_map

    def load_binary_properties_data(self, filename_root):
        """Parse ``filename_root + '.txt'`` and record each property's set.

        Every property name returned by the parser is added to
        ``self.binary_value_map``.
        """
        (props, prop_map) = parse_UCD_codepoint_name_map(filename_root + '.txt', self.property_lookup_map)
        self.binary_value_map.update((p, prop_map[p]) for p in props)

    def load_others(self):
        """Define ``self.others`` and the sets for those property names.

        Requires the 'gc' enumerated data and the binary property data
        (WSpace, Upper, Lower, Alpha, NChar, DI) to be loaded already.
        """
        self.others = ['Alphabetic', 'Uppercase', 'Lowercase', 'White_Space',
                       'Noncharacter_Code_Point', 'Default_Ignorable_Code_Point',
                       'ANY', 'ASCII', 'ASSIGNED']
        self.binary_value_map['ANY'] = range_uset(0, 0x10FFFF)
        self.binary_value_map['ASCII'] = range_uset(0, 0x7F)
        # ASSIGNED is everything whose General_Category is not Cn.
        self.binary_value_map['ASSIGNED'] = uset_complement(self.enum_value_map['gc']['Cn'])
        # Make the long property names available alongside the short names
        # under which the sets were loaded.
        long_to_short = [
            ('White_Space', 'WSpace'),
            ('Uppercase', 'Upper'),
            ('Lowercase', 'Lower'),
            ('Alphabetic', 'Alpha'),
            ('Noncharacter_Code_Point', 'NChar'),
            ('Default_Ignorable_Code_Point', 'DI'),
        ]
        for long_name, short_name in long_to_short:
            self.binary_value_map[long_name] = self.binary_value_map[short_name]

    def load_all(self):
        """Load everything the generators need, in dependency order."""
        # First parse all property names and their aliases
        self.load_property_name_info()
        # Next parse all property value names and their aliases.
        self.load_property_value_info()
        # The Block property
        self.load_enumerated_property_data('Blocks', 'blk')
        # Scripts
        self.load_enumerated_property_data('Scripts', 'sc')
        # Script Extensions
        self.load_ScriptExtensions_data()
        # General Category
        self.load_enumerated_property_data('extracted/DerivedGeneralCategory', 'gc')
        # Core Properties
        self.load_binary_properties_data('DerivedCoreProperties')
        self.load_binary_properties_data('PropList')
        # Must come last: it reads the 'gc' and binary property data.
        self.load_others()

    def _level_1_term(self, name, uset, negated_per_10):
        """Return one grepcase for \\p{name}, or \\P{name} when negated.

        The term is negated when randint(1, 10) <= negated_per_10; the
        expected count is then taken from the complement of ``uset``.
        """
        lbl = 'p'
        if randint(1, 10) <= negated_per_10:
            uset = uset_complement(uset)
            lbl = 'P'
        count = uset_popcount(uset_intersection(self.all_good_set, uset))
        regexp = r"^\%s{%s}$" % (lbl, name)
        return self._GREPCASE % (regexp, count)

    def generate_level_1_property_terms(self, negated_per_10 = 5, propgroups=('others', 'sc', 'scx', 'gc')):
        """Return grepcase strings testing single property terms.

        negated_per_10 -- out of ten terms, roughly how many are emitted
            in negated form (\\P{...}); 0 disables and 10 forces negation.
        propgroups -- any container of group names chosen from 'others',
            'gc', 'sc' and 'scx'.  Groups are always emitted in that
            order, whatever the order inside ``propgroups``.

        The 'others' group requires load_others() to have been called.
        """
        terms = []
        if 'others' in propgroups:
            for p in self.others:
                terms.append(self._level_1_term(p, self.binary_value_map[p], negated_per_10))
        if 'gc' in propgroups:
            for v in self.property_value_list['gc']:
                terms.append(self._level_1_term(v, self.enum_value_map['gc'][v], negated_per_10))
        if 'sc' in propgroups:
            for v in self.property_value_list['sc']:
                s = self.enum_value_map['sc'][v]
                vname = self.property_value_full_name_map['sc'][v]
                terms.append(self._level_1_term(vname, s, negated_per_10))
        if 'scx' in propgroups:
            # Script_Extensions uses the Script value names, prefixed "scx=".
            for v in self.property_value_list['sc']:
                s = self.enum_value_map['scx'][v]
                vname = self.property_value_full_name_map['sc'][v]
                terms.append(self._level_1_term("scx=" + vname, s, negated_per_10))
        return terms

    def _operand_set(self, prop, value):
        """Return the code point set for a (property, value) operand.

        The pseudo-property 'others' selects from ``binary_value_map``;
        anything else selects from ``enum_value_map``.
        """
        if prop == 'others':
            return self.binary_value_map[value]
        return self.enum_value_map[prop][value]

    def _operand_name(self, prop, value):
        """Return the text placed inside \\p{...} for an operand."""
        name = value
        if prop == 'sc' or prop == 'scx':
            name = self.property_value_full_name_map['sc'][value]
        if prop == 'scx':
            name = 'scx=' + name
        return name

    def random_binary(self, a1, a2, useLookbehindAssertions = False):
        """Return a grepcase combining two property terms at random.

        a1, a2 -- (property, value) pairs; property is 'gc', 'sc', 'scx',
            another key of ``enum_value_map``, or 'others' for a name in
            ``binary_value_map``.
        useLookbehindAssertions -- when true, intersection and difference
            are written as ``A(?<=B)`` and ``A(?<!B)`` instead of the
            bracket forms ``[A&&B]`` and ``[A--B]``.  Union is ``[AB]``
            in both modes.

        The operator is drawn uniformly from intersection, difference
        and union.

        NOTE(review): the regexp is placed in the attribute value without
        XML escaping ('&&', '<'); confirm the consumer of this output
        accepts that.
        """
        (p1, t1) = a1
        (p2, t2) = a2
        op = randint(0,2)
        s1 = self._operand_set(p1, t1)
        s2 = self._operand_set(p2, t2)
        # op: 0 = intersection, 1 = difference, 2 = union
        combine = (uset_intersection, uset_difference, uset_union)[op]
        s3 = uset_intersection(combine(s1, s2), self.all_good_set)
        v1 = "\\p{%s}" % (self._operand_name(p1, t1))
        v2 = "\\p{%s}" % (self._operand_name(p2, t2))
        if useLookbehindAssertions:
            patterns = ("^%s(?<=%s)$", "^%s(?<!%s)$", "^[%s%s]$")
        else:
            patterns = ("^[%s&&%s]$", "^[%s--%s]$", "^[%s%s]$")
        regexp = patterns[op] % (v1, v2)
        return self._GREPCASE % (regexp, uset_popcount(s3))

    def generate_random_property_expressions(self, useLookbehindAssertions = False):
        """Return grepcases pairing gc and sc values at random.

        One expression is produced for every General_Category value
        (combined with a random Script value), then one for every Script
        value (combined with a random General_Category value).

        Only gc/sc pairings are generated.  random_binary() also accepts
        'scx' and 'others' operands should further pairings be wanted.
        """
        gc = self.property_value_list['gc']
        sc = self.property_value_list['sc']
        exprs = []
        for p in gc:
            s = sc[randint(0, len(sc)-1)]
            exprs.append(self.random_binary(('gc', p), ('sc', s), useLookbehindAssertions))
        for p in sc:
            g = gc[randint(0, len(gc)-1)]
            exprs.append(self.random_binary(('sc', p), ('gc', g), useLookbehindAssertions))
        return exprs
179 | |
---|
def UCD_main():
    """Load the UCD data and print a <greptest> document to stdout.

    The document contains single-term tests for the 'sc' and 'gc'
    property groups (negated about one time in ten), followed by random
    two-term expressions written with lookbehind assertions.
    """
    ucd = UCD_test_generator()
    ucd.load_all()
    # Single-argument print(...) produces the same output whether it is
    # parsed as a Python 2 print statement or a Python 3 function call.
    print("<greptest>")
    for t in ucd.generate_level_1_property_terms(1, ['sc', 'gc']):
        print(t)
    for p in ucd.generate_random_property_expressions(True):
        print(p)
    print("</greptest>")
189 | |
---|
if __name__ == "__main__":
    # Script entry point: select the UCD data directory first, because
    # the parsers invoked by UCD_main() are run after this call.
    set_UCD_dir('UCD/8.0.0')
    UCD_main()