Changeset 4177
 Timestamp:
 Sep 17, 2014, 3:28:25 PM (5 years ago)
 File:

 1 edited
Legend:
 Unmodified
 Added
 Removed

proto/charsetcompiler/UCD/unicode_set.py
r4176 r4177 13 13 # 14 14 # The Unicode Sparse Bitset representation is based on 15 # (a) Dividing the Unicode codepoint space into groups of 64codepoints called quads.15 # (a) Dividing the Unicode codepoint space into groups of 2^k codepoints called quads. 16 16 # (b) Specifying the quads using a runlength encoding, in which each run 17 17 # is Empty (quads contain no members), Mixed (quads contain some members and … … 86 86 87 87 88 # helper89 def advance_run_list(r, n):90 (runtype, runlength) = r[0]91 if runlength > n: return [(runtype, runlength  n)] + r[1:]92 elif runlength == n: return r[1:]93 else: return advance_run_list(r[1:], n  runlength)94 95 96 88 # 97 89 # Set Operations … … 99 91 def empty_set(log2_quad_bits = default_log_2_quad_bits): 100 92 e = UCset(log2_quad_bits) 101 e.runs = [(Empty, UnicodeQuadCount)]93 e.runs = [(Empty, e.UnicodeQuadCount)] 102 94 e.quads = [] 103 e.quad_count = UnicodeQuadCount95 e.quad_count = e.UnicodeQuadCount 104 96 return e 105 97 … … 133 125 134 126 127 class Uset_Iterator: 128 def __init__(self, uSet): 129 self.uSet = uSet 130 self.run_no = 0 131 self.offset = 0 132 self.quad_no = 0 133 def at_end(self): 134 return self.run_no == len(self.uSet.runs) 135 def current_run(self): 136 (this_run_type, this_run_length) = self.uSet.runs[self.run_no] 137 return (this_run_type, this_run_length  self.offset) 138 def get_quad(self): 139 (this_run_type, this_run_length) = self.uSet.runs[self.run_no] 140 if this_run_type == Empty: return 0 141 elif this_run_type == Full: return self.uSet.FullQuadMask 142 else: return self.uSet.quads[self.quad_no] 143 def advance(self, n): 144 while n > 0: 145 (this_run_type, this_run_length) = self.uSet.runs[self.run_no] 146 remain = this_run_length  self.offset 147 if remain > n: 148 self.offset += n 149 if this_run_type == Mixed: self.quad_no += n 150 n = 0 151 elif remain == n: 152 self.run_no += 1 153 self.offset = 0 154 if this_run_type == Mixed: self.quad_no += n 155 n = 0 156 else: 157 self.run_no += 1 158 self.offset = 0 159 if this_run_type == Mixed: self.quad_no += remain 160 n = remain 161 135 162 def complement (s): 136 163 assert s.quad_count == s.UnicodeQuadCount 137 164 iset = UCset(s.log2_quad_bits) 165 it = Uset_Iterator(s) 138 166 R = s.runs 139 167 Q = s.quads 140 while R != []:141 (runtype, n) = R[0]168 while not it.at_end(): 169 (runtype, n) = it.current_run() 142 170 if runtype == Empty: 143 171 iset.append_run(Full, n) 172 it.advance(n) 144 173 elif runtype == Full: 145 174 iset.append_run(Empty, n) 175 it.advance(n) 146 176 else: 147 iset.append_mixed_run(n, [s.FullQuadMask ^ q for q in Q[0:n]]) 148 Q = Q[n:] 149 R = advance_run_list(R, n) 150 return iset 177 for i in range(n): 178 iset.append_quad(s.FullQuadMask ^ it.get_quad()) 179 it.advance(1) 180 return iset 181 151 182 152 183 def intersect (s1, s2): … … 154 185 assert s2.quad_count == s1.UnicodeQuadCount 155 186 iset = UCset() 156 r1 = s1.runs 157 q1 = s1.quads 158 r2 = s2.runs 159 q2 = s2.quads 160 while r1 != []: 161 (s1_type, s1_length) = r1[0] 162 (s2_type, s2_length) = r2[0] 187 i1 = Uset_Iterator(s1) 188 i2 = Uset_Iterator(s2) 189 while not i1.at_end(): 190 (s1_type, s1_length) = i1.current_run() 191 (s2_type, s2_length) = i2.current_run() 163 192 n = min(s1_length, s2_length) 164 193 if s1_type == Empty or s2_type == Empty: 165 194 iset.append_run(Empty, n) 195 i1.advance(n) 196 i2.advance(n) 166 197 elif s1_type == Full and s2_type == Full: 167 198 iset.append_run(Full, n) 199 i1.advance(n) 200 i2.advance(n) 168 201 elif s1_type == Full: 169 iset.append_mixed_run(n, q2[0:n]) 170 q2 = q2[n:] 202 for i in range(n): 203 iset.append_quad(i2.get_quad()) 204 i2.advance(1) 205 i1.advance(n) 171 206 elif s2_type == Full: 172 iset.append_mixed_run(n, q1[0:n]) 173 q1 = q1[n:] 207 for i in range(n): 208 iset.append_quad(i1.get_quad()) 209 i1.advance(1) 210 i2.advance(n) 174 211 else: # both s1 and s2 have mixed blocks; form blockbyblock intersection 175 212 for i in range(n): 176 iset.append_quad(q1[i] & q2[i]) 177 q1 = q1[n:] 178 q2 = q2[n:] 179 r1 = advance_run_list(r1, n) 180 r2 = advance_run_list(r2, n) 213 iset.append_quad(i1.get_quad() & i2.get_quad()) 214 i1.advance(1) 215 i2.advance(1) 181 216 return iset 182 217 … … 185 220 assert s2.quad_count == s1.UnicodeQuadCount 186 221 iset = UCset() 187 r1 = s1.runs 188 q1 = s1.quads 189 r2 = s2.runs 190 q2 = s2.quads 191 while r1 != []: 192 (s1_type, s1_length) = r1[0] 193 (s2_type, s2_length) = r2[0] 222 i1 = Uset_Iterator(s1) 223 i2 = Uset_Iterator(s2) 224 while not i1.at_end(): 225 (s1_type, s1_length) = i1.current_run() 226 (s2_type, s2_length) = i2.current_run() 194 227 n = min(s1_length, s2_length) 195 228 if s1_type == Empty and s2_type == Empty: 196 229 iset.append_run(Empty, n) 230 i1.advance(n) 231 i2.advance(n) 197 232 elif s1_type == Full or s2_type == Full: 198 233 iset.append_run(Full, n) 234 i1.advance(n) 235 i2.advance(n) 199 236 elif s1_type == Empty: 200 iset.append_mixed_run(n, q2[0:n]) 201 q2 = q2[n:] 237 for i in range(n): 238 iset.append_quad(i2.get_quad()) 239 i2.advance(1) 240 i1.advance(n) 202 241 elif s2_type == Empty: 203 iset.append_mixed_run(n, q1[0:n]) 204 q1 = q1[n:] 242 for i in range(n): 243 iset.append_quad(i1.get_quad()) 244 i1.advance(1) 245 i2.advance(n) 205 246 else: # both s1 and s2 have mixed blocks; form blockbyblock union 206 247 for i in range(n): 207 iset.append_quad(q1[i]  q2[i]) 208 q1 = q1[n:] 209 q2 = q2[n:] 210 r1 = advance_run_list(r1, n) 211 r2 = advance_run_list(r2, n) 248 iset.append_quad(i1.get_quad()  i2.get_quad()) 249 i1.advance(1) 250 i2.advance(1) 212 251 return iset 213 252 … … 216 255 assert s2.quad_count == s1.UnicodeQuadCount 217 256 iset = UCset() 218 r1 = s1.runs 219 q1 = s1.quads 220 r2 = s2.runs 221 q2 = s2.quads 222 while r1 != []: 223 (s1_type, s1_length) = r1[0] 224 (s2_type, s2_length) = r2[0] 257 i1 = Uset_Iterator(s1) 258 i2 = Uset_Iterator(s2) 259 while not i1.at_end(): 260 (s1_type, s1_length) = i1.current_run() 261 (s2_type, s2_length) = i2.current_run() 225 262 n = min(s1_length, s2_length) 226 263 if s1_type == Empty or s2_type == Full: 227 264 iset.append_run(Empty, n) 265 i1.advance(n) 266 i2.advance(n) 228 267 elif s1_type == Full and s2_type == Empty: 229 268 iset.append_run(Full, n) 269 i1.advance(n) 270 i2.advance(n) 271 elif s1_type == Full: 272 for i in range(n): 273 iset.append_quad(iset.FullQuadMask ^ i2.get_quad()) 274 i2.advance(1) 275 i1.advance(n) 230 276 elif s2_type == Empty: 231 iset.append_mixed_run(n, q1[0:n]) 232 q1 = q1[n:] 233 elif s1_type == Full: 234 iset.append_mixed_run(n, [s1.FullQuadMask ^ q for q in q2[0:n]]) 235 q2 = q2[n:] 236 else: # both s1 and s2 have mixed blocks; form blockbyblock difference 237 for i in range(n): 238 iset.append_quad(q1[i] &~ q2[i]) 239 q1 = q1[n:] 240 q2 = q2[n:] 241 r1 = advance_run_list(r1, n) 242 r2 = advance_run_list(r2, n) 243 return iset 244 277 for i in range(n): 278 iset.append_quad(i1.get_quad()) 279 i1.advance(1) 280 i2.advance(n) 281 else: # both s1 and s2 have mixed blocks; form blockbyblock union 282 for i in range(n): 283 iset.append_quad(i1.get_quad() &~ i2.get_quad()) 284 i1.advance(1) 285 i2.advance(1) 286 return iset 245 287 246 288 def symmetric_difference (s1, s2): … … 248 290 assert s2.quad_count == s1.UnicodeQuadCount 249 291 iset = UCset() 250 r1 = s1.runs 251 q1 = s1.quads 252 r2 = s2.runs 253 q2 = s2.quads 254 while r1 != []: 255 (s1_type, s1_length) = r1[0] 256 (s2_type, s2_length) = r2[0] 292 i1 = Uset_Iterator(s1) 293 i2 = Uset_Iterator(s2) 294 while not i1.at_end(): 295 (s1_type, s1_length) = i1.current_run() 296 (s2_type, s2_length) = i2.current_run() 257 297 n = min(s1_length, s2_length) 258 298 if s1_type == Empty and s2_type == Full or s1_type == Full and s2_type == Empty: … … 261 301 iset.append_run(Empty, n) 262 302 elif s1_type == Empty: 263 iset.append_mixed_run(n, q2[0:n]) 264 q2 = q2[n:] 303 for i in range(n): 304 iset.append_quad(i2.get_quad()) 305 i2.advance(1) 306 i1.advance(n) 265 307 elif s2_type == Empty: 266 iset.append_mixed_run(n, q1[0:n]) 267 q1 = q1[n:] 308 for i in range(n): 309 iset.append_quad(i1.get_quad()) 310 i1.advance(1) 311 i2.advance(n) 268 312 elif s1_type == Full: 269 iset.append_mixed_run(n, [s1.FullQuadMask ^ q for q in q2[0:n]]) 270 q2 = q2[n:] 313 for i in range(n): 314 iset.append_quad(iset.FullQuadMask ^ i2.get_quad()) 315 i2.advance(1) 316 i1.advance(n) 271 317 elif s2_type == Full: 272 iset.append_mixed_run(n, [s1.FullQuadMask ^ q for q in q1[0:n]])273 q1 = q1[n:]274 else: # both s1 and s2 have mixed blocks; form blockbyblock symmetric difference275 for i in range(n):276 iset.append_quad(q1[i] ^ q2[i])277 q1 = q1[n:]278 q2 = q2[n:]279 r1 = advance_run_list(r1, n)280 r2 = advance_run_list(r2, n)318 for i in range(n): 319 iset.append_quad(iset.FullQuadMask ^ i1.get_quad()) 320 i1.advance(1) 321 i2.advance(n) 322 else: # both s1 and s2 have mixed blocks; form blockbyblock union 323 for i in range(n): 324 iset.append_quad(i1.get_quad() ^ i2.get_quad()) 325 i1.advance(1) 326 i2.advance(1) 281 327 return iset 282 328 … … 309 355 310 356 311 312
Note: See TracChangeset
for help on using the changeset viewer.