Ignore:
Timestamp:
Sep 17, 2014, 3:28:25 PM (5 years ago)
Author:
cameron
Message:

Introduce Uset_Iterator objects

File:
1 edited

Legend:

Unmodified
Added
Removed
  • proto/charsetcompiler/UCD/unicode_set.py

    r4176 r4177  
    1313#
    1414# The Unicode Sparse Bitset representation is based on
    15 # (a) Dividing the Unicode codepoint space into groups of 64 codepoints called quads.
     15# (a) Dividing the Unicode codepoint space into groups of 2^k codepoints called quads.
    1616# (b) Specifying the quads using a run-length encoding, in which each run
    1717#     is Empty (quads contain no members), Mixed (quads contain some members and
     
    8686
    8787
    88 # helper
    89 def advance_run_list(r, n):
    90    (runtype, runlength) = r[0]
    91    if runlength > n: return  [(runtype, runlength - n)] + r[1:]
    92    elif runlength == n: return r[1:]
    93    else: return advance_run_list(r[1:], n - runlength)
    94    
    95 
    9688#
    9789# Set Operations
     
    9991def empty_set(log2_quad_bits = default_log_2_quad_bits):
    10092   e = UCset(log2_quad_bits)
    101    e.runs = [(Empty, UnicodeQuadCount)]
     93   e.runs = [(Empty, e.UnicodeQuadCount)]
    10294   e.quads = []
    103    e.quad_count = UnicodeQuadCount
     95   e.quad_count = e.UnicodeQuadCount
    10496   return e
    10597
     
    133125
    134126
     127class Uset_Iterator:
     128    def __init__(self, uSet):
     129        self.uSet = uSet
     130        self.run_no = 0
     131        self.offset = 0
     132        self.quad_no = 0
     133    def at_end(self):
     134        return self.run_no == len(self.uSet.runs)
     135    def current_run(self):
     136        (this_run_type, this_run_length) = self.uSet.runs[self.run_no]
     137        return (this_run_type, this_run_length - self.offset)
     138    def get_quad(self):
     139        (this_run_type, this_run_length) = self.uSet.runs[self.run_no]
     140        if this_run_type == Empty: return 0
     141        elif this_run_type == Full: return self.uSet.FullQuadMask
     142        else: return self.uSet.quads[self.quad_no]
     143    def advance(self, n):
     144        while n > 0:
     145           (this_run_type, this_run_length) = self.uSet.runs[self.run_no]
     146           remain = this_run_length - self.offset
     147           if remain > n:
     148               self.offset += n
     149               if this_run_type == Mixed: self.quad_no += n
     150               n = 0
     151           elif remain == n:
     152               self.run_no += 1
     153               self.offset = 0
     154               if this_run_type == Mixed: self.quad_no += n
     155               n = 0
     156           else:
     157               self.run_no += 1
     158               self.offset = 0
     159               if this_run_type == Mixed: self.quad_no += remain
     160               n -= remain
     161 
    135162def complement (s):
    136163   assert s.quad_count == s.UnicodeQuadCount
    137164   iset = UCset(s.log2_quad_bits)
     165   it = Uset_Iterator(s)
    138166   R = s.runs
    139167   Q = s.quads
    140    while R != []:
    141       (runtype, n) = R[0]
     168   while not it.at_end():
     169      (runtype, n) = it.current_run()
    142170      if runtype == Empty:
    143171         iset.append_run(Full, n)
     172         it.advance(n)
    144173      elif runtype == Full:
    145174         iset.append_run(Empty, n)
     175         it.advance(n)
    146176      else:
    147          iset.append_mixed_run(n, [s.FullQuadMask ^ q for q in Q[0:n]])
    148          Q = Q[n:]
    149       R = advance_run_list(R, n)
    150    return iset
     177         for i in range(n):
     178            iset.append_quad(s.FullQuadMask ^ it.get_quad())
     179            it.advance(1)
     180   return iset
     181
    151182
    152183def intersect (s1, s2):
     
    154185   assert s2.quad_count == s1.UnicodeQuadCount
    155186   iset = UCset()
    156    r1 = s1.runs
    157    q1 = s1.quads
    158    r2 = s2.runs
    159    q2 = s2.quads
    160    while r1 != []:
    161       (s1_type, s1_length) = r1[0]
    162       (s2_type, s2_length) = r2[0]
     187   i1 = Uset_Iterator(s1)
     188   i2 = Uset_Iterator(s2)
     189   while not i1.at_end():
     190      (s1_type, s1_length) = i1.current_run()
     191      (s2_type, s2_length) = i2.current_run()
    163192      n = min(s1_length, s2_length)
    164193      if s1_type == Empty or s2_type == Empty:
    165194         iset.append_run(Empty, n)
     195         i1.advance(n)
     196         i2.advance(n)
    166197      elif s1_type == Full and s2_type == Full:
    167198         iset.append_run(Full, n)
     199         i1.advance(n)
     200         i2.advance(n)
    168201      elif s1_type == Full:
    169          iset.append_mixed_run(n, q2[0:n])
    170          q2 = q2[n:]
     202         for i in range(n):
     203            iset.append_quad(i2.get_quad())
     204            i2.advance(1)
     205         i1.advance(n)
    171206      elif s2_type == Full:
    172          iset.append_mixed_run(n, q1[0:n])
    173          q1 = q1[n:]
     207         for i in range(n):
     208            iset.append_quad(i1.get_quad())
     209            i1.advance(1)
     210         i2.advance(n)
    174211      else: # both s1 and s2 have mixed blocks; form block-by-block intersection
    175212         for i in range(n):
    176            iset.append_quad(q1[i] & q2[i])
    177          q1 = q1[n:]
    178          q2 = q2[n:]
    179       r1 = advance_run_list(r1, n)
    180       r2 = advance_run_list(r2, n)
     213            iset.append_quad(i1.get_quad() & i2.get_quad())
     214            i1.advance(1)
     215            i2.advance(1)
    181216   return iset
    182217
     
    185220   assert s2.quad_count == s1.UnicodeQuadCount
    186221   iset = UCset()
    187    r1 = s1.runs
    188    q1 = s1.quads
    189    r2 = s2.runs
    190    q2 = s2.quads
    191    while r1 != []:
    192       (s1_type, s1_length) = r1[0]
    193       (s2_type, s2_length) = r2[0]
     222   i1 = Uset_Iterator(s1)
     223   i2 = Uset_Iterator(s2)
     224   while not i1.at_end():
     225      (s1_type, s1_length) = i1.current_run()
     226      (s2_type, s2_length) = i2.current_run()
    194227      n = min(s1_length, s2_length)
    195228      if s1_type == Empty and s2_type == Empty:
    196229         iset.append_run(Empty, n)
     230         i1.advance(n)
     231         i2.advance(n)
    197232      elif s1_type == Full or s2_type == Full:
    198233         iset.append_run(Full, n)
     234         i1.advance(n)
     235         i2.advance(n)
    199236      elif s1_type == Empty:
    200          iset.append_mixed_run(n, q2[0:n])
    201          q2 = q2[n:]
     237         for i in range(n):
     238            iset.append_quad(i2.get_quad())
     239            i2.advance(1)
     240         i1.advance(n)
    202241      elif s2_type == Empty:
    203          iset.append_mixed_run(n, q1[0:n])
    204          q1 = q1[n:]
     242         for i in range(n):
     243            iset.append_quad(i1.get_quad())
     244            i1.advance(1)
     245         i2.advance(n)
    205246      else: # both s1 and s2 have mixed blocks; form block-by-block union
    206247         for i in range(n):
    207            iset.append_quad(q1[i] | q2[i])
    208          q1 = q1[n:]
    209          q2 = q2[n:]
    210       r1 = advance_run_list(r1, n)
    211       r2 = advance_run_list(r2, n)
     248            iset.append_quad(i1.get_quad() | i2.get_quad())
     249            i1.advance(1)
     250            i2.advance(1)
    212251   return iset
    213252
     
    216255   assert s2.quad_count == s1.UnicodeQuadCount
    217256   iset = UCset()
    218    r1 = s1.runs
    219    q1 = s1.quads
    220    r2 = s2.runs
    221    q2 = s2.quads
    222    while r1 != []:
    223       (s1_type, s1_length) = r1[0]
    224       (s2_type, s2_length) = r2[0]
     257   i1 = Uset_Iterator(s1)
     258   i2 = Uset_Iterator(s2)
     259   while not i1.at_end():
     260      (s1_type, s1_length) = i1.current_run()
     261      (s2_type, s2_length) = i2.current_run()
    225262      n = min(s1_length, s2_length)
    226263      if s1_type == Empty or s2_type == Full:
    227264         iset.append_run(Empty, n)
     265         i1.advance(n)
     266         i2.advance(n)
    228267      elif s1_type == Full and s2_type == Empty:
    229268         iset.append_run(Full, n)
     269         i1.advance(n)
     270         i2.advance(n)
     271      elif s1_type == Full:
     272         for i in range(n):
     273            iset.append_quad(iset.FullQuadMask ^ i2.get_quad())
     274            i2.advance(1)
     275         i1.advance(n)
    230276      elif s2_type == Empty:
    231          iset.append_mixed_run(n, q1[0:n])
    232          q1 = q1[n:]
    233       elif s1_type == Full:
    234          iset.append_mixed_run(n, [s1.FullQuadMask ^ q for q in q2[0:n]])
    235          q2 = q2[n:]
    236       else: # both s1 and s2 have mixed blocks; form block-by-block difference
    237          for i in range(n):
    238            iset.append_quad(q1[i] &~ q2[i])
    239          q1 = q1[n:]
    240          q2 = q2[n:]
    241       r1 = advance_run_list(r1, n)
    242       r2 = advance_run_list(r2, n)
    243    return iset
    244 
     277         for i in range(n):
     278            iset.append_quad(i1.get_quad())
     279            i1.advance(1)
     280         i2.advance(n)
     281      else: # both s1 and s2 have mixed blocks; form block-by-block union
     282         for i in range(n):
     283            iset.append_quad(i1.get_quad() &~ i2.get_quad())
     284            i1.advance(1)
     285            i2.advance(1)
     286   return iset
    245287
    246288def symmetric_difference (s1, s2):
     
    248290   assert s2.quad_count == s1.UnicodeQuadCount
    249291   iset = UCset()
    250    r1 = s1.runs
    251    q1 = s1.quads
    252    r2 = s2.runs
    253    q2 = s2.quads
    254    while r1 != []:
    255       (s1_type, s1_length) = r1[0]
    256       (s2_type, s2_length) = r2[0]
     292   i1 = Uset_Iterator(s1)
     293   i2 = Uset_Iterator(s2)
     294   while not i1.at_end():
     295      (s1_type, s1_length) = i1.current_run()
     296      (s2_type, s2_length) = i2.current_run()
    257297      n = min(s1_length, s2_length)
    258298      if s1_type == Empty and s2_type == Full or s1_type == Full and s2_type == Empty:
     
    261301         iset.append_run(Empty, n)
    262302      elif s1_type == Empty:
    263          iset.append_mixed_run(n, q2[0:n])
    264          q2 = q2[n:]
     303         for i in range(n):
     304            iset.append_quad(i2.get_quad())
     305            i2.advance(1)
     306         i1.advance(n)
    265307      elif s2_type == Empty:
    266          iset.append_mixed_run(n, q1[0:n])
    267          q1 = q1[n:]
     308         for i in range(n):
     309            iset.append_quad(i1.get_quad())
     310            i1.advance(1)
     311         i2.advance(n)
    268312      elif s1_type == Full:
    269          iset.append_mixed_run(n, [s1.FullQuadMask ^ q for q in q2[0:n]])
    270          q2 = q2[n:]
     313         for i in range(n):
     314            iset.append_quad(iset.FullQuadMask ^ i2.get_quad())
     315            i2.advance(1)
     316         i1.advance(n)
    271317      elif s2_type == Full:
    272          iset.append_mixed_run(n, [s1.FullQuadMask ^ q for q in q1[0:n]])
    273          q1 = q1[n:]
    274       else: # both s1 and s2 have mixed blocks; form block-by-block symmetric difference
    275          for i in range(n):
    276            iset.append_quad(q1[i] ^ q2[i])
    277          q1 = q1[n:]
    278          q2 = q2[n:]
    279       r1 = advance_run_list(r1, n)
    280       r2 = advance_run_list(r2, n)
     318         for i in range(n):
     319            iset.append_quad(iset.FullQuadMask ^ i1.get_quad())
     320            i1.advance(1)
     321         i2.advance(n)
     322      else: # both s1 and s2 have mixed blocks; form block-by-block union
     323         for i in range(n):
     324            iset.append_quad(i1.get_quad() ^ i2.get_quad())
     325            i1.advance(1)
     326            i2.advance(1)
    281327   return iset
    282328
     
    309355
    310356
    311 
    312 
Note: See TracChangeset for help on using the changeset viewer.