#!/usr/bin/python
# -*- coding: utf-8 -*-

# u8u16.py
#
# Python prototype implementation
# Robert D. Cameron
# Revised October 29, 2009 - Make consistent with character compiler UTF8 definition variable names.
#
#----------------------------------------------------------------------------
#
# We use Python's unlimited precision integers for unbounded bit streams.
# This permits simple logical operations on the entire stream.
# Assumption: bitstreams are little-endian (e.g., as on x86).
#
#----------------------------------------------------------------------------
#
| 18 | |
---|
| 19 | import bitutil |
---|
| 20 | |
---|
| 21 | import byteclass |
---|
| 22 | |
---|
def ShiftBack(stream):
    """Shift a bit stream one position backward.

    The stream is an integer; the result is ``stream >> 1``, i.e. bit 0 is
    dropped and every other bit moves one position lower.
    """
    shifted = stream >> 1
    return shifted
---|
| 25 | |
---|
def validate_utf8(u8):
    """Attach the scope streams and the validation error stream to *u8*.

    Sets ``scope22``, ``scope32``, ``scope33``, ``scope42``, ``scope43`` and
    ``scope44`` on *u8* by applying ``bitutil.Advance`` to the prefix streams
    and then to each preceding scope stream.  ``u8.error`` is then set to the
    union of:

    * ``u8.badprefix`` (the original notes say C0-C1 and F5-FF are illegal);
    * the advanced E0 / ED / F0 / F4 streams ANDed with the second-byte
      ranges ``x80_x9F`` / ``xA0_xBF`` / ``x80_x8F`` / ``x90_xBF``;
    * every position where the combined scope streams and ``u8.suffix``
      differ (XOR).

    NOTE(review): ``bitutil.Advance`` is assumed to move each marker to the
    following byte position -- confirm against bitutil.

    Returns the same *u8* object that was passed in.
    """
    advance = bitutil.Advance

    # Scope chains: each stream is the previous one advanced once more.
    u8.scope22 = advance(u8.prefix2)

    three_second = advance(u8.prefix3)
    u8.scope32 = three_second
    u8.scope33 = advance(three_second)

    four_second = advance(u8.prefix4)
    four_third = advance(four_second)
    u8.scope42 = four_second
    u8.scope43 = four_third
    u8.scope44 = advance(four_third)

    final_scope = u8.scope22 | u8.scope33 | u8.scope44
    any_scope = final_scope | u8.scope32 | u8.scope42 | u8.scope43

    # C0-C1 and F5-FF are illegal
    errors = u8.badprefix

    # Lead byte paired with the second-byte range that must not follow it.
    restricted_pairs = (
        (u8.xE0, u8.x80_x9F),
        (u8.xED, u8.xA0_xBF),
        (u8.xF0, u8.x80_x8F),
        (u8.xF4, u8.x90_xBF),
    )
    for lead, bad_second in restricted_pairs:
        errors = errors | (advance(lead) & bad_second)

    # A scope position without a suffix byte, or a suffix byte outside any
    # scope, is an error.
    u8.error = errors | (any_scope ^ u8.suffix)

    return u8
---|
| 49 | |
---|
#
# The following calculation of UTF-16 bit streams is consistent
# with the original u8u16, calculating streams at u8scope42 and
# u8scope44 positions.
#
| 55 | |
---|
def u8_u16_old(u8, u8bit):
    """Compute UTF-16 bit streams, placing surrogate bits at scope42/scope44.

    *u8* supplies the classification and scope streams (``unibyte``,
    ``prefix``, ``scope22`` ... ``scope44``); *u8bit* is the indexable
    collection of UTF-8 bit streams (indices 1-7 are read here).

    Returns ``(u16hi, u16lo, delmask)``: two 8-element lists of bit streams
    for the high and low UTF-16 bytes, and
    ``delmask = u8.prefix | u8.scope32 | u8.scope43``.  ``main`` passes a mask
    of this kind to ``bitutil.filter_bytes``.
    """
    advance = bitutil.Advance

    scope33 = u8.scope33
    scope42 = u8.scope42
    scope44 = u8.scope44
    final_scope = u8.scope22 | scope33 | scope44
    final_byte = u8.unibyte | final_scope

    # Low byte: bits 0-1 come from the preceding byte's bits 6-7 (advanced),
    # bits 2-7 are taken directly at the final byte of each sequence.
    lo = [
        final_scope & advance(u8bit[6]),
        (u8.unibyte & u8bit[1]) | (final_scope & advance(u8bit[7])),
    ]
    lo.extend(final_byte & u8bit[i] for i in range(2, 8))

    # High byte: bits 0-3 use doubly advanced bits 4-7 (scope33 only),
    # bit 4 uses advanced bit 2 (scope33 only), bits 5-7 use advanced
    # bits 3-5 at every final scope position.
    hi = [scope33 & advance(advance(u8bit[i])) for i in range(4, 8)]
    hi.append(scope33 & advance(u8bit[2]))
    hi.extend(final_scope & advance(u8bit[i]) for i in range(3, 6))

    # Constant bits set at both surrogate positions; bit 5 only at scope44.
    surrogate = scope42 | scope44
    for i in (0, 1, 3, 4):
        hi[i] |= surrogate
    hi[5] |= scope44

    # First surrogate code unit at scope42: the original notes describe
    # this chain as "subtract 1" with borrows carried upward.
    inv_bit3 = ~u8bit[3]                      # subtract 1
    lo[1] |= scope42 & inv_bit3
    lo[0] |= scope42 & (u8bit[2] ^ inv_bit3)  # borrow
    first_borrow = inv_bit3 & ~u8bit[2]
    hi[7] |= scope42 & (advance(u8bit[7]) ^ first_borrow)
    second_borrow = first_borrow & ~advance(u8bit[7])
    hi[6] |= scope42 & (advance(u8bit[6]) ^ second_borrow)

    # Remaining low-byte bits of the first surrogate: bits 4-7 at the
    # scope42 position, then bits 2-3 pulled back one position.
    for offset in range(4):
        lo[2 + offset] |= scope42 & u8bit[4 + offset]
    lo[6] |= scope42 & ShiftBack(u8bit[2])
    lo[7] |= scope42 & ShiftBack(u8bit[3])

    delmask = u8.prefix | u8.scope32 | u8.scope43
    return (hi, lo, delmask)
---|
| 107 | |
---|
| 108 | |
---|
| 109 | |
---|
| 110 | |
---|
#
# The following calculation of UTF-16 bit streams uses the
# u8scope43 position rather than the u8scope42 position for
# the bits of the first UTF-16 code unit of a surrogate pair.
# This requires more shifting than with the use of u8scope42,
# but has the advantage that all shifts are in the forward
# direction only and can hence be implemented using addition
# on a little-endian architecture.
#
| 120 | |
---|
def u8u16(u8, u8bit):
    """Compute UTF-16 bit streams, placing surrogate bits at scope43/scope44.

    *u8* supplies the classification and scope streams (``unibyte``,
    ``prefix``, ``scope22`` ... ``scope44``); *u8bit* is the indexable
    collection of UTF-8 bit streams (indices 1-7 are read here).

    Unlike ``u8_u16_old`` the bits of the first surrogate code unit are
    produced at the scope43 position, so every shift goes through
    ``bitutil.Advance`` and no backward shift is needed.

    Returns ``(u16hi, u16lo, delmask)``: two 8-element lists of bit streams
    for the high and low UTF-16 bytes, and
    ``delmask = u8.prefix | u8.scope32 | u8.scope42``.
    """
    advance = bitutil.Advance

    scope33 = u8.scope33
    scope43 = u8.scope43
    scope44 = u8.scope44
    final_scope = u8.scope22 | scope33 | scope44
    final_byte = u8.unibyte | final_scope

    hi = [0] * 8
    lo = [0] * 8

    # Low byte at the final byte of every sequence.
    for i in range(2, 8):
        lo[i] = final_byte & u8bit[i]
    lo[1] = (u8.unibyte & u8bit[1]) | (final_scope & advance(u8bit[7]))
    lo[0] = final_scope & advance(u8bit[6])

    # High byte: bits 5-7 from advanced bits 3-5, bits 0-3 from doubly
    # advanced bits 4-7 (scope33 only), bit 4 from advanced bit 2.
    for i in (5, 6, 7):
        hi[i] = final_scope & advance(u8bit[i - 2])
    for i in range(4):
        hi[i] = scope33 & advance(advance(u8bit[i + 4]))
    hi[4] = scope33 & advance(u8bit[2])

    # Constant bits set at both surrogate positions; bit 5 only at scope44.
    surrogate = scope43 | scope44
    for i in (0, 1, 3, 4):
        hi[i] |= surrogate
    hi[5] |= scope44

    # First surrogate code unit: same "subtract 1" / borrow chain as the
    # scope42 formulation, with each result advanced once to land on scope43.
    inv_bit3 = ~u8bit[3]                                # subtract 1
    lo[1] |= scope43 & advance(inv_bit3)
    lo[0] |= scope43 & advance(u8bit[2] ^ inv_bit3)     # borrow
    first_borrow = inv_bit3 & ~u8bit[2]
    adv_bit7 = advance(u8bit[7])
    hi[7] |= scope43 & advance(adv_bit7 ^ first_borrow)
    second_borrow = first_borrow & ~adv_bit7
    hi[6] |= scope43 & advance(advance(u8bit[6]) ^ second_borrow)

    # Remaining low-byte bits of the first surrogate: advanced bits 4-7,
    # then bits 2-3 read directly at the scope43 position.
    for i in range(2, 6):
        lo[i] |= scope43 & advance(u8bit[i + 2])
    lo[6] |= scope43 & u8bit[2]
    lo[7] |= scope43 & u8bit[3]

    return (hi, lo, u8.prefix | u8.scope32 | u8.scope42)
---|
| 174 | |
---|
| 175 | |
---|
| 176 | |
---|
#
# Messages to duplicate u8u16 error reporting.
#
def IllegalSequenceMessage(pos):
    """Return the newline-terminated diagnostic for an illegal sequence at *pos*."""
    template = "Illegal UTF-8 sequence at position %i in source.\n"
    return template % pos
---|
| 182 | |
---|
def IncompleteSequenceMessage(pos):
    """Return the newline-terminated diagnostic for input ending mid-sequence at *pos*."""
    template = "EOF with incomplete UTF-8 sequence at position %i in source.\n"
    return template % pos
---|
| 185 | |
---|
| 186 | |
---|
| 187 | import sys |
---|
def main():
    """Convert the UTF-8 file named on the command line to UTF-16.

    Usage: ``u8u16.py u8file [u16file]``.  With exactly two arguments the
    result is written to *u16file*; otherwise it goes to standard output.

    When validation reports an error, a diagnostic is written to standard
    error and only the input before the reported position is converted.

    Raises:
        SystemExit: with status 1 when no input file is given.
    """
    if len(sys.argv) < 2:
        sys.stderr.write("Usage: u8u16.py u8file [u16file]\n")
        # A bare `exit` expression does nothing; without a real exit the
        # code below would fail on sys.argv[1] with an IndexError.
        sys.exit(1)

    to_file = len(sys.argv) == 3
    # NOTE(review): the data written looks like raw UTF-16 bytes; text mode
    # "w" may translate line endings on some platforms -- confirm whether
    # "wb" is appropriate for the Python version this runs under.
    outfile = open(sys.argv[2], "w") if to_file else sys.stdout
    try:
        u8data = bitutil.readfile(sys.argv[1])
        u8len = len(u8data)
        (u8bit, EOF_mask) = bitutil.transpose_streams(u8data)
        (u8, control, lex) = byteclass.classify_bytes(u8bit)
        u8 = validate_utf8(u8)

        if u8.error != 0:
            err_pos = bitutil.count_leading_zeroes(u8.error)
            # Decide EOF vs. illegal before err_pos is moved backward.
            at_EOF = err_pos == len(u8data)
            # Back up to the lead byte when one of the preceding bytes is
            # large enough to be a prefix (>= 0xC0 one back, >= 0xE0 two
            # back, >= 0xF0 three back).
            if err_pos >= 1 and ord(u8data[err_pos - 1]) >= 0xC0:
                err_pos -= 1
            elif err_pos >= 2 and ord(u8data[err_pos - 2]) >= 0xE0:
                err_pos -= 2
            elif err_pos >= 3 and ord(u8data[err_pos - 3]) >= 0xF0:
                err_pos -= 3
            if at_EOF:
                sys.stderr.write(IncompleteSequenceMessage(err_pos))
            else:
                sys.stderr.write(IllegalSequenceMessage(err_pos))
            # Convert only the input that precedes the reported position.
            u8len = err_pos

        (u16hi, u16lo, delmask) = u8u16(u8, u8bit)
        U16H = bitutil.filter_bytes(bitutil.inverse_transpose(u16hi, u8len), delmask)
        U16L = bitutil.filter_bytes(bitutil.inverse_transpose(u16lo, u8len), delmask)
        U16final = bitutil.merge_bytes(U16H, U16L)
        outfile.write(U16final)
    finally:
        # Close only a file this function opened; leave sys.stdout usable.
        if to_file:
            outfile.close()
        else:
            outfile.flush()
---|
| 218 | |
---|
# Run the converter only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
---|
| 220 | |
---|
| 221 | |
---|