Changeset 169 for trunk/src/bitlex.c


Ignore:
Timestamp:
Jun 24, 2008, 5:30:22 PM (11 years ago)
Author:
lindanl
Message:

UTF-16/32 validation.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/src/bitlex.c

    r133 r169  
    699699
    700700void UTF_16_Lexer::Do_CharsetValidation() {
    701 #ifdef X16HILO_ACCESS
    702         int packs = (buffer_units - 1)/PACKSIZE + 1;
    703         BytePack surrogate_select;
    704         BytePack hi_surrogate;
    705         BytePack lo_surrogate;
    706         BytePack hi_surrogate_pending = simd_const_8(0);
    707         BytePack surrogate_scope;
    708         BytePack u16_surrogate_accum = simd_const_8(0);
    709         BytePack u16_FFFE_FFFF_accum = simd_const_8(0);
    710         BytePack u16_FFFE_FFFF;
    711         for (int pk = 0; pk < packs; pk++) {
    712                 /* UTF-16 code units in the range D800-DBFF and DC00-DFFF are
    713                    reserved for the first and second elements, respectively
    714                    of surrogate pairs.  Validation requires that these values
    715                    only occur in well-formed pairs. */
    716                 surrogate_select = simd_and(x16hi[pk], simd_const_8(0xDC));
    717                 hi_surrogate = simd_eq_8(surrogate_select, simd_const_8(0xD8));
    718                 lo_surrogate = simd_eq_8(surrogate_select, simd_const_8(0xDC));
    719                 surrogate_scope = simd_or(hi_surrogate_pending,
    720                                           sisd_sfli(hi_surrogate, 8));
    721                 u16_surrogate_accum = simd_or(u16_surrogate_accum,
    722                                               simd_xor(surrogate_scope, lo_surrogate));
    723                 hi_surrogate_pending = sisd_sbli(hi_surrogate, 8 * (PACKSIZE-1));
    724                 /* The values FFFE and FFFF are excluded. */
    725                 u16_FFFE_FFFF = simd_eq_8(simd_and(x16hi[pk],
    726                                                    simd_or(x16lo[pk], simd_const_8(1))),
    727                                           simd_const_8(0xFF));
    728                 u16_FFFE_FFFF_accum = simd_or(u16_FFFE_FFFF_accum, u16_FFFE_FFFF);
    729         }
    730         u16_surrogate_accum = simd_or(u16_surrogate_accum, hi_surrogate_pending);
    731         if (bitblock_has_bit(simd_or(u16_surrogate_accum, u16_FFFE_FFFF_accum))) {
    732                 CharSetValidationError("UTF-16");
    733         }
    734 #endif
    735 #ifndef X16HILO_ACCESS
    736         printf("UTF_16_Lexer::Do_CharsetValidation not yet complete; assuming OK.\n");
    737 #endif
    738701};
    739702
    740703
    741704void UCS_2_Lexer::Do_CharsetValidation() {
    742 #ifdef X16HILO_ACCESS
    743         int packs = (buffer_units - 1)/PACKSIZE + 1;
    744         BytePack u16_surrogate_accum = simd_const_8(0);
    745         BytePack u16_FFFE_FFFF_accum = simd_const_8(0);
    746         BytePack u16_FFFE_FFFF;
    747         for (int pk = 0; pk < packs; pk++) {
    748                 /* The high byte of UCS-2 code units cannot be in the range D8-DF.
    749                    This corresponds to the D800-DFFF range of illegal codepoints
    750                    reserved for UTF-16 surrogate pairs. Accumulate the results.
    751                    To check, 0x20 is added to each such octet, mapping the D8-DF
    752                    range to F8-FF and wrapping E0-FF values around.  The max value
    753                    is then accumulated.  */
    754                 u16_surrogate_accum =
    755                         simd_max_8(u16_surrogate_accum,
    756                                    simd_add_8(x16hi[pk], simd_const_8(0x20)));
    757                 /* The values FFFE and FFFF are excluded. */
    758                 u16_FFFE_FFFF = simd_eq_8(simd_and(x16hi[pk],
    759                                                    simd_or(x16lo[pk], simd_const_8(1))),
    760                                           simd_const_8(0xFF));
    761                 u16_FFFE_FFFF_accum = simd_or(u16_FFFE_FFFF_accum, u16_FFFE_FFFF);
    762         }
    763         u16_surrogate_accum = simd_eq_8(simd_or(u16_surrogate_accum, simd_const_8(0x07)),
    764                                         simd_const_8(0xFF));
    765         if (bitblock_has_bit(simd_or(u16_surrogate_accum, u16_FFFE_FFFF_accum)))
    766                 CharSetValidationError("UCS-2");
    767         }
    768 #endif
    769 #ifndef X16HILO_ACCESS
    770         printf("UCS_2_Lexer::Do_CharsetValidation not yet complete; assuming OK.\n");
    771 #endif
    772705};
    773706
    774707
    775708void UTF_32_Lexer::Do_CharsetValidation() {
    776 #ifdef X32BYTEPLEX_ACCESS
    777         int packs = (buffer_units - 1)/PACKSIZE + 1;
    778         BytePack u32hh_accum = simd_const_8(0);
    779         BytePack u32hl_accum = simd_const_8(0);
    780         BytePack u32_surrogate_accum = simd_const_8(0);
    781         BytePack u32_FFFE_FFFF_accum = simd_const_8(0);
    782         BytePack u32_BMP_select;
    783         BytePack u32l_FFFE_FFFF;
    784         for (int pk = 0; pk < packs; pk++) {
    785                 /* There can be no bits set in the high octet; "or" together
    786                    all octet values to check for any bit set. */
    787                 u32hh_accum = simd_or(u32hh_accum, x32hh[pk]);
    788                 /* The second octet has a max value of 0x10, corresponding to the
    789                    maximum Unicode code point value of 0x10FFFF.  Accumulate the
    790                    maximum of all u32hl values observed. */
    791                 u32hl_accum = simd_max_8(u32hl_accum, x32hl[pk]);
    792                 /* The third octet cannot be in the range D8-DF if the second octet
    793                    is 0.  This corresponds to the D800-DFFF range of illegal codepoints
    794                    reserved for UTF-16 surrogate pairs. Accumulate the results.
    795                    To check, 0x20 is added to each such octet, mapping the D8-DF
    796                    range to F8-FF and wrapping E0-FF values around.  The max value
    797                    is then accumulated.  */
    798                 u32_BMP_select = simd_eq_8(x32hl[pk], simd_const_8(0));
    799                 u32_surrogate_accum =
    800                         simd_max_8(u32_surrogate_accum,
    801                                    simd_and(u32_BMP_select,
    802                                             simd_add_8(x32lh[pk], simd_const_8(0x20))));
    803                 /* The low two octets cannot have the value FFFE or FFFF if
    804                    we're in the BMP (second octet is 0). */
    805                 u32l_FFFE_FFFF = simd_eq_8(simd_and(x32lh[pk],
    806                                                     simd_or(x32ll[pk], simd_const_8(1))),
    807                                            simd_const_8(0xFF));
    808                 u32_FFFE_FFFF_accum = simd_or(u32_FFFE_FFFF_accum,
    809                                               simd_and(u32_BMP_select, u32l_FFFE_FFFF));
    810         }
    811         u32hl_accum = simd_gt_8(u32hl_accum, simd_const_8(0x10));
    812         u32_surrogate_accum = simd_eq_8(simd_or(u32_surrogate_accum, simd_const_8(0x07)),
    813                                         simd_const_8(0xFF));
    814         if (bitblock_has_bit(simd_or(simd_or(u32hh_accum, u32hl_accum),
    815                                          simd_or(u32_surrogate_accum, u32_FFFE_FFFF_accum)))) {
    816                 CharSetValidationError("UTF-32");
    817         }
    818 #endif
    819 #ifndef X32BYTEPLEX_ACCESS
    820         printf("UTF_32_Lexer::Do_CharsetValidation not yet complete; assuming OK.\n");
    821 #endif
    822709};
    823710
Note: See TracChangeset for help on using the changeset viewer.