Changeset 4333 for icGREP


Ignore:
Timestamp:
Dec 11, 2014, 6:28:53 AM (5 years ago)
Author:
cameron
Message:

Clean up UTF-8 parser

File:
1 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/re/re_parser.cpp

    r4332 r4333  
    421421codepoint_t RE_Parser::parse_utf8_codepoint() {
    422422    // Must cast to unsigned char to avoid sign extension.
    423     unsigned char c = static_cast<unsigned char>(*_cursor++);
    424     codepoint_t cp = c;
    425     if (c > 0x80) { // if non-ascii
    426         if (c < 0xC2) {
     423    unsigned char pfx = static_cast<unsigned char>(*_cursor++);
     424    codepoint_t cp = pfx;
     425    if (pfx < 0x80) return cp;
     426    unsigned suffix_bytes;
     427    if (pfx < 0xE0) {
     428        if (pfx < 0xC2) {  // bare suffix or illegal prefix 0xC0 or 0xC1
    427429            throw InvalidUTF8Encoding();
    428430        }
    429         else { // [0xC2, 0xFF]
    430             unsigned bytes = 0;
    431             if (c < 0xE0) { // [0xC2, 0xDF]
    432                 cp &= 0x1F;
    433                 bytes = 1;
    434             }
    435             else if (c < 0xF0) { // [0xE0, 0xEF]
    436                 cp &= 0x0F;
    437                 bytes = 2;
    438             }
    439             else { // [0xF0, 0xFF]
    440                 cp &= 0x0F;
    441                 bytes = 3;
    442             }
    443             while (bytes--) {
    444                 if (_cursor == _end) {
    445                     throw InvalidUTF8Encoding();
    446                 }
    447                 c = static_cast<unsigned char>(*_cursor++);
    448                 if ((c & 0xC0) != 0x80) {
    449                     throw InvalidUTF8Encoding();
    450                 }
    451                 cp = (cp << 6) | (c & 0x3F);
    452                 // It is an error if a 3-byte sequence is used to encode a codepoint < 0x800
    453                 // or a 4-byte sequence is used to encode a codepoint < 0x10000.
    454                 if (((bytes == 1) && (cp < 0x20)) || ((bytes == 2) && (cp < 0x10))) {
    455                     throw InvalidUTF8Encoding();
    456                 }
    457             }
    458         }
     431        suffix_bytes = 1;
     432        cp &= 0x1F;
     433    }
     434    else if (pfx < 0xF0) { // [0xE0, 0xEF]
     435        cp &= 0x0F;
     436        suffix_bytes = 2;
     437    }
     438    else { // [0xF0, 0xFF]
     439        cp &= 0x0F;
     440        suffix_bytes = 3;
     441    }
     442    while (suffix_bytes--) {
     443        if (_cursor == _end) {
     444            throw InvalidUTF8Encoding();
     445        }
     446        unsigned char sfx = static_cast<unsigned char>(*_cursor++);
     447        if ((sfx & 0xC0) != 0x80) {
     448            throw InvalidUTF8Encoding();
     449        }
     450        cp = (cp << 6) | (sfx & 0x3F);
     451    }
     452    // It is an error if a 3-byte sequence is used to encode a codepoint < 0x800
     453    // or a 4-byte sequence is used to encode a codepoint < 0x10000.
     454    if ((pfx == 0xE0 && cp < 0x800) || (pfx == 0xF0 && cp < 0x10000)) {
     455        throw InvalidUTF8Encoding();
    459456    }
    460457    // It is an error if a 4-byte sequence is used to encode a codepoint
Note: See TracChangeset for help on using the changeset viewer.