Changeset 4193


Ignore:
Timestamp:
Sep 26, 2014, 9:01:36 AM (5 years ago)
Author:
cameron
Message:

More UTF-8 validation

File:
1 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/re/re_parser.cpp

    r4187 r4193  
    245245                }
    246246                c = (c << 6) | static_cast<unsigned>(*_cursor & 0x3F);
    247             }
    248         }
    249     }
     247                // It is an error if a 3-byte sequence is used to encode a codepoint < 0x800
     248                // or a 4-byte sequence is used to encode a codepoint < 0x10000.
     249                // if (((bytes == 1) && (c < 0x20)) || ((bytes == 2) && (c < 0x10))) {
     250                if ((c << (bytes - 1)) < 0x20) {
     251                    throw InvalidUTF8Encoding();
     252                }
     253                 
     254            }
     255        }
     256    }
     257    // It is an error if a 4-byte sequence is used to encode a codepoint
     258    // above the Unicode maximum.   
     259    if (c > 0x10FFFF) throw InvalidUTF8Encoding();
    250260    return c;
    251261}
     
    286296                if (start == _cursor) {
    287297                    negated = true;
    288                     start = ++_cursor; // move the start ahead incase the next character is a [ or -
     298                    start = ++_cursor; // move the start ahead in case the next character is a ] or -
    289299                    literal = false;                   
    290300                }
Note: See TracChangeset for help on using the changeset viewer.