Ignore:
Timestamp:
Jun 23, 2015, 4:08:57 PM (4 years ago)
Author:
nmedfort
Message:

Temporary check-in

File:
1 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/utf8_encoder.cpp

    r4614 r4615  
    3939
    4040RE * UTF8_Encoder::rangeToUTF8(const interval_t & item) {
    41     const auto min = lenUTF8(lo_codepoint(item));
    42     const auto max = lenUTF8(hi_codepoint(item));
     41    const auto min = length(lo_codepoint(item));
     42    const auto max = length(hi_codepoint(item));
    4343    if (min < max) {
    4444        const auto m = maxCodePoint(min);
    45         return makeAlt({rangeToUTF8(interval_t(lo_codepoint(item), m)), rangeToUTF8(interval_t(m + 1, hi_codepoint(item)))});
     45        return makeAlt({rangeToUTF8(std::make_pair(lo_codepoint(item), m)), rangeToUTF8(std::make_pair(m + 1, hi_codepoint(item)))});
    4646    }
    4747    else {
     
    5252RE * UTF8_Encoder::rangeToUTF8(const codepoint_t lo, const codepoint_t hi, const unsigned index, const unsigned max)
    5353{
    54     const codepoint_t hbyte = u8byte(hi, index);
    55     const codepoint_t lbyte = u8byte(lo, index);
     54    const codepoint_t hbyte = encodingByte(hi, index);
     55    const codepoint_t lbyte = encodingByte(lo, index);
    5656    if (index == max) {
    5757        return makeByteRange(lbyte, hbyte);
     
    7676}
    7777
    78 inline bool UTF8_Encoder::isUTF8Prefix(const codepoint_t cp) {
     78bool UTF8_Encoder::isPrefix(const codepoint_t cp) {
    7979    return (cp >= 0xC2) && (cp <= 0xF4);
    8080}
    8181
    82 inline codepoint_t UTF8_Encoder::u8byte(const codepoint_t cp, const unsigned n) {
     82codepoint_t UTF8_Encoder::encodingByte(const codepoint_t cp, const unsigned n) {
    8383    codepoint_t retVal = 0;
    84     const unsigned len = lenUTF8(cp);
     84    const unsigned len = length(cp);
    8585    if (n == 1) {
    8686        switch (len) {
     
    9797}
    9898
    99 inline unsigned UTF8_Encoder::lenUTF8(const codepoint_t cp) {
     99unsigned UTF8_Encoder::length(const codepoint_t cp) {
    100100    if (cp <= 0x7F) {
    101101        return 1;
     
    112112}
    113113
    114 inline codepoint_t UTF8_Encoder::maxCodePoint(const unsigned length) {
     114codepoint_t UTF8_Encoder::maxCodePoint(const unsigned length) {
    115115    if (length == 1) {
    116116        return 0x7F;
     
    128128}
    129129
    130 inline bool UTF8_Encoder::isLowCodePointAfterByte(const codepoint_t cp, const unsigned index) {
    131     const auto l = lenUTF8(cp);
    132     for (auto i = index; i != l; ++i) {
    133         if (u8byte(cp, i + 1) != 0x80) {
     130bool UTF8_Encoder::isLowCodePointAfterByte(const codepoint_t cp, const unsigned n) {
     131    const auto l = length(cp);
     132    for (auto i = n; i != l; ++i) {
     133        if (encodingByte(cp, i + 1) != 0x80) {
    134134            return false;
    135135        }
     
    138138}
    139139
    140 inline bool UTF8_Encoder::isHighCodePointAfterByte(const codepoint_t cp, const unsigned index) {
    141     const auto l = lenUTF8(cp);
    142     for (auto i = index; i != l; ++i) {
    143         if (u8byte(cp, i + 1) != 0xBF) {
     140bool UTF8_Encoder::isHighCodePointAfterByte(const codepoint_t cp, const unsigned n) {
     141    const auto l = length(cp);
     142    for (auto i = n; i != l; ++i) {
     143        if (encodingByte(cp, i + 1) != 0xBF) {
    144144            return false;
    145145        }
     
    148148}
    149149
     // Derives a code point from `cp` by clearing its low (length(cp) - n) * 6 bits.
     // NOTE(review): judging by the name, this is presumably the smallest code point whose
     // UTF-8 encoding shares its first n bytes with cp (6 payload bits per trailing byte) -- confirm.
     // If clearing those bits leaves zero, mask + 1 is returned instead of 0.
     // NOTE(review): length() returns unsigned and n is unsigned, so n > length(cp) would wrap
     // the shift count; presumably callers guarantee n <= length(cp) -- verify against callers.
     150codepoint_t UTF8_Encoder::minCodePointWithCommonBytes(const re::codepoint_t cp, const unsigned n) {
     151    const auto len = length(cp);
     // '*' binds tighter than '<<', so the shift count is (len - n) * 6; mask has that many low 1-bits.
     152    const auto mask = (static_cast<codepoint_t>(1) << (len - n) * 6) - 1;
     // Keep only the bits above the mask.
     153    const auto lo_cp = cp &~ mask;
     // A zero result is replaced by the first value above the mask.
     154    return (lo_cp == 0) ? mask + 1 : lo_cp;
     155}
     156
     // Derives a code point from `cp` by setting its low (length(cp) - n) * 6 bits to 1.
     // NOTE(review): judging by the name, this is presumably the largest code point whose
     // UTF-8 encoding shares its first n bytes with cp (6 payload bits per trailing byte) -- confirm.
     // NOTE(review): length() returns unsigned and n is unsigned, so n > length(cp) would wrap
     // the shift count; presumably callers guarantee n <= length(cp) -- verify against callers.
     157codepoint_t UTF8_Encoder::maxCodePointWithCommonBytes(const re::codepoint_t cp, const unsigned n) {
     158    const auto len = length(cp);
     // '*' binds tighter than '<<', so the shift count is (len - n) * 6; mask has that many low 1-bits.
     159    const auto mask = (static_cast<codepoint_t>(1) << (len - n) * 6) - 1;
     // Fill every bit covered by the mask.
     160    return cp | mask;
     161}
    150162
    151163inline CC * UTF8_Encoder::makeByteRange(const codepoint_t lo, const codepoint_t hi) {
Note: See TracChangeset for help on using the changeset viewer.