Changeset 4615 for icGREP/icgrep-devel


Ignore:
Timestamp:
Jun 23, 2015, 4:08:57 PM (4 years ago)
Author:
nmedfort
Message:

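Temporary check-in: use re::codepoint_t in place of the local codepoint_t typedef in CaseFolding_txt.h; make caseInsensitiveInsert an inline wrapper in the header; rename UTF8_Encoder helpers (isUTF8Prefix -> isPrefix, lenUTF8 -> length, u8byte -> encodingByte); add minCodePointWithCommonBytes and maxCodePointWithCommonBytes.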

Location:
icGREP/icgrep-devel/icgrep
Files:
4 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/UCD/CaseFolding_txt.cpp

    r4612 r4615  
    99#include <algorithm>
    1010
    11 int findFoldEntry(codepoint_t cp) {
     11using namespace re;
     12
     13int findFoldEntry(const codepoint_t cp) {
    1214    int lo = 0;
    1315    int hi = foldTableSize;
     
    2426}
    2527
    26 void caseInsensitiveInsert(re::CC * cc, codepoint_t cp) {
    27     caseInsensitiveInsertRange(cc, cp, cp);
    28 }
    29 
    30 void caseInsensitiveInsertRange(re::CC * cc, codepoint_t lo, codepoint_t hi) {
     28void caseInsensitiveInsertRange(CC * cc, const codepoint_t lo, const codepoint_t hi) {
    3129    cc->insert_range(lo, hi);
    3230    // Find the first foldTable entry overlapping the (lo, hi) range.
  • icGREP/icgrep-devel/icgrep/UCD/CaseFolding_txt.h

    r4316 r4615  
    1414#include "re/re_cc.h"
    1515
    16 typedef unsigned codepoint_t;
    17 
    1816struct FoldEntry {
    19     codepoint_t range_lo;
     17    re::codepoint_t range_lo;
    2018    int fold_offset;
    21     std::vector<std::pair<codepoint_t, codepoint_t> > fold_pairs;
     19    std::vector<std::pair<re::codepoint_t, re::codepoint_t> > fold_pairs;
    2220};
    2321
    24 void caseInsensitiveInsert(re::CC * cc, codepoint_t cp);
    25 
    26 void caseInsensitiveInsertRange(re::CC * cc, codepoint_t lo, codepoint_t hi);
    27    
     22void caseInsensitiveInsertRange(re::CC * cc, const re::codepoint_t lo, const re::codepoint_t hi);
     23
     24inline void caseInsensitiveInsert(re::CC * cc, const re::codepoint_t cp) {
     25    caseInsensitiveInsertRange(cc, cp, cp);
     26}
    2827
    2928const int foldTableSize = 216;
  • icGREP/icgrep-devel/icgrep/utf8_encoder.cpp

    r4614 r4615  
    3939
    4040RE * UTF8_Encoder::rangeToUTF8(const interval_t & item) {
    41     const auto min = lenUTF8(lo_codepoint(item));
    42     const auto max = lenUTF8(hi_codepoint(item));
     41    const auto min = length(lo_codepoint(item));
     42    const auto max = length(hi_codepoint(item));
    4343    if (min < max) {
    4444        const auto m = maxCodePoint(min);
    45         return makeAlt({rangeToUTF8(interval_t(lo_codepoint(item), m)), rangeToUTF8(interval_t(m + 1, hi_codepoint(item)))});
     45        return makeAlt({rangeToUTF8(std::make_pair(lo_codepoint(item), m)), rangeToUTF8(std::make_pair(m + 1, hi_codepoint(item)))});
    4646    }
    4747    else {
     
    5252RE * UTF8_Encoder::rangeToUTF8(const codepoint_t lo, const codepoint_t hi, const unsigned index, const unsigned max)
    5353{
    54     const codepoint_t hbyte = u8byte(hi, index);
    55     const codepoint_t lbyte = u8byte(lo, index);
     54    const codepoint_t hbyte = encodingByte(hi, index);
     55    const codepoint_t lbyte = encodingByte(lo, index);
    5656    if (index == max) {
    5757        return makeByteRange(lbyte, hbyte);
     
    7676}
    7777
    78 inline bool UTF8_Encoder::isUTF8Prefix(const codepoint_t cp) {
     78bool UTF8_Encoder::isPrefix(const codepoint_t cp) {
    7979    return (cp >= 0xC2) && (cp <= 0xF4);
    8080}
    8181
    82 inline codepoint_t UTF8_Encoder::u8byte(const codepoint_t cp, const unsigned n) {
     82codepoint_t UTF8_Encoder::encodingByte(const codepoint_t cp, const unsigned n) {
    8383    codepoint_t retVal = 0;
    84     const unsigned len = lenUTF8(cp);
     84    const unsigned len = length(cp);
    8585    if (n == 1) {
    8686        switch (len) {
     
    9797}
    9898
    99 inline unsigned UTF8_Encoder::lenUTF8(const codepoint_t cp) {
     99unsigned UTF8_Encoder::length(const codepoint_t cp) {
    100100    if (cp <= 0x7F) {
    101101        return 1;
     
    112112}
    113113
    114 inline codepoint_t UTF8_Encoder::maxCodePoint(const unsigned length) {
     114codepoint_t UTF8_Encoder::maxCodePoint(const unsigned length) {
    115115    if (length == 1) {
    116116        return 0x7F;
     
    128128}
    129129
    130 inline bool UTF8_Encoder::isLowCodePointAfterByte(const codepoint_t cp, const unsigned index) {
    131     const auto l = lenUTF8(cp);
    132     for (auto i = index; i != l; ++i) {
    133         if (u8byte(cp, i + 1) != 0x80) {
     130bool UTF8_Encoder::isLowCodePointAfterByte(const codepoint_t cp, const unsigned n) {
     131    const auto l = length(cp);
     132    for (auto i = n; i != l; ++i) {
     133        if (encodingByte(cp, i + 1) != 0x80) {
    134134            return false;
    135135        }
     
    138138}
    139139
    140 inline bool UTF8_Encoder::isHighCodePointAfterByte(const codepoint_t cp, const unsigned index) {
    141     const auto l = lenUTF8(cp);
    142     for (auto i = index; i != l; ++i) {
    143         if (u8byte(cp, i + 1) != 0xBF) {
     140bool UTF8_Encoder::isHighCodePointAfterByte(const codepoint_t cp, const unsigned n) {
     141    const auto l = length(cp);
     142    for (auto i = n; i != l; ++i) {
     143        if (encodingByte(cp, i + 1) != 0xBF) {
    144144            return false;
    145145        }
     
    148148}
    149149
     150codepoint_t UTF8_Encoder::minCodePointWithCommonBytes(const re::codepoint_t cp, const unsigned n) {
     151    const auto len = length(cp);
     152    const auto mask = (static_cast<codepoint_t>(1) << (len - n) * 6) - 1;
     153    const auto lo_cp = cp &~ mask;
     154    return (lo_cp == 0) ? mask + 1 : lo_cp;
     155}
     156
     157codepoint_t UTF8_Encoder::maxCodePointWithCommonBytes(const re::codepoint_t cp, const unsigned n) {
     158    const auto len = length(cp);
     159    const auto mask = (static_cast<codepoint_t>(1) << (len - n) * 6) - 1;
     160    return cp | mask;
     161}
    150162
    151163inline CC * UTF8_Encoder::makeByteRange(const codepoint_t lo, const codepoint_t hi) {
  • icGREP/icgrep-devel/icgrep/utf8_encoder.h

    r4614 r4615  
    2020    static re::RE * toUTF8(CC_NameMap & nameMap, re::RE * ast);
    2121
    22     static bool isUTF8Prefix(const re::codepoint_t cp);
    23     static unsigned lenUTF8(const re::codepoint_t cp);
     22    static bool isPrefix(const re::codepoint_t cp);
     23    static unsigned length(const re::codepoint_t cp);
    2424    static re::codepoint_t maxCodePoint(const unsigned length);
    25     static re::codepoint_t u8byte(const re::codepoint_t cp, const unsigned n);
    26     static bool isLowCodePointAfterByte(const re::codepoint_t cp, const unsigned index);
    27     static bool isHighCodePointAfterByte(const re::codepoint_t cp, const unsigned index);
     25    static re::codepoint_t encodingByte(const re::codepoint_t cp, const unsigned n);
     26    static bool isLowCodePointAfterByte(const re::codepoint_t cp, const unsigned n);
     27    static bool isHighCodePointAfterByte(const re::codepoint_t cp, const unsigned n);
     28    static re::codepoint_t minCodePointWithCommonBytes(const re::codepoint_t cp, const unsigned n);
     29    static re::codepoint_t maxCodePointWithCommonBytes(const re::codepoint_t cp, const unsigned n);
    2830private:
    2931    static re::RE * rangeToUTF8(const re::interval_t & item);
Note: See TracChangeset for help on using the changeset viewer.