Changeset 6181


Ignore:
Timestamp:
Oct 29, 2018, 8:32:47 AM (5 months ago)
Author:
cameron
Message:

Enabling Unicode Level 2 matching under canonical and compatibility equivalence: -U2 flag

Location:
icGREP/icgrep-devel/icgrep/re
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/re/Unicode/decomposition.cpp

    r6178 r6181  
    187187    std::vector<RE *> list;
    188188    unsigned i = 0;
     189    bool unchanged = true;
    189190    while (i < size) {
    190191        std::u32string stringPiece = getStringPiece(seq, i);
     
    192193            std::u32string s;
    193194            NFD_append(s, stringPiece);
     195            if (s != stringPiece) unchanged = false;
    194196            list.push_back(u32string2re(s));
    195197            i += stringPiece.size();
    196198        } else {
    197             list.push_back(transform((*seq)[i]));
     199            RE * r = (*seq)[i];
     200            RE * t = transform(r);
     201            if (t != r) unchanged = false;
     202            list.push_back(t);
    198203            i++;
    199204        }
    200205    }
     206    if (unchanged) return seq;
    201207    return makeSeq(list.begin(), list.end());
    202208}
  • icGREP/icgrep-devel/icgrep/re/Unicode/equivalence.cpp

    r6180 r6181  
    141141    std::vector<RE *> list;
    142142    unsigned i = 0;
     143    bool changed = false;
    143144    while (i < size) {
    144145        std::u32string stringPiece = getStringPiece(seq, i);
    145146        if (stringPiece.size() > 0) {
     147            RE * e = addEquivalents(stringPiece);
     148            if (Seq * t = dyn_cast<Seq>(e)) {
     149                unsigned tsize = t->size();
     150                if ((tsize != size) || (getStringPiece(t,0) != stringPiece)) changed = true;
     151            } else changed = true;
    146152            list.push_back(addEquivalents(stringPiece));
    147153            i += stringPiece.size();
    148154        } else {
    149             list.push_back(transform((*seq)[i]));
     155            RE * r = (*seq)[i];
     156            RE * t = transform(r);
     157            if (t != r) changed = true;
     158            list.push_back(t);
    150159            i++;
    151160        }
    152161    }
     162    if (!changed) return seq;
    153163    return makeSeq(list.begin(), list.end());
    154164}
     
    449459        }
    450460    }
    451     if (addedCC.empty()) return cc;
     461    if ((addedCC - *cc).empty()) return cc;
    452462    return makeCC(*cc + addedCC);
    453463}
  • icGREP/icgrep-devel/icgrep/re/re_toolchain.cpp

    r6178 r6181  
    3737#include <re/grapheme_clusters.h>
    3838#include <re/validation.h>
     39#include <re/Unicode/decomposition.h>
     40#include <re/Unicode/equivalence.h>
    3941#include <llvm/Support/raw_ostream.h>
    4042#include <llvm/Support/ErrorHandling.h>
     
    6567                              CL_ENUM_VAL_SENTINEL), cl::cat(RegexOptions));
    6668
     69   
     70static cl::opt<bool> UnicodeLevel2("U2", cl::desc("Enable Unicode Level matching under canonical and compatible (?K) equivalence."), cl::cat(RegexOptions));
     71
    6772bool LLVM_READONLY PrintOptionIsSet(RE_PrintFlags flag) {
    6873    return PrintOptions.isSet(flag);
     
    8691    r = re::resolveUnicodeNames(r);
    8792    validateNamesDefined(r);
    88     r = resolveCaseInsensitiveMode(r, globallyCaseInsensitive);
     93    if (UnicodeLevel2 && validateAlphabet(&cc::Unicode, r)) {
     94        r = UCD::NFD_Transformer().transformRE(r);
     95        r = UCD::addClusterMatches(r);
     96        r = UCD::addEquivalentCodepoints(r);
     97    } else {
     98        r = resolveCaseInsensitiveMode(r, globallyCaseInsensitive);
     99    }
    89100    return r;
    90101}
     
    98109}
    99110
    100 RE * regular_expression_passes(RE * r) {
     111RE * regular_expression_passes(RE * re) {
    101112
    102113    //Optimization passes to simplify the AST.
    103     r = removeNullablePrefix(r);
     114    RE * r = removeNullablePrefix(re);
    104115    r = removeNullableSuffix(r);
    105116    r = RE_Star_Normal().transformRE(r);
Note: See TracChangeset for help on using the changeset viewer.