Ignore:
Timestamp:
Jul 14, 2015, 10:46:10 PM (4 years ago)
Author:
nmedfort
Message:

Moved responsibility of handling 'special cases of Unicode TR #18' and 'compatibility properties of UTR #18 Annex C' into RE_Parser.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/re/re_parser.cpp

    r4660 r4671  
    1717#include <re/parsefailure.h>
    1818#include <UCD/CaseFolding_txt.h>
     19#include <sstream>
    1920#include <algorithm>
    2021
     
    366367    throw_incomplete_expression_error_if_end_of_stream();
    367368    if (isSetEscapeChar(*_cursor))
    368       return parse_escaped_set();
     369      return parseEscapedSet();
    369370    else
    370371      return build_CC(parse_escaped_codepoint());
    371372}
    372373
    373 RE * makeDigitSet() {
    374     return makeName("Nd", Name::Type::UnicodeProperty);
    375 }
    376 
    377 RE * makeWhitespaceSet() {
    378     return makeName("Whitespace", Name::Type::UnicodeProperty);
    379 }
    380 
    381 RE * makeWordSet() {
    382     return makeName("word", Name::Type::UnicodeProperty);
    383 }
    384 
    385 RE * makeComplement(RE * s) {
    386   return makeDiff(makeAny(), s);
    387 }
    388 
    389 RE * makeWordBoundary () {
    390     RE * wordC = makeWordSet();
    391     return makeAlt({makeSeq({makeNegativeLookBehindAssertion(wordC), makeLookAheadAssertion(wordC)}),
    392                     makeSeq({makeLookBehindAssertion(wordC), makeNegativeLookAheadAssertion(wordC)})});
    393 }
    394 
    395 RE * makeWordNonBoundary () {
    396     RE * wordC = makeWordSet();
    397     return makeAlt({makeSeq({makeNegativeLookBehindAssertion(wordC), makeNegativeLookAheadAssertion(wordC)}),
    398                     makeSeq({makeLookBehindAssertion(wordC), makeLookAheadAssertion(wordC)})});
    399 }
    400 
    401 RE * RE_Parser::parse_escaped_set() {
     374RE * RE_Parser::parseEscapedSet() {
    402375    bool complemented = false;
    403376    RE * s;
     
    433406            if (_cursor == _end || *_cursor != '{') throw ParseFailure("Malformed property expression");
    434407            ++_cursor;
    435             s = parse_property_expression();
     408            s = parsePropertyExpression();
    436409            if (_cursor == _end || *_cursor != '}') throw ParseFailure("Malformed property expression");
    437410            ++_cursor;
    438             if (complemented) return makeComplement(s);
    439             else return s;
     411            return complemented ? makeComplement(s) : s;
    440412        default:
    441413            throw ParseFailure("Internal error");
     
    487459}
    488460
    489 Name * RE_Parser::parse_property_expression() {
     // Builds a normalized copy of the character range [begin, end):
     // underscores, spaces and hyphens are dropped, and every other character
     // is lower-cased through std::tolower using a default-constructed
     // std::locale. Returns the accumulated text as a std::string.
     461std::string RE_Parser::canonicalize(const cursor_t begin, const cursor_t end) {
     462    std::locale loc;
     463    std::stringstream s;
     464    for (auto i = begin; i != end; ++i) {
     465        switch (*i) {
                // Separator characters are skipped and never reach the output.
     466            case '_': case ' ': case '-':
     467                break;
     468            default:
     469                s << std::tolower(*i, loc);
     470        }
     471    }
     472    return s.str();
     473}
     474
     475Name * RE_Parser::parsePropertyExpression() {
    490476    const cursor_t start = _cursor;
    491477    while (_cursor != _end && *_cursor != '}' and *_cursor != ':' and *_cursor != '=') {
     
    500486        }
    501487        // We have a property-name = value expression
    502         return makeName(std::string(start, prop_end), std::string(val_start, _cursor), Name::Type::UnicodeProperty);
    503     }
    504     return makeName(std::string(start, _cursor), Name::Type::UnicodeProperty);
     488        return resolvePropertyExpression(canonicalize(start, prop_end), canonicalize(val_start, _cursor));
     489    }
     490    return resolvePropertyExpression(canonicalize(start, _cursor));
     491}
     492
     // Resolves a single-name Unicode property expression to a Name node.
     //
     // The result is memoized in mNameMap under the key ("", value): a second
     // request for the same value returns the previously created Name. On a
     // miss, a new Name of type UnicodeProperty is created; for the specific
     // values listed below a definition is attached to it via setDefinition()
     // before it is stored in the map. Any other value is stored with no
     // definition set here.
     //
     // The caller visible in this file passes the output of canonicalize(),
     // and the branch comparisons below use lower-case names.
     493Name * RE_Parser::resolvePropertyExpression(std::string value) {
     494
            // Single-name properties are cached with an empty namespace string.
     495    auto key = std::make_pair("", value);
     496    auto f = mNameMap.find(key);
     497    if (f != mNameMap.end()) {
     498        return f->second;
     499    }
     500
     501    Name * property = makeName(value, Name::Type::UnicodeProperty);
     502
     503    // Try special cases of Unicode TR #18
     504    if (value == "any") {
     505        property->setDefinition(makeAny());
     506    }
     507    else if (value == "ascii") {
                // Defined as the two-part expression blk=ascii.
     508        property->setDefinition(resolvePropertyExpression("blk", "ascii"));
     509    }
     510    else if (value == "assigned") {
                // Defined as everything except the "cn" property.
     511        Name * unassigned = resolvePropertyExpression("cn");
     512        property->setDefinition(makeDiff(makeAny(), unassigned));
     513    }
     514    // Now compatibility properties of UTR #18 Annex C
     515    else if (value == "xdigit") {
     516        Name * digit = resolvePropertyExpression("nd");
     517        Name * hexdigit = resolvePropertyExpression("hexdigit");
     518        property->setDefinition(makeAlt({digit, hexdigit}));
     519    }
     520    else if (value == "alnum") {
     521        Name * digit = resolvePropertyExpression("nd");
     522        Name * alpha = resolvePropertyExpression("alphabetic");
     523        property->setDefinition(makeAlt({digit, alpha}));
     524    }
     525    else if (value == "blank") {
                // NOTE(review): "space_separator" contains an underscore, whereas
                // canonicalize() strips underscores and the other names used here
                // (e.g. "connectorpunctuation") have none. Confirm this spelling
                // is intended; a canonicalized user-supplied name would be cached
                // under a different key.
     526        Name * space_sep = resolvePropertyExpression("space_separator");
                // 0x09 is the horizontal tab code point.
     527        CC * tab = makeCC(0x09);
     528        property->setDefinition(makeAlt({space_sep, tab}));
     529    }
     530    else if (value == "graph") {
                // Defined as everything except space, control, surrogate and "cn".
     531        Name * space = resolvePropertyExpression("space");
     532        Name * ctrl = resolvePropertyExpression("control");
     533        Name * surr = resolvePropertyExpression("surrogate");
     534        Name * unassigned = resolvePropertyExpression("cn");
     535        property->setDefinition(makeDiff(makeAny(), makeAlt({space, ctrl, surr, unassigned})));
     536    }
     537    else if (value == "print") {
                // Recursively resolves "graph", which is cached by the nested call.
     538        Name * graph = resolvePropertyExpression("graph");
                // NOTE(review): same underscore spelling as in the "blank" branch.
     539        Name * space_sep = resolvePropertyExpression("space_separator");
     540        property->setDefinition(makeAlt({graph, space_sep}));
     541    }
     542    else if (value == "word") {
                // Recursively resolves "alnum", which is cached by the nested call.
     543        Name * alnum = resolvePropertyExpression("alnum");
     544        Name * mark = resolvePropertyExpression("mark");
     545        Name * conn = resolvePropertyExpression("connectorpunctuation");
     546        Name * join = resolvePropertyExpression("joincontrol");
     547        property->setDefinition(makeAlt({alnum, mark, conn, join}));
     548    }
     549
            // The new Name is stored only after any nested resolutions above
            // have completed.
     550    mNameMap.emplace(std::move(key), property);
     551
     552    return property;
     553}
     554
     // Resolves a two-part (namespaceValue, nameValue) property expression to a
     // Name node. The pair is used as the mNameMap key: a cached Name is
     // returned if present; otherwise a new Name of type UnicodeProperty is
     // created from both strings, stored in mNameMap and returned. No
     // definition is attached to the Name in this overload.
     555Name * RE_Parser::resolvePropertyExpression(std::string namespaceValue, std::string nameValue) {
     556
     557    auto key = std::make_pair(namespaceValue, nameValue);
     558
     559    auto f = mNameMap.find(key);
     560    if (f != mNameMap.end()) {
     561        return f->second;
     562    }
     563
     564
     565
     566    Name * property = makeName(namespaceValue, nameValue, Name::Type::UnicodeProperty);
     567
     568    mNameMap.emplace(std::move(key), property);
     569
     570    return property;
    505571}
    506572
     
    675741                        _cursor++;
    676742                    }
    677                     RE * posixSet = parse_property_expression();
     743                    RE * posixSet = parsePropertyExpression();
    678744                    if (negated) posixSet = makeComplement(posixSet);
    679745                    subexprs.push_back(posixSet);
     
    704770                throw_incomplete_expression_error_if_end_of_stream();
    705771                if (isSetEscapeChar(*_cursor)) {
    706                     subexprs.push_back(parse_escaped_set());
     772                    subexprs.push_back(parseEscapedSet());
    707773                    lastItemKind = SetItem;
    708774                }
     
    886952}
    887953   
    888    
    889 }
     // Returns the complement of s, built as the difference makeAny() minus s.
     954RE * RE_Parser::makeComplement(RE * s) {
     955  return makeDiff(makeAny(), s);
     956}
     957
     // Builds a word-boundary expression from the set returned by makeWordSet().
     // The result is an alternation of two sequences:
     //   - negative look-behind on the word set, then look-ahead on it
     //   - look-behind on the word set, then negative look-ahead on it
     958RE * RE_Parser::makeWordBoundary () {
     959    RE * wordC = makeWordSet();
     960    return makeAlt({makeSeq({makeNegativeLookBehindAssertion(wordC), makeLookAheadAssertion(wordC)}),
     961                    makeSeq({makeLookBehindAssertion(wordC), makeNegativeLookAheadAssertion(wordC)})});
     962}
     963
     // Builds a non-word-boundary expression from the set returned by
     // makeWordSet(). The result is an alternation of two sequences:
     //   - negative look-behind on the word set, then negative look-ahead on it
     //   - look-behind on the word set, then look-ahead on it
     964RE * RE_Parser::makeWordNonBoundary () {
     965    RE * wordC = makeWordSet();
     966    return makeAlt({makeSeq({makeNegativeLookBehindAssertion(wordC), makeNegativeLookAheadAssertion(wordC)}),
     967                    makeSeq({makeLookBehindAssertion(wordC), makeLookAheadAssertion(wordC)})});
     968}
     969
     // Returns the Name obtained by resolving the property expression "nd".
     970inline Name * RE_Parser::makeDigitSet() {
     971    return resolvePropertyExpression("nd");
     972}
     973
     // Returns the Name obtained by resolving the property expression "alnum".
     974inline Name * RE_Parser::makeAlphaNumeric() {
     975    return resolvePropertyExpression("alnum");
     976}
     977
     // Returns the Name obtained by resolving the property expression
     // "whitespace".
     978inline Name * RE_Parser::makeWhitespaceSet() {
     979    return resolvePropertyExpression("whitespace");
     980}
     981
     // Returns the Name obtained by resolving the property expression "word".
     982inline Name * RE_Parser::makeWordSet() {
     983    return resolvePropertyExpression("word");
     984}
     985
     986}
Note: See TracChangeset for help on using the changeset viewer.