Ignore:
Timestamp:
Dec 18, 2017, 1:56:51 PM (16 months ago)
Author:
cameron
Message:

RE parser restructuring; parsing symbolic ranges, collation and equivalence exprs

File:
1 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/re/re_parser_ere.cpp

    r5181 r5787  
    66
    77#include <re/re_parser_ere.h>
    8 #include <re/re_parser_helper.h>
     8#include <re/re_start.h>
     9#include <re/re_end.h>
     10#include <re/re_any.h>
     11#include <re/re_alt.h>
     12#include <re/re_seq.h>
    913
    10 namespace re{
     14namespace re {
    1115
    12     // \d and \D removed
    13     const uint64_t setEscapeCharacters = bit3C('b') | bit3C('p') | bit3C('q') | bit3C('w') | bit3C('s') | bit3C('<') | bit3C('>') |
    14                                          bit3C('B') | bit3C('P') | bit3C('Q') | bit3C('W') | bit3C('S') | bit3C('N') | bit3C('X');
    1516
    // (Removed in r5787.) Reports whether c is one of the escape letters recorded
    // in the setEscapeCharacters bit mask. The mask is indexed by (c - 0x3C); the
    // range check 0x3C..0x7B keeps that index within 0..63 so the shift stays
    // inside a 64-bit word.
    // NOTE(review): bit3C() is defined outside this view; presumably bit3C(x) sets
    // bit (x - 0x3C) -- confirm against re_parser_helper.h.
    16     bool RE_Parser_ERE::isSetEscapeChar(char c) {
    17         return c >= 0x3C && c <= 0x7B && ((setEscapeCharacters >> (c - 0x3C)) & 1) == 1;
    18     }
     // Parses one ERE item at the cursor and returns it, or returns nullptr when
     // no item starts here: the input is exhausted, the cursor is at one of
     // "*?+{|", or the cursor is at ')' while a group is open (mGroupsOpen > 0).
     // Otherwise the first accept() that succeeds selects the production; any
     // other character falls through to a literal code point passed to createCC().
     // NOTE(review): at()/atany() appear to peek and accept() to consume on a
     // match (the helpers are defined outside this view) -- confirm.
     17RE * RE_Parser_ERE::parse_next_item() {
     18    if (mCursor.noMore() || atany("*?+{|")) return nullptr;
     // A ')' only terminates an item sequence while inside a group; with no open
     // group it is not caught here and reaches the later branches.
     19    else if ((mGroupsOpen > 0) && at(')')) return nullptr;
     20    else if (accept('^')) return makeStart();
     21    else if (accept('$')) return makeEnd();
     22    else if (accept('.')) return makeAny();
     23    else if (accept('(')) return parse_group();
     24    else if (accept('[')) return parse_bracket_expr();
     25    else if (accept('\\')) return parse_escaped();
     26    else return createCC(parse_literal_codepoint());
     27}
    1928
    20     inline bool RE_Parser_ERE::isUnsupportChartsetOperator(char c) {
    21         switch (c) {
    22             case '\\':
    23                 return true;
    24             default:
    25                 return false;
     29// A parenthesized group.  Input precondition: the opening ( has been consumed
     // Parses the group body with parse_alt(), wraps it in a capture named
     // "\" followed by the capture number, records that capture in mNameMap under
     // the key ("", name) so parse_escaped() can resolve back references to it,
     // and requires a closing ')'. Returns the memoized capture.
     30RE * RE_Parser_ERE::parse_group() {
     31    // Capturing paren group.
     // mGroupsOpen is what makes parse_next_item() stop at ')' while the body
     // of this group is being parsed.
     32    mGroupsOpen++;
     33    RE * captured = parse_alt();
     // NOTE(review): the capture number is assigned after the body has been
     // parsed. If parse_alt() recurses into nested groups (not visible here),
     // an inner group receives a lower number than its enclosing group, i.e.
     // numbering follows closing rather than opening parentheses -- confirm
     // whether that is intended.
     34    mCaptureGroupCount++;
     35    std::string captureName = "\\" + std::to_string(mCaptureGroupCount);
     36    Name * const capture  = mMemoizer.memoize(makeCapture(captureName, captured));
     37    auto key = std::make_pair("", captureName);
     38    mNameMap.insert(std::make_pair(std::move(key), capture));
     // NOTE(review): mGroupsOpen is not decremented on the failure path;
     // presumably ParseFailure() does not return -- confirm.
     39    if (!accept(')')) ParseFailure("Closing parenthesis required.");
     40    mGroupsOpen--;
     41    return capture;
     42}
     43
     // Parses the remainder of an escape sequence. parse_next_item() calls this
     // right after accepting the backslash, so the cursor is on the character
     // that follows it. Handles \b \B \s \S \< \>, a single-digit back reference,
     // and otherwise treats the escaped character as a literal code point.
     44RE * RE_Parser_ERE::parse_escaped() {
     45    if (accept('b')) return makeWordBoundary();
     46    if (accept('B')) return makeWordNonBoundary();
     47    if (accept('s')) return makeWhitespaceSet();
     48    if (accept('S')) return makeComplement(makeWhitespaceSet());
     49    if (accept('<')) return makeWordBegin();
     50    if (accept('>')) return makeWordEnd();
     // NOTE(review): *mCursor is read without a visible end-of-input check and
     // is passed to isdigit() directly; confirm behavior when the pattern ends
     // with a backslash and for bytes outside the ASCII range.
     51    if (isdigit(*mCursor)) {
     52        mCursor++;
     // The two characters ending at the cursor are taken as the reference
     // name: given the calling convention above that is the backslash plus
     // the digit, the same "\N" form parse_group() uses as a capture name.
     // Only one digit is consumed, so only single-digit references are read.
     53        std::string backref = std::string(mCursor.pos()-2, mCursor.pos());
     54        auto key = std::make_pair("", backref);
     55        auto f = mNameMap.find(key);
     56        if (f != mNameMap.end()) {
     57            return makeReference(backref, f->second);
     58        }
     59        else {
     // NOTE(review): no return follows this call; presumably
     // ParseFailure() does not return -- confirm.
     60            ParseFailure("Back reference " + backref + " without prior capture group.");
    2661        }
    2762    }
     63    else {
     64        return createCC(parse_literal_codepoint());
     65    }
    2866}
     67
     68
     69// Parsing items within a bracket expression.
     70// Items represent individual characters or sets of characters.
     71// Ranges may be formed by individual character items separated by '-'.
     // Called after the opening '[' has been accepted (see parse_next_item() and
     // the recursive call below). An optional leading '^' negates the set. Items
     // are collected until ']' or end of input, combined with makeAlt(), and the
     // closing ']' is then required. Returns the alternation, or its complement
     // when negated.
     72RE * RE_Parser_ERE::parse_bracket_expr () {
     73    bool negated = accept('^');
     74    std::vector<RE *> items;
     // do/while: the first item is parsed before the ']' test, so a ']' in
     // first position is handed to parse_literal_codepoint() instead of ending
     // the expression.
     75    do {
     76        if (accept('[')) {
     // A nested '[' introduces "[=" (equivalence class), "[." (collation
     // element) or "[:" (POSIX class); any other continuation is parsed
     // as a nested bracket expression.
     77            if (accept('=')) items.push_back(parse_equivalence_class());
     78            else if (accept('.')) items.push_back(range_extend(parse_collation_element()));
     79            else if (accept(':')) items.push_back(parse_Posix_class());
     80            else items.push_back(parse_bracket_expr());
     81        } else {
     // NOTE(review): range_extend() is defined outside this view;
     // presumably it turns the item into a range when a '-' and an upper
     // bound follow, and otherwise returns it unchanged -- confirm.
     82            items.push_back(range_extend(makeCC(parse_literal_codepoint())));
     83        }
     84    } while (mCursor.more() && !at(']'));
     85    RE * t = makeAlt(items.begin(), items.end());
     86    if (!accept(']')) ParseFailure("Expecting ]");
     87    if (negated) return makeComplement(t);
     88    else return t;
     89}
     90}
Note: See TracChangeset for help on using the changeset viewer.