Ignore:
Timestamp:
Oct 4, 2016, 3:26:12 PM (3 years ago)
Author:
xwa163
Message:

Support BRE and ERE for regex syntax.

Location:
icGREP/icgrep-devel/icgrep/re
Files:
7 added
2 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/re/re_parser.cpp

    r5161 r5180  
    66
    77#include <re/re_parser.h>
     8#include <re/re_parser_helper.h>
     9#include <re/re_parser_pcre.h>
     10#include <re/re_parser_ere.h>
     11#include <re/re_parser_bre.h>
    812#include <re/re_name.h>
    913#include <re/re_alt.h>
     
    2327#include <algorithm>
    2428
    25 // It would probably be best to enforce that {}, [], () must always
    26 // be balanced.   But legacy syntax allows } and ] to occur as literals
    27 // in certain contexts (no opening { or [, or immediately after [ or [^ ).
    28 // Perhaps this define should become a parameter.
    29 #define LEGACY_UNESCAPED_RBRAK_RBRACE_ALLOWED true
    30 #define LEGACY_UNESCAPED_HYPHEN_ALLOWED true
    31 
    32 
    33 
    34 
    3529namespace re {
    3630   
    3731
    38 RE * RE_Parser::parse(const std::string & regular_expression, ModeFlagSet initialFlags) {
    39     RE_Parser parser(regular_expression);
    40     parser.fModeFlagSet = initialFlags;
    41     parser.fNested = false;
    42     parser.fGraphemeBoundaryPending = false;
    43     parser.mCaptureGroupCount = 0;
    44     RE * re = parser.parse_RE();
     32RE * RE_Parser::parse(const std::string & regular_expression, ModeFlagSet initialFlags, RE_Syntax syntax) {
     33    std::unique_ptr<RE_Parser> parser = nullptr;
     34
     35    switch (syntax) {
     36        case RE_Syntax::PCRE:
     37            parser = llvm::make_unique<RE_Parser_PCRE>(regular_expression);
     38            break;
     39        case RE_Syntax::ERE:
     40            parser = llvm::make_unique<RE_Parser_ERE>(regular_expression);
     41            break;
     42        case RE_Syntax ::BRE:
     43            parser = llvm::make_unique<RE_Parser_BRE>(regular_expression);
     44            break;
     45        default:
     46            //TODO handle FixString
     47            ParseFailure("Unsupport RE syntax!");
     48            break;
     49    }
     50
     51
     52    parser->fModeFlagSet = initialFlags;
     53    parser->fNested = false;
     54    parser->fGraphemeBoundaryPending = false;
     55    parser->mCaptureGroupCount = 0;
     56    RE * re = parser->parse_RE();
    4557    if (re == nullptr) {
    4658        ParseFailure("An unexpected parsing error occurred!");
     
    5365    , fNested(false)
    5466    , fGraphemeBoundaryPending(false)
     67    , fSupportNonCaptureGroup(false)
    5568    , mCursor(regular_expression)
    5669    , mCaptureGroupCount(0)
     
    164177    const ModeFlagSet modeFlagSet = fModeFlagSet;
    165178    RE * group_expr = nullptr;
    166     if (*mCursor == '?') {
     179    if (*mCursor == '?' && fSupportNonCaptureGroup) {
    167180        switch (*++mCursor) {
    168181            case '#':  // comment
     
    302315inline std::pair<int, int> RE_Parser::parse_range_bound() {
    303316    int lower_bound = 0, upper_bound = 0;
    304     if (*++mCursor == ',') {
    305         ++mCursor;
    306     } else {
     317    if (*++mCursor != ',') {
    307318        lower_bound = parse_int();
    308319    }
     
    332343
    333344
    334 #define bit3C(x) (1ULL << ((x) - 0x3C))
    335345const uint64_t setEscapeCharacters = bit3C('b') | bit3C('p') | bit3C('q') | bit3C('d') | bit3C('w') | bit3C('s') | bit3C('<') | bit3C('>') |
    336346                                     bit3C('B') | bit3C('P') | bit3C('Q') | bit3C('D') | bit3C('W') | bit3C('S') | bit3C('N') | bit3C('X');
    337347
    338 inline bool isSetEscapeChar(char c) {
     348inline bool RE_Parser::isSetEscapeChar(char c) {
    339349    return c >= 0x3C && c <= 0x7B && ((setEscapeCharacters >> (c - 0x3C)) & 1) == 1;
    340350}
     
    582592}
    583593
     594inline bool RE_Parser::isUnsupportChartsetOperator(char c) {
     595    return false;
     596}
     597
    584598CharsetOperatorKind RE_Parser::getCharsetOperator() {
     599    if (isUnsupportChartsetOperator(*mCursor)) {
     600        return emptyOperator;
     601    }
    585602    switch (*mCursor) {
    586603        case '&':
     
    876893        default:
    877894            // Escaped letters should be reserved for special functions.
    878             if (((*mCursor >= 'A') && (*mCursor <= 'Z')) || ((*mCursor >= 'a') && (*mCursor <= 'z')))
    879                 ParseFailure("Undefined or unsupported escape sequence");
     895            if (((*mCursor >= 'A') && (*mCursor <= 'Z')) || ((*mCursor >= 'a') && (*mCursor <= 'z'))){
     896                //Escape unknow letter will be parse as normal letter
     897                return parse_utf8_codepoint();
     898                //ParseFailure("Undefined or unsupported escape sequence");
     899            }
    880900            else if ((*mCursor < 0x20) || (*mCursor >= 0x7F))
    881901                ParseFailure("Illegal escape sequence");
  • icGREP/icgrep-devel/icgrep/re/re_parser.h

    r5161 r5180  
    2121namespace re {
    2222
     23enum RE_Syntax {FixedStrings, BRE, ERE, PCRE};
    2324enum CharsetOperatorKind
    2425    {intersectOp, setDiffOp, ampChar, hyphenChar, rangeHyphen, posixPropertyOpener, setOpener, setCloser, backSlash, emptyOperator};
     
    4445public:
    4546
    46     static RE * parse(const std::string &input_string, ModeFlagSet initialFlags);
     47    static RE * parse(const std::string &input_string, ModeFlagSet initialFlags, RE_Syntax syntax = RE_Syntax::PCRE);
    4748
    4849   
     
    5152    }
    5253   
    53 private:
     54protected:
    5455    using NameMap = std::map<std::pair<std::string, std::string>, re::Name *>;
    5556
     
    116117    RE_Parser(const std::string & regular_expression, ModeFlagSet initialFlags);
    117118
    118     RE * parse_RE();
     119    virtual RE * parse_RE();
    119120
    120     RE * parse_alt();
     121    virtual RE * parse_alt();
    121122
    122123    RE * parse_seq();
    123124
    124     RE * parse_next_item();
     125    virtual RE * parse_next_item();
    125126
    126     RE * parse_group();
     127    virtual RE * parse_group();
    127128
    128     RE * extend_item(RE * re);
     129    virtual bool isSetEscapeChar(char c);
     130
     131    virtual RE * extend_item(RE * re);
    129132
    130133    RE * parseGraphemeBoundary(RE * re);
    131134
    132     std::pair<int, int> parse_range_bound();
     135    virtual std::pair<int, int> parse_range_bound();
    133136
    134137    unsigned parse_int();
    135138
    136     RE * parse_escaped();
     139    virtual RE * parse_escaped();
    137140
    138     RE * parseEscapedSet();
     141    virtual RE * parseEscapedSet();
    139142
    140143    codepoint_t parse_utf8_codepoint();
    141144
    142     RE * parsePropertyExpression();
     145    virtual RE * parsePropertyExpression();
    143146
    144147    Name * parseNamePatternExpression();
     
    157160    Name * createName(std::string && prop, std::string && value);
    158161
     162    virtual bool isUnsupportChartsetOperator(char c);
    159163    CharsetOperatorKind getCharsetOperator();
    160164
     
    163167    codepoint_t parse_codepoint();
    164168
    165     codepoint_t parse_escaped_codepoint();
     169    virtual codepoint_t parse_escaped_codepoint();
    166170
    167171    codepoint_t parse_hex_codepoint(int mindigits, int maxdigits);
     
    176180    static std::string canonicalize(const cursor_t begin, const cursor_t end);
    177181
    178 private:
     182protected:
    179183
    180184    ModeFlagSet                 fModeFlagSet;
    181185    bool                        fNested;
    182186    bool                        fGraphemeBoundaryPending;
     187    bool                        fSupportNonCaptureGroup;
    183188    Cursor                      mCursor;
    184189    unsigned                    mCaptureGroupCount;
Note: See TracChangeset for help on using the changeset viewer.