Changeset 5180


Ignore:
Timestamp:
Oct 4, 2016, 3:26:12 PM (12 months ago)
Author:
xwa163
Message:

Support BRE and ERE for regex syntax.

Location:
icGREP/icgrep-devel/icgrep
Files:
7 added
4 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/CMakeLists.txt

    r5179 r5180  
    6464add_library(PabloADT ${PABLO_SRC})
    6565add_library(RegExpADT re/re_re.cpp re/re_cc.cpp re/re_rep.cpp re/re_diff.cpp re/re_intersect.cpp re/printer_re.cpp)
    66 add_library(RegExpCompiler re/re_parser.cpp re/re_nullable.cpp re/re_simplifier.cpp re/re_compiler.cpp re/re_analysis.cpp re/re_toolchain.cpp re/re_name_resolve.cpp)
     66add_library(RegExpCompiler re/re_parser.cpp re/re_nullable.cpp re/re_simplifier.cpp re/re_compiler.cpp re/re_analysis.cpp re/re_toolchain.cpp re/re_name_resolve.cpp re/re_parser_pcre.cpp re/re_parser_ere.cpp re/re_parser_bre.cpp)
    6767add_library(CCADT cc/cc_compiler.cpp utf8_encoder.cpp utf16_encoder.cpp UCD/CaseFolding_txt.cpp)
    6868add_library(UCDlib UCD/unicode_set.cpp UCD/ucd_compiler.cpp UCD/PropertyObjects.cpp UCD/resolve_properties.cpp UCD/UnicodeNameData.cpp)
  • icGREP/icgrep-devel/icgrep/icgrep.cpp

    r5167 r5180  
    3232static cl::OptionCategory LegacyGrepOptions("A. Standard Grep Options",
    3333                                       "These are standard grep options intended for compatibility with typical grep usage.");
    34 enum RE_Syntax {FixedStrings, BRE, ERE, PCRE};
    3534
    3635#ifdef FUTURE
    3736static cl::OptionCategory RegexpOptions("Regular Expression Interpretation", "These options control regular expression interpretation");
    38 static cl::opt<RE_Syntax> RegexpSyntax(cl::desc("Regular expression syntax:"),
     37static cl::opt<re::RE_Syntax> RegexpSyntax(cl::desc("Regular expression syntax:"),
    3938    cl::values(
    40         clEnumValN(FixedStrings, "F", "Fixed strings, separated by newlines"),
    41         clEnumValN(BRE, "G", "Posix basic regular expression (BRE) syntax"),
    42         clEnumValN(ERE, "E", "Posix extened regular expression (ERE) syntax"),
    43         clEnumValN(PCRE, "P", "Perl-compatible regular expression (PCRE) syntax - default"),
    44                clEnumValEnd), cl::cat(LegacyGrepOptions), cl::Grouping, cl::init(PCRE));
     39        clEnumValN(re::RE_Syntax::FixedStrings, "F", "Fixed strings, separated by newlines"),
     40        clEnumValN(re::RE_Syntax::BRE, "G", "Posix basic regular expression (BRE) syntax"),
     41        clEnumValN(re::RE_Syntax::ERE, "E", "Posix extended regular expression (ERE) syntax"),
     42        clEnumValN(re::RE_Syntax::PCRE, "P", "Perl-compatible regular expression (PCRE) syntax - default"),
     43               clEnumValEnd), cl::cat(LegacyGrepOptions), cl::Grouping, cl::init(re::RE_Syntax::PCRE));
    4544#endif
    4645
     
    120119    re::RE * re_ast = nullptr;
    121120    for (unsigned i = 0; i < regexVector.size(); i++) {
     121#ifdef FUTURE
     122        re_ast = re::RE_Parser::parse(regexVector[i], globalFlags, RegexpSyntax);
     123#else
    122124        re_ast = re::RE_Parser::parse(regexVector[i], globalFlags);
     125#endif
    123126        REs.push_back(re_ast);
    124127        allREs += regexVector[i] + "\n";
     
    181184// Filters out the command line strings that shouldn't be passed on to Grep
    182185bool isArgUnwantedForGrep(char *argument) {
     186#ifdef FUTURE
     187    std::vector<std::string> unwantedFlags = {"-n", "-P", "-G", "-E"};
     188#else
    183189    std::vector<std::string> unwantedFlags = {"-n"};
    184 
    185     for (unsigned i = 0; i < inputFiles.size(); ++i){
     190#endif
     191
     192    for (unsigned i = 0; i < unwantedFlags.size(); ++i){
    186193        if (strcmp(argument, unwantedFlags[i].c_str()) == 0) {
    187194            return true;
     
    240247    }
    241248
     249#ifdef FUTURE
     250    switch (RegexpSyntax) {
     251        case re::RE_Syntax::BRE:
     252            grepArguments.append("\"-G\" ");
     253            break;
     254        case re::RE_Syntax::ERE:
     255            grepArguments.append("\"-E\" ");
     256            break;
     257        case re::RE_Syntax::PCRE:
     258            grepArguments.append("\"-P\" ");
     259            break;
     260        default:
     261            //TODO: handle the FixedStrings syntax
     262            break;
     263    }
     264#endif
     265
    242266    std::string systemCall = argv[0];
    243267    systemCall.append(" ");
    244268    systemCall.append(icGrepArguments);
    245269    systemCall.append(" ");
     270#ifdef FUTURE
     271    systemCall.append(" | grep --color=always ");
     272#else
    246273    systemCall.append(" | grep --color=always -P ");
     274#endif
    247275    systemCall.append(grepArguments);
    248276
     
    296324    cl::ParseCommandLineOptions(argc, argv);
    297325#ifdef FUTURE
    298     if (RegexpSyntax != RE_Syntax::PCRE) {
    299         llvm::report_fatal_error("Sorry, only PCRE syntax is fully supported\n.");
     326    if (RegexpSyntax == re::RE_Syntax::FixedStrings) {
     327        llvm::report_fatal_error("Sorry, FixedStrings syntax is not fully supported\n.");
    300328    }
    301329#endif
  • icGREP/icgrep-devel/icgrep/re/re_parser.cpp

    r5161 r5180  
    66
    77#include <re/re_parser.h>
     8#include <re/re_parser_helper.h>
     9#include <re/re_parser_pcre.h>
     10#include <re/re_parser_ere.h>
     11#include <re/re_parser_bre.h>
    812#include <re/re_name.h>
    913#include <re/re_alt.h>
     
    2327#include <algorithm>
    2428
    25 // It would probably be best to enforce that {}, [], () must always
    26 // be balanced.   But legacy syntax allows } and ] to occur as literals
    27 // in certain contexts (no opening { or [, or immediately after [ or [^ ).
    28 // Perhaps this define should become a parameter.
    29 #define LEGACY_UNESCAPED_RBRAK_RBRACE_ALLOWED true
    30 #define LEGACY_UNESCAPED_HYPHEN_ALLOWED true
    31 
    32 
    33 
    34 
    3529namespace re {
    3630   
    3731
    38 RE * RE_Parser::parse(const std::string & regular_expression, ModeFlagSet initialFlags) {
    39     RE_Parser parser(regular_expression);
    40     parser.fModeFlagSet = initialFlags;
    41     parser.fNested = false;
    42     parser.fGraphemeBoundaryPending = false;
    43     parser.mCaptureGroupCount = 0;
    44     RE * re = parser.parse_RE();
     32RE * RE_Parser::parse(const std::string & regular_expression, ModeFlagSet initialFlags, RE_Syntax syntax) {
     33    std::unique_ptr<RE_Parser> parser = nullptr;
     34
     35    switch (syntax) {
     36        case RE_Syntax::PCRE:
     37            parser = llvm::make_unique<RE_Parser_PCRE>(regular_expression);
     38            break;
     39        case RE_Syntax::ERE:
     40            parser = llvm::make_unique<RE_Parser_ERE>(regular_expression);
     41            break;
     42        case RE_Syntax ::BRE:
     43            parser = llvm::make_unique<RE_Parser_BRE>(regular_expression);
     44            break;
     45        default:
     46            //TODO: handle the FixedStrings syntax
     47            ParseFailure("Unsupport RE syntax!");
     48            break;
     49    }
     50
     51
     52    parser->fModeFlagSet = initialFlags;
     53    parser->fNested = false;
     54    parser->fGraphemeBoundaryPending = false;
     55    parser->mCaptureGroupCount = 0;
     56    RE * re = parser->parse_RE();
    4557    if (re == nullptr) {
    4658        ParseFailure("An unexpected parsing error occurred!");
     
    5365    , fNested(false)
    5466    , fGraphemeBoundaryPending(false)
     67    , fSupportNonCaptureGroup(false)
    5568    , mCursor(regular_expression)
    5669    , mCaptureGroupCount(0)
     
    164177    const ModeFlagSet modeFlagSet = fModeFlagSet;
    165178    RE * group_expr = nullptr;
    166     if (*mCursor == '?') {
     179    if (*mCursor == '?' && fSupportNonCaptureGroup) {
    167180        switch (*++mCursor) {
    168181            case '#':  // comment
     
    302315inline std::pair<int, int> RE_Parser::parse_range_bound() {
    303316    int lower_bound = 0, upper_bound = 0;
    304     if (*++mCursor == ',') {
    305         ++mCursor;
    306     } else {
     317    if (*++mCursor != ',') {
    307318        lower_bound = parse_int();
    308319    }
     
    332343
    333344
    334 #define bit3C(x) (1ULL << ((x) - 0x3C))
    335345const uint64_t setEscapeCharacters = bit3C('b') | bit3C('p') | bit3C('q') | bit3C('d') | bit3C('w') | bit3C('s') | bit3C('<') | bit3C('>') |
    336346                                     bit3C('B') | bit3C('P') | bit3C('Q') | bit3C('D') | bit3C('W') | bit3C('S') | bit3C('N') | bit3C('X');
    337347
    338 inline bool isSetEscapeChar(char c) {
     348inline bool RE_Parser::isSetEscapeChar(char c) {
    339349    return c >= 0x3C && c <= 0x7B && ((setEscapeCharacters >> (c - 0x3C)) & 1) == 1;
    340350}
     
    582592}
    583593
     594inline bool RE_Parser::isUnsupportChartsetOperator(char c) {
     595    return false;
     596}
     597
    584598CharsetOperatorKind RE_Parser::getCharsetOperator() {
     599    if (isUnsupportChartsetOperator(*mCursor)) {
     600        return emptyOperator;
     601    }
    585602    switch (*mCursor) {
    586603        case '&':
     
    876893        default:
    877894            // Escaped letters should be reserved for special functions.
    878             if (((*mCursor >= 'A') && (*mCursor <= 'Z')) || ((*mCursor >= 'a') && (*mCursor <= 'z')))
    879                 ParseFailure("Undefined or unsupported escape sequence");
     895            if (((*mCursor >= 'A') && (*mCursor <= 'Z')) || ((*mCursor >= 'a') && (*mCursor <= 'z'))){
     896                // An escaped letter with no special meaning is parsed as the literal letter.
     897                return parse_utf8_codepoint();
     898                //ParseFailure("Undefined or unsupported escape sequence");
     899            }
    880900            else if ((*mCursor < 0x20) || (*mCursor >= 0x7F))
    881901                ParseFailure("Illegal escape sequence");
  • icGREP/icgrep-devel/icgrep/re/re_parser.h

    r5161 r5180  
    2121namespace re {
    2222
     23enum RE_Syntax {FixedStrings, BRE, ERE, PCRE};
    2324enum CharsetOperatorKind
    2425    {intersectOp, setDiffOp, ampChar, hyphenChar, rangeHyphen, posixPropertyOpener, setOpener, setCloser, backSlash, emptyOperator};
     
    4445public:
    4546
    46     static RE * parse(const std::string &input_string, ModeFlagSet initialFlags);
     47    static RE * parse(const std::string &input_string, ModeFlagSet initialFlags, RE_Syntax syntax = RE_Syntax::PCRE);
    4748
    4849   
     
    5152    }
    5253   
    53 private:
     54protected:
    5455    using NameMap = std::map<std::pair<std::string, std::string>, re::Name *>;
    5556
     
    116117    RE_Parser(const std::string & regular_expression, ModeFlagSet initialFlags);
    117118
    118     RE * parse_RE();
     119    virtual RE * parse_RE();
    119120
    120     RE * parse_alt();
     121    virtual RE * parse_alt();
    121122
    122123    RE * parse_seq();
    123124
    124     RE * parse_next_item();
     125    virtual RE * parse_next_item();
    125126
    126     RE * parse_group();
     127    virtual RE * parse_group();
    127128
    128     RE * extend_item(RE * re);
     129    virtual bool isSetEscapeChar(char c);
     130
     131    virtual RE * extend_item(RE * re);
    129132
    130133    RE * parseGraphemeBoundary(RE * re);
    131134
    132     std::pair<int, int> parse_range_bound();
     135    virtual std::pair<int, int> parse_range_bound();
    133136
    134137    unsigned parse_int();
    135138
    136     RE * parse_escaped();
     139    virtual RE * parse_escaped();
    137140
    138     RE * parseEscapedSet();
     141    virtual RE * parseEscapedSet();
    139142
    140143    codepoint_t parse_utf8_codepoint();
    141144
    142     RE * parsePropertyExpression();
     145    virtual RE * parsePropertyExpression();
    143146
    144147    Name * parseNamePatternExpression();
     
    157160    Name * createName(std::string && prop, std::string && value);
    158161
     162    virtual bool isUnsupportChartsetOperator(char c);
    159163    CharsetOperatorKind getCharsetOperator();
    160164
     
    163167    codepoint_t parse_codepoint();
    164168
    165     codepoint_t parse_escaped_codepoint();
     169    virtual codepoint_t parse_escaped_codepoint();
    166170
    167171    codepoint_t parse_hex_codepoint(int mindigits, int maxdigits);
     
    176180    static std::string canonicalize(const cursor_t begin, const cursor_t end);
    177181
    178 private:
     182protected:
    179183
    180184    ModeFlagSet                 fModeFlagSet;
    181185    bool                        fNested;
    182186    bool                        fGraphemeBoundaryPending;
     187    bool                        fSupportNonCaptureGroup;
    183188    Cursor                      mCursor;
    184189    unsigned                    mCaptureGroupCount;
Note: See TracChangeset for help on using the changeset viewer.