Changeset 5554 for icGREP/icgrep-devel


Ignore:
Timestamp:
Jul 7, 2017, 2:27:56 PM (2 years ago)
Author:
cameron
Message:

-enable-byte-mode initial check-in

Location:
icGREP/icgrep-devel/icgrep
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/icgrep.cpp

    r5550 r5554  
      41   41   static cl::opt<int> Threads("t", cl::desc("Total number of threads."), cl::init(1));
      42   42
           43 + static cl::opt<bool> ByteMode("enable-byte-mode", cl::desc("Process regular expressions in byte mode"));
           44 +
      43   45   static cl::opt<bool> MultiGrepKernels("enable-multigrep-kernels", cl::desc("Construct separated kernels for each regular expression"));
      44   46   static cl::opt<int> REsPerGroup("re-num", cl::desc("Number of regular expressions processed by each kernel."), cl::init(1));
     
      75   77       std::vector<re::RE *> REs;
      76   78       for (unsigned i = 0; i < grep::RegexpVector.size(); i++) {
      77      -         re::RE * re_ast = re::RE_Parser::parse(grep::RegexpVector[i], globalFlags, grep::RegexpSyntax);
           79 +         re::RE * re_ast = re::RE_Parser::parse(grep::RegexpVector[i], globalFlags, grep::RegexpSyntax, ByteMode);
      78   80           REs.push_back(re_ast);
      79   81       }
  • icGREP/icgrep-devel/icgrep/re/re_parser.cpp

    r5537 r5554  
      42   42   }
      43   43
      44      - RE * RE_Parser::parse(const std::string & regular_expression, ModeFlagSet initialFlags, RE_Syntax syntax) {
           44 + RE * RE_Parser::parse(const std::string & regular_expression, ModeFlagSet initialFlags, RE_Syntax syntax, bool ByteMode) {
      45   45       std::unique_ptr<RE_Parser> parser = nullptr;
      46   46       switch (syntax) {
     
      62   62               break;
      63   63       }
           64 +     parser->fByteMode = ByteMode;
      64   65       parser->fModeFlagSet = initialFlags;
      65   66       parser->fNested = false;
     
      74   75
      75   76   RE_Parser::RE_Parser(const std::string & regular_expression)
      76      - : fModeFlagSet(0)
           77 + : fByteMode(false)
           78 + , fModeFlagSet(0)
      77   79   , fNested(false)
      78   80   , fGraphemeBoundaryPending(false)
     
     164  166               case ']':
     165  167                   if (LEGACY_UNESCAPED_RBRAK_RBRACE_ALLOWED) {
     166      -                     return createCC(parse_utf8_codepoint());
          168 +                     return createCC(parse_literal_codepoint());
     167  169                   }
     168  170                   ParseFailure("Use  \\] for literal ].");
     
     171  173                       break;  //  a recursive invocation for a regexp in \N{...}
     172  174                   } else if (LEGACY_UNESCAPED_RBRAK_RBRACE_ALLOWED) {
     173      -                     return createCC(parse_utf8_codepoint());
          175 +                     return createCC(parse_literal_codepoint());
     174  176                   }
     175  177                   ParseFailure("Use \\} for literal }.");
     
     188  190                   return parse_escaped();
     189  191               default:
     190      -                 re = createCC(parse_utf8_codepoint());
          192 +                 re = createCC(parse_literal_codepoint());
     191  193                   if ((fModeFlagSet & ModeFlagType::GRAPHEME_CLUSTER_MODE) != 0) {
     192  194                       fGraphemeBoundaryPending = true;
     
     534  536   void InvalidUTF8Encoding() {
     535  537       RE_Parser::ParseFailure("Invalid UTF-8 encoding!");
          538 + }
          539 +
          540 + codepoint_t RE_Parser::parse_literal_codepoint() {
          541 +     if (fByteMode) {
          542 +        return static_cast<uint8_t>(*mCursor++);
          543 +     }
          544 +     else return parse_utf8_codepoint();
     536  545   }
     537  546
     
     909  918                   break;
     910  919               case emptyOperator:
     911      -                 lastCodepointItem = parse_utf8_codepoint();
          920 +                 lastCodepointItem = parse_literal_codepoint();
     912  921                   insert(cc, lastCodepointItem);
     913  922                   lastItemKind = CodepointItem;
     
     924  933           return parse_escaped_codepoint();
     925  934       } else {
     926      -         return parse_utf8_codepoint();
          935 +         return parse_literal_codepoint();
     927  936       }
     928  937   }
     
    1002 1011               // Escaped letters should be reserved for special functions.
    1003 1012               if (((*mCursor >= 'A') && (*mCursor <= 'Z')) || ((*mCursor >= 'a') && (*mCursor <= 'z'))){
    1004      -                 //Escape unknow letter will be parse as normal letter
    1005      -                 return parse_utf8_codepoint();
         1013 +                 //Escape unknown letter will be parse as normal letter
         1014 +                 return parse_literal_codepoint();
    1006 1015                   //ParseFailure("Undefined or unsupported escape sequence");
    1007 1016               }
  • icGREP/icgrep-devel/icgrep/re/re_parser.h

    r5267 r5554  
      40   40       static LLVM_ATTRIBUTE_NORETURN void ParseFailure(std::string errmsg);
      41   41
      42      -     static RE * parse(const std::string &input_string, ModeFlagSet initialFlags, RE_Syntax syntax = RE_Syntax::PCRE);
           42 +     static RE * parse(const std::string &input_string, ModeFlagSet initialFlags, RE_Syntax syntax = RE_Syntax::PCRE, bool ByteMode = false);
      43   43
      44   44   protected:
     
     136  136       virtual RE * parseEscapedSet();
     137  137
          138 +     codepoint_t parse_literal_codepoint();
          139 +
     138  140       codepoint_t parse_utf8_codepoint();
     139  141
     
     180  182
     181  183   protected:
     182      -
          184 +     bool                        fByteMode;
     183  185       ModeFlagSet                 fModeFlagSet;
     184  186       bool                        fNested;
Note: See TracChangeset for help on using the changeset viewer.