Changeset 5789
- Timestamp:
- Dec 19, 2017, 12:17:52 PM (14 months ago)
- Location:
- icGREP/icgrep-devel/icgrep/re
- Files:
-
- 5 edited
Legend:
- Unmodified
- Added
- Removed
-
icGREP/icgrep-devel/icgrep/re/re_parser.cpp
r5787 r5789 76 76 , fNested(false) 77 77 , mGroupsOpen(0) 78 , fSupportNonCaptureGroup(false)79 78 , mCursor(regular_expression) 80 79 , mCaptureGroupCount(0) … … 151 150 else return createCC(parse_literal_codepoint()); 152 151 } 153 152 153 154 RE * RE_Parser::parse_mode_group(bool & closing_paren_parsed) { 155 const ModeFlagSet savedModeFlagSet = fModeFlagSet; 156 while (mCursor.more() && !atany(":)")) { 157 bool negateMode = accept('-'); 158 ModeFlagType modeBit; 159 switch (*mCursor) { 160 case 'i': modeBit = CASE_INSENSITIVE_MODE_FLAG; break; 161 case 'g': modeBit = GRAPHEME_CLUSTER_MODE; break; 162 case 'm': modeBit = MULTILINE_MODE_FLAG; break; 163 //case 's': modeBit = DOTALL_MODE_FLAG; break; 164 case 'x': modeBit = IGNORE_SPACE_MODE_FLAG; break; 165 case 'd': modeBit = UNIX_LINES_MODE_FLAG; break; 166 default: ParseFailure("Unsupported mode flag."); 167 } 168 ++mCursor; 169 if (negateMode) { 170 fModeFlagSet &= ~modeBit; 171 negateMode = false; // for next flag 172 } else { 173 fModeFlagSet |= modeBit; 174 } 175 } 176 if (accept(':')) { 177 RE * group_expr = parse_alt(); 178 auto changed = fModeFlagSet ^ savedModeFlagSet; 179 if ((changed & CASE_INSENSITIVE_MODE_FLAG) != 0) { 180 group_expr = makeGroup(Group::Mode::CaseInsensitiveMode, group_expr, 181 (fModeFlagSet & CASE_INSENSITIVE_MODE_FLAG) == 0 ? Group::Sense::Off : Group::Sense::On); 182 } 183 if ((changed & GRAPHEME_CLUSTER_MODE) != 0) { 184 group_expr = makeGroup(Group::Mode::GraphemeMode, group_expr, 185 (fModeFlagSet & GRAPHEME_CLUSTER_MODE) == 0 ? Group::Sense::Off : Group::Sense::On); 186 } 187 fModeFlagSet = savedModeFlagSet; 188 closing_paren_parsed = false; 189 return group_expr; 190 } else { // if *_cursor == ')' 191 require(')'); 192 closing_paren_parsed = true; 193 auto changed = fModeFlagSet ^ savedModeFlagSet; 194 if ((changed & (CASE_INSENSITIVE_MODE_FLAG|GRAPHEME_CLUSTER_MODE)) != 0) { 195 RE * group_expr = parse_seq(); 196 if ((changed & CASE_INSENSITIVE_MODE_FLAG) != 0) { 197 group_expr = makeGroup(Group::Mode::CaseInsensitiveMode, group_expr, 198 (fModeFlagSet & CASE_INSENSITIVE_MODE_FLAG) == 0 ? Group::Sense::Off : Group::Sense::On); 199 } 200 if ((changed & GRAPHEME_CLUSTER_MODE) != 0) { 201 group_expr = makeGroup(Group::Mode::GraphemeMode, group_expr, 202 (fModeFlagSet & GRAPHEME_CLUSTER_MODE) == 0 ? Group::Sense::Off : Group::Sense::On); 203 } 204 return group_expr; 205 } 206 else return makeSeq(); 207 } 208 209 } 154 210 155 211 // Parse some kind of parenthesized group. Input precondition: mCursor 156 212 // after the ( 157 213 RE * RE_Parser::parse_group() { 158 const ModeFlagSet savedModeFlagSet = fModeFlagSet;159 214 mGroupsOpen++; 160 215 RE * group_expr = nullptr; 161 if (*mCursor == '?' && fSupportNonCaptureGroup) { 162 switch (*++mCursor) { 163 case '#': // comment 164 while (*++mCursor != ')'); 165 ++mCursor; 166 return parse_next_item(); 167 case ':': // Non-capturing paren 168 ++mCursor; 169 group_expr = parse_alt(); 170 break; 171 case '=': 172 ++mCursor; 173 group_expr = makeLookAheadAssertion(parse_alt()); 174 break; 175 case '!': 176 ++mCursor; 177 group_expr = makeNegativeLookAheadAssertion(parse_alt()); 178 break; 179 case '>': 180 ++mCursor; 181 group_expr = makeAtomicGroup(parse_alt()); 182 break; 183 case '|': 184 ++mCursor; 185 group_expr = makeBranchResetGroup(parse_alt()); 186 break; 187 case '<': 188 ++mCursor; 189 if (*mCursor == '=') { 190 ++mCursor; 191 group_expr = makeLookBehindAssertion(parse_alt()); 192 } 193 else if (*mCursor == '!') { 194 ++mCursor; 195 group_expr = makeNegativeLookBehindAssertion(parse_alt()); 196 } else { 197 ParseFailure("Illegal lookbehind assertion syntax."); 198 } 199 break; 200 case '-': case 'd' : case 'i': case 'm': case 's': case 'x': case 'g': 201 while (*mCursor != ')' && *mCursor != ':') { 202 bool negateMode = false; 203 ModeFlagType modeBit; 204 if (*mCursor == '-') { 205 negateMode = true; 206 ++mCursor; 207 } 208 switch (*mCursor) { 209 case 'i': modeBit = CASE_INSENSITIVE_MODE_FLAG; break; 210 case 'g': modeBit = GRAPHEME_CLUSTER_MODE; break; 211 case 'm': modeBit = MULTILINE_MODE_FLAG; break; 212 //case 's': modeBit = DOTALL_MODE_FLAG; break; 213 case 'x': modeBit = IGNORE_SPACE_MODE_FLAG; break; 214 case 'd': modeBit = UNIX_LINES_MODE_FLAG; break; 215 default: ParseFailure("Unsupported mode flag."); 216 } 217 ++mCursor; 218 if (negateMode) { 219 fModeFlagSet &= ~modeBit; 220 negateMode = false; // for next flag 221 } else { 222 fModeFlagSet |= modeBit; 223 } 224 } 225 if (*mCursor == ':') { 226 ++mCursor; 227 group_expr = parse_alt(); 228 auto changed = fModeFlagSet ^ savedModeFlagSet; 229 if ((changed & CASE_INSENSITIVE_MODE_FLAG) != 0) { 230 group_expr = makeGroup(Group::Mode::CaseInsensitiveMode, group_expr, 231 (fModeFlagSet & CASE_INSENSITIVE_MODE_FLAG) == 0 ? Group::Sense::Off : Group::Sense::On); 232 } 233 if ((changed & GRAPHEME_CLUSTER_MODE) != 0) { 234 group_expr = makeGroup(Group::Mode::GraphemeMode, group_expr, 235 (fModeFlagSet & GRAPHEME_CLUSTER_MODE) == 0 ? Group::Sense::Off : Group::Sense::On); 236 } 237 fModeFlagSet = savedModeFlagSet; 238 break; 239 } else { // if *_cursor == ')' 240 ++mCursor; 241 auto changed = fModeFlagSet ^ savedModeFlagSet; 242 if ((changed & (CASE_INSENSITIVE_MODE_FLAG|GRAPHEME_CLUSTER_MODE)) != 0) { 243 group_expr = parse_seq(); 244 if ((changed & CASE_INSENSITIVE_MODE_FLAG) != 0) { 245 group_expr = makeGroup(Group::Mode::CaseInsensitiveMode, group_expr, 246 (fModeFlagSet & CASE_INSENSITIVE_MODE_FLAG) == 0 ? Group::Sense::Off : Group::Sense::On); 247 } 248 if ((changed & GRAPHEME_CLUSTER_MODE) != 0) { 249 group_expr = makeGroup(Group::Mode::GraphemeMode, group_expr, 250 (fModeFlagSet & GRAPHEME_CLUSTER_MODE) == 0 ? Group::Sense::Off : Group::Sense::On); 251 } 252 return group_expr; 253 } 254 else return parse_next_item(); 255 } 256 default: 257 ParseFailure("Illegal (? syntax."); 216 if (accept('?')) { 217 if (accept('#')) { 218 while (mCursor.more() && !at(')')) ++mCursor; 219 group_expr = makeSeq(); 220 } else if (accept(':')) { // Non-capturing paren 221 group_expr = parse_alt(); 222 } else if (accept('=')) { // positive look ahead 223 group_expr = makeLookAheadAssertion(parse_alt()); 224 } else if (accept('!')) { // negative look ahead 225 group_expr = makeNegativeLookAheadAssertion(parse_alt()); 226 } else if (accept("<=")) { // positive look ahead 227 group_expr = makeLookBehindAssertion(parse_alt()); 228 } else if (accept("<!")) { // negative look ahead 229 group_expr = makeNegativeLookBehindAssertion(parse_alt()); 230 } else if (accept('>')) { // negative look ahead 231 group_expr = makeAtomicGroup(parse_alt()); 232 } else if (accept('|')) { // negative look ahead 233 group_expr = makeBranchResetGroup(parse_alt()); 234 } else if (atany("-dimsxg")) { // mode switches 235 bool closing_paren_parsed; 236 group_expr = parse_mode_group(closing_paren_parsed); 237 if (closing_paren_parsed) { 238 mGroupsOpen--; 239 return group_expr; 240 } 241 } else { 242 ParseFailure("Illegal (? syntax."); 258 243 } 259 244 } else { // Capturing paren group. 260 RE * captured = parse_alt(); 261 mCaptureGroupCount++; 262 std::string captureName = "\\" + std::to_string(mCaptureGroupCount); 263 Name * const capture = mMemoizer.memoize(makeCapture(captureName, captured)); 264 auto key = std::make_pair("", captureName); 265 mNameMap.insert(std::make_pair(std::move(key), capture)); 266 group_expr = capture; 267 } 268 if (*mCursor != ')') { 269 ParseFailure("Closing parenthesis required."); 270 } 245 group_expr = parse_capture_body(); 246 } 247 require(')'); 271 248 mGroupsOpen--; 272 ++mCursor;273 249 return group_expr; 274 250 } 275 276 // TODO: Set ENABLE_EXTENDED_QUANTIFIERS false for ERE, BRE 251 252 RE * RE_Parser::parse_capture_body() { 253 RE * captured = parse_alt(); 254 mCaptureGroupCount++; 255 std::string captureName = "\\" + std::to_string(mCaptureGroupCount); 256 Name * const capture = mMemoizer.memoize(makeCapture(captureName, captured)); 257 auto key = std::make_pair("", captureName); 258 mNameMap.insert(std::make_pair(std::move(key), capture)); 259 return capture; 260 } 261 262 RE * RE_Parser::parse_back_reference() { 263 mCursor++; 264 std::string backref = std::string(mCursor.pos()-2, mCursor.pos()); 265 auto key = std::make_pair("", backref); 266 auto f = mNameMap.find(key); 267 if (f != mNameMap.end()) { 268 return makeReference(backref, f->second); 269 } 270 else { 271 ParseFailure("Back reference " + backref + " without prior capture group."); 272 } 273 } 274 277 275 #define ENABLE_EXTENDED_QUANTIFIERS true 278 276 … … 315 313 lb = parse_int(); 316 314 if (accept('}')) return std::make_pair(lb, lb); 317 if (!accept(',')) ParseFailure("Expecting , or }");315 else require(','); 318 316 if (accept('}')) return std::make_pair(lb, Rep::UNBOUNDED_REP); 319 317 ub = parse_int(); 320 318 if (ub < lb) ParseFailure("Upper bound less than lower bound"); 321 319 } 322 if (accept('}')) return std::make_pair(lb, ub);323 else ParseFailure("Expecting }");320 require('}'); 321 return std::make_pair(lb, ub); 324 322 } 325 323 326 324 unsigned RE_Parser::parse_int() { 327 325 unsigned value = 0; 326 if (!isdigit(*mCursor)) ParseFailure("Expecting integer"); 328 327 while (isdigit(*mCursor)) { 329 328 value *= 10; … … 347 346 return parseEscapedSet(); 348 347 } 349 else if ( (*mCursor == 'x') || (*mCursor == 'o') || (*mCursor == '0')) {348 else if (atany("xo0")) { 350 349 codepoint_t cp = parse_escaped_codepoint(); 351 350 if ((cp >= 0x80) && (cp <= 0xFF)) { … … 355 354 } 356 355 else if (isdigit(*mCursor)) { 357 mCursor++; 358 std::string backref = std::string(mCursor.pos()-2, mCursor.pos()); 359 auto key = std::make_pair("", backref); 360 auto f = mNameMap.find(key); 361 if (f != mNameMap.end()) { 362 return makeReference(backref, f->second); 363 } 364 else { 365 ParseFailure("Back reference " + backref + " without prior capture group."); 366 } 356 return parse_back_reference(); 367 357 } 368 358 else { … … 625 615 626 616 Name * RE_Parser::parseNamePatternExpression(){ 627 628 if (!accept('{')) ParseFailure("Expecting { after \\N"); 617 require('{'); 629 618 const auto start = mCursor.pos(); 630 619 while (mCursor.more()) { … … 641 630 } 642 631 std::string nameRegexp = "/(?i)" + std::string(start, mCursor.pos()); 643 if (!accept('}')) ParseFailure("Expecting } after \\N{...");632 require('}'); 644 633 return createName("na", nameRegexp); 645 634 } … … 663 652 else have_new_expr = false; 664 653 } 665 if (!accept(']')) ParseFailure("Expecting ]");654 require(']'); 666 655 if (negated) return makeComplement(t1); 667 656 else return t1; … … 715 704 } 716 705 std::string name = std::string(start, mCursor.pos()); 717 if (!accept("=]")) ParseFailure("Posix equivalence class improperly terminated.");706 require("=]"); 718 707 return createName(name); 719 708 } … … 724 713 } 725 714 std::string name = std::string(start, mCursor.pos()); 726 if (!accept(".]")) ParseFailure("Posix equivalence class improperly terminated.");715 require(".]"); 727 716 return createName(name); 728 717 } … … 731 720 bool negated = accept('^'); 732 721 RE * posixSet = parsePropertyExpression(); 733 if (!accept(":]")) ParseFailure("Posix set expression improperly terminated.");722 require(":]"); 734 723 if (negated) return makeComplement(posixSet); 735 724 else return posixSet; … … 777 766 if (!accept('{')) ParseFailure("Malformed octal escape sequence"); 778 767 cp_value = parse_octal_codepoint(1, 7); 779 if (!accept('}')) ParseFailure("Malformed octal escape sequence");768 require('}'); 780 769 return cp_value; 781 770 } else if (accept('x')) { 782 771 if (!accept('{')) return parse_hex_codepoint(1,2); // ICU compatibility 783 772 cp_value = parse_hex_codepoint(1, 6); 784 if (!accept('}')) ParseFailure("Malformed hex escape sequence");773 require('}'); 785 774 return cp_value; 786 775 } else if (accept('u')) { 787 776 if (!accept('{')) return parse_hex_codepoint(4,4); // ICU compatibility 788 777 cp_value = parse_hex_codepoint(1, 6); 789 if (!accept('}')) ParseFailure("Malformed hex escape sequence");778 require('}'); 790 779 return cp_value; 791 780 } else if (accept('U')) { -
icGREP/icgrep-devel/icgrep/re/re_parser.h
r5787 r5789 8 8 #define RE_PARSER_H 9 9 10 #include <map> // for map 11 #include <re/re_memoizer.hpp> // for Memoizer 12 #include "re/re_cc.h" // for codepoint_t, CC (ptr only) 10 #include <map> 11 #include <re/re_memoizer.hpp> 12 #include "re/re_cc.h" 13 13 14 namespace re { class Name; } 14 15 … … 17 18 enum RE_Syntax {FixedStrings, BRE, ERE, PCRE, PROSITE}; 18 19 19 enum CharsetOperatorKind20 {intersectOp, setDiffOp, ampChar, hyphenChar, rangeHyphen, posixPropertyOpener, setOpener, setCloser, backSlash, emptyOperator};21 22 20 enum ModeFlagType : unsigned { 23 21 DEFAULT_MODE = 0, … … 116 114 } 117 115 116 inline void require(char c) { 117 if (!accept(c)) { 118 if (mCursor.noMore()) ParseFailure("Expecting " + std::string(1, c) + " but end of input encountered"); 119 ParseFailure("Expecting " + std::string(1, c) + " but " + std::string(1, *mCursor) + " encountered"); 120 } 121 } 122 118 123 inline bool atany(std::string s) { 119 124 if (mCursor.noMore()) return false; … … 134 139 135 140 inline bool accept(std::string s) { 141 Cursor tmp = mCursor; 136 142 for (unsigned i = 0; i < s.length(); i++) { 137 if (mCursor.noMore() || (s[i] != *mCursor)) return false; 138 mCursor++; 139 } 143 if (tmp.noMore() || (s[i] != *tmp)) return false; 144 tmp++; 145 } 146 mCursor = tmp; 140 147 return true; 141 148 } 142 149 143 150 inline void require(std::string s) { 151 if (!accept(s)) { 152 if (mCursor.noMore()) ParseFailure("Expecting " + s + " but end of input encountered"); 153 unsigned long rem = mCursor.remaining(); 154 ParseFailure("Expecting " + s + " but " + std::string(mCursor.pos(), mCursor.pos() + std::min(rem, s.length())) + " encountered"); 155 } 156 } 157 144 158 RE_Parser(const std::string & regular_expression); 145 159 … … 155 169 156 170 virtual RE * parse_group(); 157 171 172 RE * parse_mode_group(bool & closing_paren_parsed); 173 174 RE * parse_capture_body(); 175 176 RE * parse_back_reference(); 177 158 178 virtual bool isSetEscapeChar(char c); 159 179 … … 223 243 bool fNested; 224 244 unsigned mGroupsOpen; 225 bool fSupportNonCaptureGroup;226 245 Cursor mCursor; 227 246 unsigned mCaptureGroupCount; -
icGREP/icgrep-devel/icgrep/re/re_parser_bre.cpp
r5787 r5789 13 13 #include <re/re_assertion.h> 14 14 #include <re/re_rep.h> 15 #include <llvm/Support/raw_ostream.h> 15 16 16 17 … … 29 30 RE * RE_Parser_BRE::parse_seq() { 30 31 std::vector<RE *> seq; 31 if (!mCursor.more() || at("\\|") || ((mGroupsOpen > 0) && at("\\)"))) return makeSeq();32 if (!mCursor.more() || at("\\|") || at("\\)")) return makeSeq(); 32 33 for (;;) { 33 34 RE * re = parse_next_item(); … … 54 55 } 55 56 56 // A parenthesized group. Input precondition: the opening( has been consumed57 // A parenthesized capture group. Input precondition: the opening \( has been consumed 57 58 RE * RE_Parser_BRE::parse_group() { 58 // Capturing paren group.59 59 mGroupsOpen++; 60 RE * captured = parse_alt(); 61 mCaptureGroupCount++; 62 std::string captureName = "\\" + std::to_string(mCaptureGroupCount); 63 Name * const capture = mMemoizer.memoize(makeCapture(captureName, captured)); 64 auto key = std::make_pair("", captureName); 65 mNameMap.insert(std::make_pair(std::move(key), capture)); 66 if (!accept("\\)")) ParseFailure("Closing parenthesis required."); 60 RE * captured = parse_capture_body(); 61 require("\\)"); 67 62 mGroupsOpen--; 68 return capture ;63 return captured; 69 64 } 70 65 … … 93 88 lb = parse_int(); 94 89 if (accept("\\}")) return std::make_pair(lb, lb); 95 if (!accept(',')) ParseFailure("Expecting , or }");90 else require(','); 96 91 if (accept("\\}")) return std::make_pair(lb, Rep::UNBOUNDED_REP); 97 92 ub = parse_int(); 98 93 if (ub < lb) ParseFailure("Upper bound less than lower bound"); 99 94 } 100 if (accept("\\}")) return std::make_pair(lb, ub);101 else ParseFailure("Expecting \\}");95 require("\\}"); 96 return std::make_pair(lb, ub); 102 97 } 103 98 -
icGREP/icgrep-devel/icgrep/re/re_parser_ere.cpp
r5787 r5789 27 27 } 28 28 29 // A parenthesized group. Input precondition: the opening ( has been consumed29 // A parenthesized capture group. Input precondition: the opening ( has been consumed 30 30 RE * RE_Parser_ERE::parse_group() { 31 // Capturing paren group.32 31 mGroupsOpen++; 33 RE * captured = parse_alt(); 34 mCaptureGroupCount++; 35 std::string captureName = "\\" + std::to_string(mCaptureGroupCount); 36 Name * const capture = mMemoizer.memoize(makeCapture(captureName, captured)); 37 auto key = std::make_pair("", captureName); 38 mNameMap.insert(std::make_pair(std::move(key), capture)); 39 if (!accept(')')) ParseFailure("Closing parenthesis required."); 32 RE * captured = parse_capture_body(); 33 require(')'); 40 34 mGroupsOpen--; 41 return capture ;35 return captured; 42 36 } 43 37 … … 49 43 if (accept('<')) return makeWordBegin(); 50 44 if (accept('>')) return makeWordEnd(); 51 if (isdigit(*mCursor)) { 52 mCursor++; 53 std::string backref = std::string(mCursor.pos()-2, mCursor.pos()); 54 auto key = std::make_pair("", backref); 55 auto f = mNameMap.find(key); 56 if (f != mNameMap.end()) { 57 return makeReference(backref, f->second); 58 } 59 else { 60 ParseFailure("Back reference " + backref + " without prior capture group."); 61 } 62 } 45 if (isdigit(*mCursor)) return parse_back_reference(); 63 46 else { 64 47 return createCC(parse_literal_codepoint()); … … 70 53 // Items represent individual characters or sets of characters. 71 54 // Ranges may be formed by individual character items separated by '-'. 55 // Note that there are no backslash escapes for ERE or BRE bracket expressions. 72 56 RE * RE_Parser_ERE::parse_bracket_expr () { 73 57 bool negated = accept('^'); … … 84 68 } while (mCursor.more() && !at(']')); 85 69 RE * t = makeAlt(items.begin(), items.end()); 86 if (!accept(']')) ParseFailure("Expecting ]");70 require(']'); 87 71 if (negated) return makeComplement(t); 88 72 else return t; -
icGREP/icgrep-devel/icgrep/re/re_parser_pcre.h
r5267 r5789 14 14 public: 15 15 RE_Parser_PCRE(const std::string & regular_expression) : RE_Parser(regular_expression) { 16 fSupportNonCaptureGroup = true;17 16 mReSyntax = RE_Syntax ::PCRE; 18 17 }
Note: See TracChangeset
for help on using the changeset viewer.