[3850] | 1 | /* |
---|
[5769] | 2 | * Copyright (c) 2017 International Characters. |
---|
[3850] | 3 | * This software is licensed to the public under the Open Software License 3.0. |
---|
| 4 | * icgrep is a trademark of International Characters. |
---|
| 5 | */ |
---|
| 6 | |
---|
[4255] | 7 | #include <re/re_parser.h> |
---|
[5180] | 8 | #include <re/re_parser_helper.h> |
---|
| 9 | #include <re/re_parser_pcre.h> |
---|
| 10 | #include <re/re_parser_ere.h> |
---|
| 11 | #include <re/re_parser_bre.h> |
---|
[5218] | 12 | #include <re/re_parser_prosite.h> |
---|
[5753] | 13 | #include <re/parse_fixed_strings.h> |
---|
[4307] | 14 | #include <re/re_name.h> |
---|
[4255] | 15 | #include <re/re_alt.h> |
---|
[5267] | 16 | #include <re/re_any.h> |
---|
[4255] | 17 | #include <re/re_end.h> |
---|
| 18 | #include <re/re_rep.h> |
---|
| 19 | #include <re/re_seq.h> |
---|
| 20 | #include <re/re_start.h> |
---|
[5769] | 21 | #include <re/re_range.h> |
---|
[4255] | 22 | #include <re/re_diff.h> |
---|
[4298] | 23 | #include <re/re_intersect.h> |
---|
[5769] | 24 | #include <re/re_group.h> |
---|
[4405] | 25 | #include <re/re_assertion.h> |
---|
[4806] | 26 | #include <re/printer_re.h> |
---|
[4671] | 27 | #include <sstream> |
---|
[5080] | 28 | #include <string> |
---|
[4182] | 29 | #include <algorithm> |
---|
[5241] | 30 | #include <iostream> |
---|
[5267] | 31 | #include <llvm/Support/Casting.h> |
---|
| 32 | #include <llvm/Support/ErrorHandling.h> |
---|
[5620] | 33 | #include <llvm/Support/raw_ostream.h> |
---|
[5698] | 34 | #include <llvm/ADT/STLExtras.h> // for make_unique |
---|
[3850] | 35 | |
---|
[5267] | 36 | using namespace llvm; |
---|
| 37 | |
---|
[5180] | 38 | namespace re { |
---|
[4309] | 39 | |
---|
[5267] | 40 | |
---|
[5554] | 41 | RE * RE_Parser::parse(const std::string & regular_expression, ModeFlagSet initialFlags, RE_Syntax syntax, bool ByteMode) { |
---|
[5620] | 42 | |
---|
[5180] | 43 | std::unique_ptr<RE_Parser> parser = nullptr; |
---|
| 44 | switch (syntax) { |
---|
| 45 | case RE_Syntax::PCRE: |
---|
[5267] | 46 | parser = make_unique<RE_Parser_PCRE>(regular_expression); |
---|
[5180] | 47 | break; |
---|
| 48 | case RE_Syntax::ERE: |
---|
[5267] | 49 | parser = make_unique<RE_Parser_ERE>(regular_expression); |
---|
[5180] | 50 | break; |
---|
| 51 | case RE_Syntax ::BRE: |
---|
[5267] | 52 | parser = make_unique<RE_Parser_BRE>(regular_expression); |
---|
[5180] | 53 | break; |
---|
[5218] | 54 | case RE_Syntax ::PROSITE: |
---|
[5267] | 55 | parser = make_unique<RE_Parser_PROSITE>(regular_expression); |
---|
[5218] | 56 | break; |
---|
[5180] | 57 | default: |
---|
[5753] | 58 | parser = make_unique<FixedStringParser>(regular_expression); |
---|
[5180] | 59 | break; |
---|
| 60 | } |
---|
[5554] | 61 | parser->fByteMode = ByteMode; |
---|
[5180] | 62 | parser->fModeFlagSet = initialFlags; |
---|
[5752] | 63 | parser->mGroupsOpen = 0; |
---|
[5180] | 64 | parser->fNested = false; |
---|
| 65 | parser->fGraphemeBoundaryPending = false; |
---|
| 66 | parser->mCaptureGroupCount = 0; |
---|
| 67 | RE * re = parser->parse_RE(); |
---|
[4182] | 68 | if (re == nullptr) { |
---|
[5161] | 69 | ParseFailure("An unexpected parsing error occurred!"); |
---|
[4182] | 70 | } |
---|
| 71 | return re; |
---|
| 72 | } |
---|
[3850] | 73 | |
---|
[5182] | 74 | RE_Parser::RE_Parser(const std::string & regular_expression) |
---|
[5554] | 75 | : fByteMode(false) |
---|
[5786] | 76 | , fModeFlagSet(MULTILINE_MODE_FLAG) |
---|
[5234] | 77 | , fNested(false) |
---|
[5752] | 78 | , mGroupsOpen(0) |
---|
[5234] | 79 | , fGraphemeBoundaryPending(false) |
---|
| 80 | , fSupportNonCaptureGroup(false) |
---|
| 81 | , mCursor(regular_expression) |
---|
| 82 | , mCaptureGroupCount(0) |
---|
| 83 | , mReSyntax(RE_Syntax::PCRE) |
---|
| 84 | { |
---|
[4798] | 85 | |
---|
[5234] | 86 | } |
---|
[4798] | 87 | |
---|
[4311] | 88 | RE * makeAtomicGroup(RE * r) { |
---|
[5161] | 89 | RE_Parser::ParseFailure("Atomic grouping not supported."); |
---|
[4311] | 90 | } |
---|
| 91 | |
---|
| 92 | RE * makeBranchResetGroup(RE * r) { |
---|
| 93 | // Branch reset groups only affect submatch numbering, but |
---|
| 94 | // this has no effect in icgrep. |
---|
| 95 | return r; |
---|
| 96 | } |
---|
[4798] | 97 | |
---|
[4311] | 98 | RE * RE_Parser::parse_RE() { |
---|
[4841] | 99 | return parse_alt(); |
---|
[4311] | 100 | } |
---|
| 101 | |
---|
| 102 | RE * RE_Parser::parse_alt() { |
---|
[4203] | 103 | std::vector<RE *> alt; |
---|
[5753] | 104 | do { |
---|
[5737] | 105 | alt.push_back(parse_seq()); |
---|
[3850] | 106 | } |
---|
[5754] | 107 | while (accept_alt_mark()); |
---|
[4203] | 108 | return makeAlt(alt.begin(), alt.end()); |
---|
[3850] | 109 | } |
---|
[5754] | 110 | |
---|
| 111 | bool RE_Parser::accept_alt_mark() { |
---|
| 112 | if (!mCursor.more() || (*mCursor != '|')) return false; |
---|
| 113 | mCursor++; |
---|
| 114 | return true; |
---|
| 115 | } |
---|
[3850] | 116 | |
---|
[5182] | 117 | RE * RE_Parser::parse_seq() { |
---|
[4203] | 118 | std::vector<RE *> seq; |
---|
[5752] | 119 | if (!mCursor.more() || (*mCursor == '|') || ((mGroupsOpen > 0) && (*mCursor == ')'))) return makeSeq(); |
---|
[4182] | 120 | for (;;) { |
---|
[4311] | 121 | RE * re = parse_next_item(); |
---|
[4182] | 122 | if (re == nullptr) { |
---|
| 123 | break; |
---|
[3850] | 124 | } |
---|
[4852] | 125 | re = extend_item(re); |
---|
| 126 | seq.push_back(re); |
---|
[3850] | 127 | } |
---|
[4249] | 128 | return makeSeq(seq.begin(), seq.end()); |
---|
[3850] | 129 | } |
---|
| 130 | |
---|
[4311] | 131 | RE * RE_Parser::parse_next_item() { |
---|
[5091] | 132 | RE * re = nullptr; |
---|
[5537] | 133 | if (fModeFlagSet & IGNORE_SPACE_MODE_FLAG) { |
---|
[5630] | 134 | while (mCursor.more() && *mCursor == ' ') { |
---|
| 135 | ++mCursor; |
---|
| 136 | } |
---|
[5537] | 137 | } |
---|
| 138 | if (mCursor.more()) { |
---|
[4829] | 139 | switch (*mCursor) { |
---|
[4182] | 140 | case '(': |
---|
[4829] | 141 | ++mCursor; |
---|
[4311] | 142 | return parse_group(); |
---|
[4182] | 143 | case '^': |
---|
[4829] | 144 | ++mCursor; |
---|
[5786] | 145 | if ((fModeFlagSet & ModeFlagType::MULTILINE_MODE_FLAG) == 0) { |
---|
| 146 | return makeZeroWidth("^s"); //single-line mode |
---|
| 147 | } |
---|
[5558] | 148 | if ((fModeFlagSet & ModeFlagType::UNIX_LINES_MODE_FLAG) != 0) { |
---|
| 149 | return makeNegativeLookBehindAssertion(makeByte(makeCC(makeCC(0, '\n'-1), makeCC('\n'+1, 0xFF)))); |
---|
| 150 | } |
---|
[4309] | 151 | return makeStart(); |
---|
[4182] | 152 | case '$': |
---|
[4829] | 153 | ++mCursor; |
---|
[5786] | 154 | if ((fModeFlagSet & ModeFlagType::MULTILINE_MODE_FLAG) == 0) { |
---|
| 155 | return makeZeroWidth("$s"); //single-line mode |
---|
| 156 | } |
---|
[5558] | 157 | if ((fModeFlagSet & ModeFlagType::UNIX_LINES_MODE_FLAG) != 0) { |
---|
| 158 | return makeLookAheadAssertion(makeCC('\n')); |
---|
| 159 | } |
---|
[4309] | 160 | return makeEnd(); |
---|
[5752] | 161 | case '|': |
---|
[4841] | 162 | break; |
---|
[5752] | 163 | case ')': |
---|
| 164 | if (mGroupsOpen > 0) break; |
---|
| 165 | if (LEGACY_UNESCAPED_RBRAK_RBRACE_ALLOWED) { |
---|
| 166 | return createCC(parse_literal_codepoint()); |
---|
| 167 | } |
---|
| 168 | ParseFailure("Use \\) for literal )."); |
---|
[4798] | 169 | case '*': case '+': case '?': case '{': |
---|
[5161] | 170 | ParseFailure("Need something to repeat before *, +, ? or {."); |
---|
[4798] | 171 | case ']': |
---|
[4310] | 172 | if (LEGACY_UNESCAPED_RBRAK_RBRACE_ALLOWED) { |
---|
[5554] | 173 | return createCC(parse_literal_codepoint()); |
---|
[4310] | 174 | } |
---|
[5161] | 175 | ParseFailure("Use \\] for literal ]."); |
---|
[4798] | 176 | case '}': |
---|
| 177 | if (fNested) { |
---|
[4841] | 178 | break; // a recursive invocation for a regexp in \N{...} |
---|
| 179 | } else if (LEGACY_UNESCAPED_RBRAK_RBRACE_ALLOWED) { |
---|
[5554] | 180 | return createCC(parse_literal_codepoint()); |
---|
[4798] | 181 | } |
---|
[5161] | 182 | ParseFailure("Use \\} for literal }."); |
---|
[4182] | 183 | case '[': |
---|
[4829] | 184 | mCursor++; |
---|
[5091] | 185 | re = parse_charset(); |
---|
[5630] | 186 | break; |
---|
[4182] | 187 | case '.': // the 'any' metacharacter |
---|
[4829] | 188 | mCursor++; |
---|
[4311] | 189 | return makeAny(); |
---|
[4310] | 190 | case '\\': // escape processing |
---|
[4829] | 191 | ++mCursor; |
---|
[4309] | 192 | return parse_escaped(); |
---|
[4182] | 193 | default: |
---|
[5554] | 194 | re = createCC(parse_literal_codepoint()); |
---|
[4182] | 195 | } |
---|
[3850] | 196 | } |
---|
[5630] | 197 | return re; |
---|
[4182] | 198 | } |
---|
[4798] | 199 | |
---|
| 200 | |
---|
[4841] | 201 | // Parse some kind of parenthesized group. Input precondition: mCursor |
---|
[4311] | 202 | // after the ( |
---|
| 203 | RE * RE_Parser::parse_group() { |
---|
[5752] | 204 | const ModeFlagSet savedModeFlagSet = fModeFlagSet; |
---|
| 205 | mGroupsOpen++; |
---|
[4841] | 206 | RE * group_expr = nullptr; |
---|
[5180] | 207 | if (*mCursor == '?' && fSupportNonCaptureGroup) { |
---|
[4829] | 208 | switch (*++mCursor) { |
---|
[4311] | 209 | case '#': // comment |
---|
[4841] | 210 | while (*++mCursor != ')'); |
---|
[4829] | 211 | ++mCursor; |
---|
[4311] | 212 | return parse_next_item(); |
---|
| 213 | case ':': // Non-capturing paren |
---|
[4829] | 214 | ++mCursor; |
---|
[4311] | 215 | group_expr = parse_alt(); |
---|
| 216 | break; |
---|
| 217 | case '=': |
---|
[4829] | 218 | ++mCursor; |
---|
[4841] | 219 | group_expr = makeLookAheadAssertion(parse_alt()); |
---|
[4311] | 220 | break; |
---|
| 221 | case '!': |
---|
[4829] | 222 | ++mCursor; |
---|
[4841] | 223 | group_expr = makeNegativeLookAheadAssertion(parse_alt()); |
---|
[4311] | 224 | break; |
---|
| 225 | case '>': |
---|
[4829] | 226 | ++mCursor; |
---|
[4841] | 227 | group_expr = makeAtomicGroup(parse_alt()); |
---|
[4311] | 228 | break; |
---|
| 229 | case '|': |
---|
[4829] | 230 | ++mCursor; |
---|
[4841] | 231 | group_expr = makeBranchResetGroup(parse_alt()); |
---|
[4311] | 232 | break; |
---|
| 233 | case '<': |
---|
[4829] | 234 | ++mCursor; |
---|
| 235 | if (*mCursor == '=') { |
---|
| 236 | ++mCursor; |
---|
[4841] | 237 | group_expr = makeLookBehindAssertion(parse_alt()); |
---|
[4311] | 238 | } |
---|
[4829] | 239 | else if (*mCursor == '!') { |
---|
| 240 | ++mCursor; |
---|
[4841] | 241 | group_expr = makeNegativeLookBehindAssertion(parse_alt()); |
---|
[4829] | 242 | } else { |
---|
[5161] | 243 | ParseFailure("Illegal lookbehind assertion syntax."); |
---|
[4311] | 244 | } |
---|
| 245 | break; |
---|
[4841] | 246 | case '-': case 'd' : case 'i': case 'm': case 's': case 'x': case 'g': |
---|
[4831] | 247 | while (*mCursor != ')' && *mCursor != ':') { |
---|
[4841] | 248 | bool negateMode = false; |
---|
| 249 | ModeFlagType modeBit; |
---|
[4829] | 250 | if (*mCursor == '-') { |
---|
[4311] | 251 | negateMode = true; |
---|
[4829] | 252 | ++mCursor; |
---|
[4311] | 253 | } |
---|
[4841] | 254 | switch (*mCursor) { |
---|
[4311] | 255 | case 'i': modeBit = CASE_INSENSITIVE_MODE_FLAG; break; |
---|
[4831] | 256 | case 'g': modeBit = GRAPHEME_CLUSTER_MODE; break; |
---|
[5537] | 257 | case 'm': modeBit = MULTILINE_MODE_FLAG; break; |
---|
[4429] | 258 | //case 's': modeBit = DOTALL_MODE_FLAG; break; |
---|
[5537] | 259 | case 'x': modeBit = IGNORE_SPACE_MODE_FLAG; break; |
---|
[5558] | 260 | case 'd': modeBit = UNIX_LINES_MODE_FLAG; break; |
---|
[5161] | 261 | default: ParseFailure("Unsupported mode flag."); |
---|
[4311] | 262 | } |
---|
[4841] | 263 | ++mCursor; |
---|
[4311] | 264 | if (negateMode) { |
---|
[4312] | 265 | fModeFlagSet &= ~modeBit; |
---|
[4311] | 266 | negateMode = false; // for next flag |
---|
[4829] | 267 | } else { |
---|
| 268 | fModeFlagSet |= modeBit; |
---|
[4311] | 269 | } |
---|
| 270 | } |
---|
[4829] | 271 | if (*mCursor == ':') { |
---|
| 272 | ++mCursor; |
---|
[4311] | 273 | group_expr = parse_alt(); |
---|
[5769] | 274 | auto changed = fModeFlagSet ^ savedModeFlagSet; |
---|
| 275 | if ((changed & CASE_INSENSITIVE_MODE_FLAG) != 0) { |
---|
| 276 | group_expr = makeGroup(Group::Mode::CaseInsensitiveMode, group_expr, |
---|
| 277 | (fModeFlagSet & CASE_INSENSITIVE_MODE_FLAG) == 0 ? Group::Sense::Off : Group::Sense::On); |
---|
| 278 | } |
---|
[5772] | 279 | if ((changed & GRAPHEME_CLUSTER_MODE) != 0) { |
---|
| 280 | group_expr = makeGroup(Group::Mode::GraphemeMode, group_expr, |
---|
| 281 | (fModeFlagSet & GRAPHEME_CLUSTER_MODE) == 0 ? Group::Sense::Off : Group::Sense::On); |
---|
| 282 | } |
---|
[5752] | 283 | fModeFlagSet = savedModeFlagSet; |
---|
[4841] | 284 | break; |
---|
[4829] | 285 | } else { // if *_cursor == ')' |
---|
| 286 | ++mCursor; |
---|
[5769] | 287 | auto changed = fModeFlagSet ^ savedModeFlagSet; |
---|
[5772] | 288 | if ((changed & (CASE_INSENSITIVE_MODE_FLAG|GRAPHEME_CLUSTER_MODE)) != 0) { |
---|
[5769] | 289 | group_expr = parse_seq(); |
---|
[5772] | 290 | if ((changed & CASE_INSENSITIVE_MODE_FLAG) != 0) { |
---|
| 291 | group_expr = makeGroup(Group::Mode::CaseInsensitiveMode, group_expr, |
---|
| 292 | (fModeFlagSet & CASE_INSENSITIVE_MODE_FLAG) == 0 ? Group::Sense::Off : Group::Sense::On); |
---|
| 293 | } |
---|
| 294 | if ((changed & GRAPHEME_CLUSTER_MODE) != 0) { |
---|
| 295 | group_expr = makeGroup(Group::Mode::GraphemeMode, group_expr, |
---|
| 296 | (fModeFlagSet & GRAPHEME_CLUSTER_MODE) == 0 ? Group::Sense::Off : Group::Sense::On); |
---|
| 297 | } |
---|
| 298 | return group_expr; |
---|
[5769] | 299 | } |
---|
| 300 | else return parse_next_item(); |
---|
[4311] | 301 | } |
---|
| 302 | default: |
---|
[5161] | 303 | ParseFailure("Illegal (? syntax."); |
---|
[4311] | 304 | } |
---|
[5080] | 305 | } else { // Capturing paren group. |
---|
| 306 | RE * captured = parse_alt(); |
---|
| 307 | mCaptureGroupCount++; |
---|
| 308 | std::string captureName = "\\" + std::to_string(mCaptureGroupCount); |
---|
| 309 | Name * const capture = mMemoizer.memoize(makeCapture(captureName, captured)); |
---|
| 310 | auto key = std::make_pair("", captureName); |
---|
| 311 | mNameMap.insert(std::make_pair(std::move(key), capture)); |
---|
| 312 | group_expr = capture; |
---|
[4311] | 313 | } |
---|
[4829] | 314 | if (*mCursor != ')') { |
---|
[5161] | 315 | ParseFailure("Closing parenthesis required."); |
---|
[4829] | 316 | } |
---|
[5752] | 317 | mGroupsOpen--; |
---|
[4829] | 318 | ++mCursor; |
---|
[4311] | 319 | return group_expr; |
---|
[3850] | 320 | } |
---|
[4798] | 321 | |
---|
[5515] | 322 | // TODO: Set ENABLE_EXTENDED_QUANTIFIERS false for ERE, BRE |
---|
| 323 | #define ENABLE_EXTENDED_QUANTIFIERS true |
---|
| 324 | |
---|
| 325 | // Extend a RE item with one or more quantifiers |
---|
[5182] | 326 | RE * RE_Parser::extend_item(RE * re) { |
---|
[5515] | 327 | if (LLVM_UNLIKELY(!(mCursor.more()))) return re; |
---|
| 328 | int lb = 0, ub = 0; |
---|
| 329 | switch (*mCursor) { |
---|
| 330 | case '*': |
---|
| 331 | lb = 0; |
---|
| 332 | ub = Rep::UNBOUNDED_REP; |
---|
| 333 | break; |
---|
| 334 | case '?': |
---|
| 335 | lb = 0; |
---|
| 336 | ub = 1; |
---|
| 337 | break; |
---|
| 338 | case '+': |
---|
| 339 | lb = 1; |
---|
| 340 | ub = Rep::UNBOUNDED_REP; |
---|
| 341 | break; |
---|
| 342 | case '{': |
---|
| 343 | std::tie(lb, ub) = parse_range_bound(); |
---|
[4841] | 344 | if ((ub != Rep::UNBOUNDED_REP) && (lb > ub)) { |
---|
[5161] | 345 | ParseFailure("Lower bound cannot exceed upper bound in bounded repetition"); |
---|
[4841] | 346 | } |
---|
[5515] | 347 | break; |
---|
| 348 | default: |
---|
| 349 | // No quantifiers |
---|
| 350 | return re; |
---|
| 351 | } |
---|
| 352 | ++mCursor; |
---|
| 353 | if (ENABLE_EXTENDED_QUANTIFIERS) { |
---|
| 354 | if (LLVM_UNLIKELY(!(mCursor.more()))) return makeRep(re, lb, ub);; |
---|
| 355 | if (*mCursor == '?') { // Non-greedy qualifier |
---|
| 356 | // Greedy vs. non-greedy makes no difference for icgrep. |
---|
[4831] | 357 | ++mCursor; |
---|
[5515] | 358 | re = makeRep(re, lb, ub); |
---|
| 359 | } else if (*mCursor == '+') { |
---|
| 360 | ++mCursor; |
---|
| 361 | if (ub == Rep::UNBOUNDED_REP) { |
---|
| 362 | re = makeSeq({makeRep(re, lb, ub), makeNegativeLookAheadAssertion(re)}); |
---|
| 363 | } else if (lb == ub) { |
---|
| 364 | re = makeRep(re, ub, ub); |
---|
| 365 | } else /* if (lb < ub) */{ |
---|
| 366 | re = makeAlt({makeSeq({makeRep(re, lb, ub-1), makeNegativeLookAheadAssertion(re)}), makeRep(re, ub, ub)}); |
---|
[4841] | 367 | } |
---|
[5515] | 368 | } else { |
---|
[4841] | 369 | re = makeRep(re, lb, ub); |
---|
[5515] | 370 | } |
---|
| 371 | } else { |
---|
| 372 | re = makeRep(re, lb, ub); |
---|
[3850] | 373 | } |
---|
[5515] | 374 | // The quantified expression may be extended with a further quantifier, e,g., [a-z]{6,7}{2,3} |
---|
| 375 | return extend_item(re); |
---|
[4829] | 376 | } |
---|
| 377 | |
---|
[5182] | 378 | std::pair<int, int> RE_Parser::parse_range_bound() { |
---|
[4829] | 379 | int lower_bound = 0, upper_bound = 0; |
---|
[5180] | 380 | if (*++mCursor != ',') { |
---|
[4182] | 381 | lower_bound = parse_int(); |
---|
[3850] | 382 | } |
---|
[4829] | 383 | if (*mCursor == '}') { |
---|
[4306] | 384 | upper_bound = lower_bound; |
---|
[4829] | 385 | } else if (*mCursor != ',') { |
---|
[5161] | 386 | ParseFailure("Bad lower bound!"); |
---|
[4831] | 387 | } else if (*++mCursor == '}') { |
---|
| 388 | upper_bound = Rep::UNBOUNDED_REP; |
---|
| 389 | } else { |
---|
| 390 | upper_bound = parse_int(); |
---|
| 391 | if (*mCursor != '}') { |
---|
[5161] | 392 | ParseFailure("Bad upper bound!"); |
---|
[3850] | 393 | } |
---|
[4182] | 394 | } |
---|
[4829] | 395 | return std::make_pair(lower_bound, upper_bound); |
---|
[3850] | 396 | } |
---|
| 397 | |
---|
[4309] | 398 | unsigned RE_Parser::parse_int() { |
---|
| 399 | unsigned value = 0; |
---|
[4831] | 400 | while (isdigit(*mCursor)) { |
---|
[4309] | 401 | value *= 10; |
---|
[4831] | 402 | value += static_cast<int>(*mCursor++) - 48; |
---|
[4309] | 403 | } |
---|
| 404 | return value; |
---|
| 405 | } |
---|
| 406 | |
---|
[3850] | 407 | |
---|
[5132] | 408 | const uint64_t setEscapeCharacters = bit3C('b') | bit3C('p') | bit3C('q') | bit3C('d') | bit3C('w') | bit3C('s') | bit3C('<') | bit3C('>') | |
---|
| 409 | bit3C('B') | bit3C('P') | bit3C('Q') | bit3C('D') | bit3C('W') | bit3C('S') | bit3C('N') | bit3C('X'); |
---|
[4307] | 410 | |
---|
[5182] | 411 | bool RE_Parser::isSetEscapeChar(char c) { |
---|
[5132] | 412 | return c >= 0x3C && c <= 0x7B && ((setEscapeCharacters >> (c - 0x3C)) & 1) == 1; |
---|
[4307] | 413 | } |
---|
[5080] | 414 | |
---|
[4307] | 415 | |
---|
[5182] | 416 | RE * RE_Parser::parse_escaped() { |
---|
[5080] | 417 | |
---|
[4830] | 418 | if (isSetEscapeChar(*mCursor)) { |
---|
| 419 | return parseEscapedSet(); |
---|
| 420 | } |
---|
[5558] | 421 | else if ((*mCursor == 'x') || (*mCursor == 'o') || (*mCursor == '0')) { |
---|
| 422 | codepoint_t cp = parse_escaped_codepoint(); |
---|
| 423 | if ((cp >= 0x80) && (cp <= 0xFF)) { |
---|
| 424 | return makeByte(makeCC(cp)); |
---|
| 425 | } |
---|
| 426 | else return createCC(cp); |
---|
| 427 | } |
---|
[5080] | 428 | else if (isdigit(*mCursor)) { |
---|
| 429 | mCursor++; |
---|
| 430 | std::string backref = std::string(mCursor.pos()-2, mCursor.pos()); |
---|
| 431 | auto key = std::make_pair("", backref); |
---|
| 432 | auto f = mNameMap.find(key); |
---|
| 433 | if (f != mNameMap.end()) { |
---|
| 434 | return makeReference(backref, f->second); |
---|
| 435 | } |
---|
| 436 | else { |
---|
[5161] | 437 | ParseFailure("Back reference " + backref + " without prior capture group."); |
---|
[5080] | 438 | } |
---|
| 439 | } |
---|
[4830] | 440 | else { |
---|
| 441 | return createCC(parse_escaped_codepoint()); |
---|
| 442 | } |
---|
[4307] | 443 | } |
---|
[4830] | 444 | |
---|
[4671] | 445 | RE * RE_Parser::parseEscapedSet() { |
---|
[4310] | 446 | bool complemented = false; |
---|
[4829] | 447 | RE * re = nullptr; |
---|
| 448 | switch (*mCursor) { |
---|
[4830] | 449 | case 'B': complemented = true; |
---|
[4402] | 450 | case 'b': |
---|
[4831] | 451 | if (*++mCursor != '{') { |
---|
[4830] | 452 | return complemented ? makeWordNonBoundary() : makeWordBoundary(); |
---|
[4831] | 453 | } else { |
---|
[5206] | 454 | ++mCursor; |
---|
| 455 | if (isCharAhead('}')) { |
---|
| 456 | switch (*mCursor) { |
---|
| 457 | case 'g': |
---|
| 458 | re = complemented ? makeZeroWidth("NonGCB") : makeZeroWidth("GCB"); |
---|
| 459 | ++mCursor; |
---|
| 460 | ++mCursor; |
---|
| 461 | break; |
---|
| 462 | case 'w': ParseFailure("\\b{w} not yet supported."); |
---|
| 463 | case 'l': ParseFailure("\\b{l} not yet supported."); |
---|
| 464 | case 's': ParseFailure("\\b{s} not yet supported."); |
---|
| 465 | // default: ParseFailure("Unrecognized boundary assertion"); |
---|
| 466 | } |
---|
[4830] | 467 | } |
---|
[5206] | 468 | if (!re) { |
---|
| 469 | auto propExpr = parsePropertyExpression(); |
---|
| 470 | if (*mCursor++ != '}') { |
---|
| 471 | ParseFailure("Malformed boundary assertion"); |
---|
| 472 | } |
---|
| 473 | re = complemented ? makeReNonBoundary(propExpr) : makeReBoundary(propExpr); |
---|
[4830] | 474 | } |
---|
[4841] | 475 | return re; |
---|
[4830] | 476 | } |
---|
[4310] | 477 | case 'd': |
---|
[4829] | 478 | ++mCursor; |
---|
[4310] | 479 | return makeDigitSet(); |
---|
| 480 | case 'D': |
---|
[4829] | 481 | ++mCursor; |
---|
[4310] | 482 | return makeComplement(makeDigitSet()); |
---|
| 483 | case 's': |
---|
[4829] | 484 | ++mCursor; |
---|
[4310] | 485 | return makeWhitespaceSet(); |
---|
| 486 | case 'S': |
---|
[4829] | 487 | ++mCursor; |
---|
[4310] | 488 | return makeComplement(makeWhitespaceSet()); |
---|
| 489 | case 'w': |
---|
[4829] | 490 | ++mCursor; |
---|
[4310] | 491 | return makeWordSet(); |
---|
| 492 | case 'W': |
---|
[4829] | 493 | ++mCursor; |
---|
[4310] | 494 | return makeComplement(makeWordSet()); |
---|
[4829] | 495 | case 'Q': |
---|
| 496 | complemented = true; |
---|
| 497 | case 'q': |
---|
| 498 | if (*++mCursor != '{') { |
---|
[5752] | 499 | ParseFailure("Malformed grapheme cluster expression"); |
---|
[4829] | 500 | } |
---|
| 501 | ++mCursor; |
---|
[5161] | 502 | ParseFailure("Literal grapheme cluster expressions not yet supported."); |
---|
[4829] | 503 | if (*mCursor != '}') { |
---|
[5752] | 504 | ParseFailure("Malformed grapheme cluster expression"); |
---|
[4829] | 505 | } |
---|
| 506 | ++mCursor; |
---|
| 507 | return complemented ? makeComplement(re) : re; |
---|
[4182] | 508 | case 'P': |
---|
[4309] | 509 | complemented = true; |
---|
[4182] | 510 | case 'p': |
---|
[4829] | 511 | if (*++mCursor != '{') { |
---|
[5161] | 512 | ParseFailure("Malformed property expression"); |
---|
[4829] | 513 | } |
---|
| 514 | ++mCursor; |
---|
| 515 | re = parsePropertyExpression(); |
---|
| 516 | if (*mCursor != '}') { |
---|
[5161] | 517 | ParseFailure("Malformed property expression"); |
---|
[4829] | 518 | } |
---|
| 519 | ++mCursor; |
---|
| 520 | return complemented ? makeComplement(re) : re; |
---|
| 521 | case 'X': |
---|
| 522 | // \X is equivalent to ".+?\b{g}"; proceed the minimal number of characters (but at least one) |
---|
[4830] | 523 | // to get to the next extended grapheme cluster boundary. |
---|
[4829] | 524 | ++mCursor; |
---|
[5091] | 525 | return makeSeq({makeAny(), makeRep(makeSeq({makeZeroWidth("NonGCB"), makeAny()}), 0, Rep::UNBOUNDED_REP), makeZeroWidth("GCB")}); |
---|
[4798] | 526 | case 'N': |
---|
[4829] | 527 | if (*++mCursor != '{') { |
---|
[5161] | 528 | ParseFailure("Malformed \\N expression"); |
---|
[4829] | 529 | } |
---|
| 530 | ++mCursor; |
---|
| 531 | re = parseNamePatternExpression(); |
---|
| 532 | if (*mCursor != '}') { |
---|
[5161] | 533 | ParseFailure("Malformed \\N expression"); |
---|
[4829] | 534 | } |
---|
| 535 | ++mCursor; |
---|
[4852] | 536 | assert (re); |
---|
[4829] | 537 | return re; |
---|
[5132] | 538 | case '<': |
---|
| 539 | ++mCursor; |
---|
| 540 | return makeWordBegin(); |
---|
| 541 | case '>': |
---|
| 542 | ++mCursor; |
---|
| 543 | return makeWordEnd(); |
---|
[4310] | 544 | default: |
---|
[5161] | 545 | ParseFailure("Internal error"); |
---|
[3850] | 546 | } |
---|
| 547 | } |
---|
[5161] | 548 | |
---|
| 549 | void InvalidUTF8Encoding() { |
---|
| 550 | RE_Parser::ParseFailure("Invalid UTF-8 encoding!"); |
---|
| 551 | } |
---|
[3850] | 552 | |
---|
[5554] | 553 | codepoint_t RE_Parser::parse_literal_codepoint() { |
---|
| 554 | if (fByteMode) { |
---|
| 555 | return static_cast<uint8_t>(*mCursor++); |
---|
| 556 | } |
---|
| 557 | else return parse_utf8_codepoint(); |
---|
| 558 | } |
---|
| 559 | |
---|
[4311] | 560 | codepoint_t RE_Parser::parse_utf8_codepoint() { |
---|
[4332] | 561 | // Must cast to unsigned char to avoid sign extension. |
---|
[4829] | 562 | unsigned char pfx = static_cast<unsigned char>(*mCursor++); |
---|
[4333] | 563 | codepoint_t cp = pfx; |
---|
| 564 | if (pfx < 0x80) return cp; |
---|
[4829] | 565 | unsigned suffix_bytes = 0; |
---|
[4333] | 566 | if (pfx < 0xE0) { |
---|
| 567 | if (pfx < 0xC2) { // bare suffix or illegal prefix 0xC0 or 0xC2 |
---|
[5161] | 568 | InvalidUTF8Encoding(); |
---|
[3934] | 569 | } |
---|
[4333] | 570 | suffix_bytes = 1; |
---|
| 571 | cp &= 0x1F; |
---|
[4829] | 572 | } else if (pfx < 0xF0) { // [0xE0, 0xEF] |
---|
[4333] | 573 | cp &= 0x0F; |
---|
| 574 | suffix_bytes = 2; |
---|
[4829] | 575 | } else { // [0xF0, 0xFF] |
---|
[4333] | 576 | cp &= 0x0F; |
---|
| 577 | suffix_bytes = 3; |
---|
| 578 | } |
---|
| 579 | while (suffix_bytes--) { |
---|
[4829] | 580 | if (mCursor.noMore()) { |
---|
[5161] | 581 | InvalidUTF8Encoding(); |
---|
[3934] | 582 | } |
---|
[4829] | 583 | char_t sfx = *mCursor++; |
---|
[4333] | 584 | if ((sfx & 0xC0) != 0x80) { |
---|
[5161] | 585 | InvalidUTF8Encoding(); |
---|
[4333] | 586 | } |
---|
| 587 | cp = (cp << 6) | (sfx & 0x3F); |
---|
[3934] | 588 | } |
---|
[4333] | 589 | // It is an error if a 3-byte sequence is used to encode a codepoint < 0x800 |
---|
| 590 | // or a 4-byte sequence is used to encode a codepoint < 0x10000. |
---|
| 591 | if ((pfx == 0xE0 && cp < 0x800) || (pfx == 0xF0 && cp < 0x10000)) { |
---|
[5161] | 592 | InvalidUTF8Encoding(); |
---|
[4333] | 593 | } |
---|
[4798] | 594 | // It is an error if a 4-byte sequence is used to encode a codepoint |
---|
| 595 | // above the Unicode maximum. |
---|
[4812] | 596 | if (cp > UCD::UNICODE_MAX) { |
---|
[5161] | 597 | InvalidUTF8Encoding(); |
---|
[4194] | 598 | } |
---|
[4332] | 599 | return cp; |
---|
[3934] | 600 | } |
---|
| 601 | |
---|
[4671] | 602 | std::string RE_Parser::canonicalize(const cursor_t begin, const cursor_t end) { |
---|
| 603 | std::locale loc; |
---|
[4829] | 604 | std::stringstream s; |
---|
[4671] | 605 | for (auto i = begin; i != end; ++i) { |
---|
| 606 | switch (*i) { |
---|
| 607 | case '_': case ' ': case '-': |
---|
| 608 | break; |
---|
| 609 | default: |
---|
| 610 | s << std::tolower(*i, loc); |
---|
| 611 | } |
---|
| 612 | } |
---|
| 613 | return s.str(); |
---|
| 614 | } |
---|
| 615 | |
---|
[5206] | 616 | bool RE_Parser::isCharAhead(char c) { |
---|
| 617 | if (mCursor.remaining() < 2) { |
---|
| 618 | return false; |
---|
| 619 | } |
---|
| 620 | auto nextCursor = mCursor.pos() + 1; |
---|
| 621 | return *nextCursor == c; |
---|
| 622 | } |
---|
| 623 | |
---|
[4819] | 624 | RE * RE_Parser::parsePropertyExpression() { |
---|
[4829] | 625 | const auto start = mCursor.pos(); |
---|
| 626 | while (mCursor.more()) { |
---|
| 627 | bool done = false; |
---|
| 628 | switch (*mCursor) { |
---|
| 629 | case '}': case ':': case '=': |
---|
| 630 | done = true; |
---|
| 631 | } |
---|
| 632 | if (done) { |
---|
| 633 | break; |
---|
| 634 | } |
---|
| 635 | ++mCursor; |
---|
[4310] | 636 | } |
---|
[4829] | 637 | if (*mCursor == '=') { |
---|
[5206] | 638 | // We have a property-name = value expression |
---|
[4829] | 639 | const auto prop_end = mCursor.pos(); |
---|
| 640 | mCursor++; |
---|
[5206] | 641 | auto val_start = mCursor.pos(); |
---|
[5683] | 642 | if (*val_start == '/') { |
---|
| 643 | // property-value is another regex |
---|
| 644 | auto previous = val_start; |
---|
| 645 | auto current = (++mCursor).pos(); |
---|
| 646 | val_start = current; |
---|
| 647 | |
---|
| 648 | while (true) { |
---|
| 649 | if (*current == '/' && *previous != '\\') { |
---|
[5206] | 650 | break; |
---|
| 651 | } |
---|
[5683] | 652 | |
---|
| 653 | if (!mCursor.more()) { |
---|
| 654 | ParseFailure("Malformed property expression"); |
---|
| 655 | } |
---|
| 656 | |
---|
| 657 | previous = current; |
---|
| 658 | current = (++mCursor).pos(); |
---|
[4829] | 659 | } |
---|
[5683] | 660 | ++mCursor; |
---|
| 661 | //return parseRegexPropertyValue(canonicalize(start, prop_end), std::string(val_start, current)); |
---|
| 662 | return createName(canonicalize(start, prop_end), std::string(val_start-1, current)); |
---|
| 663 | } |
---|
| 664 | if (*val_start == '@') { |
---|
| 665 | // property-value is @property@ or @identity@ |
---|
[5206] | 666 | auto previous = val_start; |
---|
| 667 | auto current = (++mCursor).pos(); |
---|
| 668 | val_start = current; |
---|
[5683] | 669 | |
---|
[5206] | 670 | while (true) { |
---|
[5683] | 671 | if (*current == '@' && *previous != '\\') { |
---|
[5206] | 672 | break; |
---|
| 673 | } |
---|
[5683] | 674 | |
---|
[5206] | 675 | if (!mCursor.more()) { |
---|
| 676 | ParseFailure("Malformed property expression"); |
---|
| 677 | } |
---|
[5683] | 678 | |
---|
[5206] | 679 | previous = current; |
---|
| 680 | current = (++mCursor).pos(); |
---|
[4829] | 681 | } |
---|
| 682 | ++mCursor; |
---|
[5679] | 683 | //return parseRegexPropertyValue(canonicalize(start, prop_end), std::string(val_start, current)); |
---|
| 684 | return createName(canonicalize(start, prop_end), std::string(val_start-1, current)); |
---|
[4377] | 685 | } |
---|
[5683] | 686 | else { |
---|
| 687 | // property-value is normal string |
---|
| 688 | while (mCursor.more()) { |
---|
| 689 | bool done = false; |
---|
| 690 | switch (*mCursor) { |
---|
| 691 | case '}': case ':': |
---|
| 692 | done = true; |
---|
| 693 | } |
---|
| 694 | if (done) { |
---|
| 695 | break; |
---|
| 696 | } |
---|
| 697 | ++mCursor; |
---|
| 698 | } |
---|
| 699 | return createName(canonicalize(start, prop_end), std::string(val_start, mCursor.pos())); |
---|
| 700 | } |
---|
[4377] | 701 | } |
---|
[5037] | 702 | return createName(canonicalize(start, mCursor.pos())); |
---|
[3935] | 703 | } |
---|
[4671] | 704 | |
---|
[4939] | 705 | Name * RE_Parser::parseNamePatternExpression(){ |
---|
[4798] | 706 | |
---|
[5679] | 707 | const auto start = mCursor.pos(); |
---|
| 708 | while (mCursor.more()) { |
---|
| 709 | if (*mCursor == '\\') { |
---|
| 710 | ++mCursor; |
---|
| 711 | if (!mCursor.more()) { |
---|
| 712 | break; |
---|
| 713 | } |
---|
| 714 | } |
---|
| 715 | else if (*mCursor == '}') { |
---|
| 716 | break; |
---|
| 717 | } |
---|
| 718 | ++mCursor; |
---|
[4939] | 719 | } |
---|
[5679] | 720 | std::string nameRegexp = "/(?i)" + std::string(start, mCursor.pos()); |
---|
| 721 | return createName("na", nameRegexp); |
---|
[4939] | 722 | } |
---|
| 723 | |
---|
[5182] | 724 | bool RE_Parser::isUnsupportChartsetOperator(char c) { |
---|
[5180] | 725 | return false; |
---|
| 726 | } |
---|
| 727 | |
---|
[4309] | 728 | CharsetOperatorKind RE_Parser::getCharsetOperator() { |
---|
[5180] | 729 | if (isUnsupportChartsetOperator(*mCursor)) { |
---|
| 730 | return emptyOperator; |
---|
| 731 | } |
---|
[4829] | 732 | switch (*mCursor) { |
---|
[4310] | 733 | case '&': |
---|
[4829] | 734 | ++mCursor; |
---|
[4831] | 735 | if (*mCursor == '&') { |
---|
[4829] | 736 | ++mCursor; |
---|
[4310] | 737 | return intersectOp; |
---|
[4831] | 738 | } else if (*mCursor == '[') { |
---|
[4310] | 739 | // Short-hand for intersectOp when a set follows |
---|
| 740 | return intersectOp; |
---|
| 741 | } |
---|
[4831] | 742 | return ampChar; |
---|
[4310] | 743 | case '-': |
---|
[4829] | 744 | ++mCursor; |
---|
[4831] | 745 | if (*mCursor == '-') { |
---|
[4829] | 746 | ++mCursor; |
---|
[4310] | 747 | return setDiffOp; |
---|
[4831] | 748 | } else if (*mCursor == '[') { |
---|
[4310] | 749 | return setDiffOp; |
---|
[4831] | 750 | } else if (*mCursor == ']') { |
---|
[4310] | 751 | return hyphenChar; |
---|
| 752 | } |
---|
[4831] | 753 | return rangeHyphen; |
---|
[4310] | 754 | case '[': |
---|
[4829] | 755 | ++mCursor; |
---|
[4831] | 756 | if (*mCursor == ':') { |
---|
[4829] | 757 | ++mCursor; |
---|
[4310] | 758 | return posixPropertyOpener; |
---|
| 759 | } |
---|
[4831] | 760 | return setOpener; |
---|
[4310] | 761 | case ']': |
---|
[4829] | 762 | ++mCursor; |
---|
[4310] | 763 | return setCloser; |
---|
| 764 | case '\\': |
---|
[4829] | 765 | ++mCursor; |
---|
[4310] | 766 | return backSlash; |
---|
| 767 | default: |
---|
| 768 | return emptyOperator; |
---|
| 769 | } |
---|
[4798] | 770 | } |
---|
| 771 | |
---|
[4309] | 772 | // Precondition: cursor is immediately after opening '[' character |
---|
[4182] | 773 | RE * RE_Parser::parse_charset() { |
---|
[4310] | 774 | // Sets are accumulated using two variables: |
---|
| 775 | // subexprs accumulates set expressions such as \p{Lu}, [\w && \p{Greek}] |
---|
[4798] | 776 | // cc accumulates the literal and calculated codepoints and ranges |
---|
[4310] | 777 | std::vector<RE *> subexprs; |
---|
[4272] | 778 | CC * cc = makeCC(); |
---|
[4310] | 779 | // When the last item deal with is a single literal charcacter or calculated codepoint, |
---|
| 780 | // a following hyphen can indicate a range. When the last item is a set subexpression, |
---|
| 781 | // a following hyphen can indicate set subtraction. |
---|
| 782 | enum {NoItem, CodepointItem, RangeItem, SetItem, BrackettedSetItem} lastItemKind = NoItem; |
---|
[4798] | 783 | |
---|
[5037] | 784 | codepoint_t lastCodepointItem = 0; |
---|
[4310] | 785 | bool havePendingOperation = false; |
---|
[5558] | 786 | bool possibleByteCodeEscape = false; // set to true when \x, \o or \0 hex or octal escapes seen. |
---|
[5037] | 787 | CharsetOperatorKind pendingOperationKind = intersectOp; |
---|
| 788 | RE * pendingOperand = nullptr; |
---|
[4798] | 789 | |
---|
[4309] | 790 | // If the first character after the [ is a ^ (caret) then the matching character class is complemented. |
---|
[4182] | 791 | bool negated = false; |
---|
[4829] | 792 | if (*mCursor == '^') { |
---|
[4516] | 793 | negated = true; |
---|
[4829] | 794 | ++mCursor; |
---|
[4309] | 795 | } |
---|
[4310] | 796 | // Legacy rule: an unescaped ] may appear as a literal set character |
---|
| 797 | // if and only if it appears immediately after the opening [ or [^ |
---|
[4829] | 798 | if ( *mCursor == ']' && LEGACY_UNESCAPED_RBRAK_RBRACE_ALLOWED) { |
---|
[4819] | 799 | insert(cc, ']'); |
---|
[4310] | 800 | lastItemKind = CodepointItem; |
---|
[4311] | 801 | lastCodepointItem = static_cast<codepoint_t> (']'); |
---|
[4829] | 802 | ++mCursor; |
---|
| 803 | } else if ( *mCursor == '-' && LEGACY_UNESCAPED_HYPHEN_ALLOWED) { |
---|
| 804 | ++mCursor; |
---|
[4819] | 805 | insert(cc, '-'); |
---|
[4310] | 806 | lastItemKind = CodepointItem; |
---|
[4311] | 807 | lastCodepointItem = static_cast<codepoint_t> ('-'); |
---|
[4829] | 808 | if (*mCursor == '-') { |
---|
[5161] | 809 | ParseFailure("Set operator has no left operand."); |
---|
[4516] | 810 | } |
---|
[4309] | 811 | } |
---|
[4829] | 812 | while (mCursor.more()) { |
---|
[4622] | 813 | const CharsetOperatorKind op = getCharsetOperator(); |
---|
[4310] | 814 | switch (op) { |
---|
[4516] | 815 | case intersectOp: |
---|
| 816 | case setDiffOp: { |
---|
| 817 | if (lastItemKind == NoItem) { |
---|
[5161] | 818 | ParseFailure("Set operator has no left operand."); |
---|
[4516] | 819 | } |
---|
[4819] | 820 | if (!cc->empty()) { |
---|
| 821 | subexprs.push_back(mMemoizer.memoize(cc)); |
---|
[4516] | 822 | } |
---|
[4310] | 823 | RE * newOperand = makeAlt(subexprs.begin(), subexprs.end()); |
---|
| 824 | subexprs.clear(); |
---|
| 825 | cc = makeCC(); |
---|
| 826 | if (havePendingOperation) { |
---|
| 827 | if (pendingOperationKind == intersectOp) { |
---|
| 828 | pendingOperand = makeIntersect(pendingOperand, newOperand); |
---|
| 829 | } |
---|
| 830 | else { |
---|
| 831 | pendingOperand = makeDiff(pendingOperand, newOperand); |
---|
| 832 | } |
---|
| 833 | } |
---|
| 834 | else { |
---|
| 835 | pendingOperand = newOperand; |
---|
| 836 | } |
---|
| 837 | havePendingOperation = true; |
---|
| 838 | pendingOperationKind = op; |
---|
| 839 | lastItemKind = NoItem; |
---|
| 840 | } |
---|
| 841 | break; |
---|
| 842 | case setCloser: { |
---|
[4516] | 843 | if (lastItemKind == NoItem) { |
---|
[5161] | 844 | ParseFailure("Set operator has no right operand."); |
---|
[4516] | 845 | } |
---|
[4819] | 846 | if (!cc->empty()) { |
---|
[5558] | 847 | if (possibleByteCodeEscape && (cc->max_codepoint() <= 0xFF) && subexprs.empty() && !havePendingOperation) { |
---|
| 848 | subexprs.push_back(makeByte(cc)); |
---|
| 849 | } |
---|
| 850 | else { |
---|
| 851 | subexprs.push_back(mMemoizer.memoize(cc)); |
---|
| 852 | } |
---|
[4320] | 853 | } |
---|
[4310] | 854 | RE * newOperand = makeAlt(subexprs.begin(), subexprs.end()); |
---|
| 855 | if (havePendingOperation) { |
---|
| 856 | if (pendingOperationKind == intersectOp) { |
---|
| 857 | newOperand = makeIntersect(pendingOperand, newOperand); |
---|
| 858 | } |
---|
| 859 | else { |
---|
| 860 | newOperand = makeDiff(pendingOperand, newOperand); |
---|
| 861 | } |
---|
| 862 | } |
---|
[4516] | 863 | return negated ? makeComplement(newOperand) : newOperand; |
---|
[4310] | 864 | } |
---|
[4516] | 865 | case setOpener: |
---|
| 866 | case posixPropertyOpener: { |
---|
[4310] | 867 | if (lastItemKind != NoItem) { |
---|
[4819] | 868 | if (!cc->empty()) { |
---|
| 869 | subexprs.push_back(mMemoizer.memoize(cc)); |
---|
| 870 | } |
---|
[4310] | 871 | RE * newOperand = makeAlt(subexprs.begin(), subexprs.end()); |
---|
| 872 | subexprs.clear(); |
---|
| 873 | cc = makeCC(); |
---|
| 874 | if (havePendingOperation) { |
---|
| 875 | if (pendingOperationKind == intersectOp) { |
---|
| 876 | pendingOperand = makeIntersect(pendingOperand, newOperand); |
---|
[4829] | 877 | } else { |
---|
[4310] | 878 | pendingOperand = makeDiff(pendingOperand, newOperand); |
---|
| 879 | } |
---|
| 880 | } |
---|
| 881 | else { |
---|
| 882 | pendingOperand = newOperand; |
---|
| 883 | } |
---|
| 884 | subexprs.push_back(pendingOperand); |
---|
| 885 | havePendingOperation = false; |
---|
| 886 | } |
---|
| 887 | if (op == setOpener) { |
---|
| 888 | subexprs.push_back(parse_charset()); |
---|
| 889 | lastItemKind = SetItem; |
---|
| 890 | } |
---|
| 891 | else if (op == posixPropertyOpener) { |
---|
| 892 | bool negated = false; |
---|
[4829] | 893 | if (*mCursor == '^') { |
---|
[4310] | 894 | negated = true; |
---|
[4829] | 895 | mCursor++; |
---|
[4310] | 896 | } |
---|
[4671] | 897 | RE * posixSet = parsePropertyExpression(); |
---|
[4829] | 898 | subexprs.push_back(negated ? makeComplement(posixSet) : posixSet); |
---|
[4310] | 899 | lastItemKind = BrackettedSetItem; |
---|
[4829] | 900 | if (*mCursor++ != ':' || *mCursor++ != ']') |
---|
[5161] | 901 | ParseFailure("Posix set expression improperly terminated."); |
---|
[4310] | 902 | } |
---|
| 903 | } |
---|
| 904 | break; |
---|
| 905 | case rangeHyphen: |
---|
[4516] | 906 | if (lastItemKind != CodepointItem) { |
---|
[5161] | 907 | ParseFailure("Range operator - has illegal left operand."); |
---|
[4516] | 908 | } |
---|
[5558] | 909 | if (*mCursor == '\\') { |
---|
| 910 | mCursor++; |
---|
| 911 | if ((*mCursor == 'x') || (*mCursor == 'o') || (*mCursor == '0')) possibleByteCodeEscape = true; |
---|
| 912 | insert_range(cc, lastCodepointItem, parse_escaped_codepoint()); |
---|
[5769] | 913 | //subexprs.push_back(makeRange(makeCC(lastCodepointItem), makeCC(parse_escaped_codepoint()))); |
---|
[5558] | 914 | } else { |
---|
| 915 | insert_range(cc, lastCodepointItem, parse_literal_codepoint()); |
---|
[5769] | 916 | //subexprs.push_back(makeRange(makeCC(lastCodepointItem), makeCC(parse_literal_codepoint()))); |
---|
[5558] | 917 | } |
---|
[4310] | 918 | lastItemKind = RangeItem; |
---|
| 919 | break; |
---|
| 920 | case hyphenChar: |
---|
[4819] | 921 | insert(cc, '-'); |
---|
[4310] | 922 | lastItemKind = CodepointItem; |
---|
[4311] | 923 | lastCodepointItem = static_cast<codepoint_t> ('-'); |
---|
[4310] | 924 | break; |
---|
| 925 | case ampChar: |
---|
[4819] | 926 | insert(cc, '&'); |
---|
[4310] | 927 | lastItemKind = CodepointItem; |
---|
[4311] | 928 | lastCodepointItem = static_cast<codepoint_t> ('&'); |
---|
[4310] | 929 | break; |
---|
| 930 | case backSlash: |
---|
[4829] | 931 | if (isSetEscapeChar(*mCursor)) { |
---|
[4671] | 932 | subexprs.push_back(parseEscapedSet()); |
---|
[4310] | 933 | lastItemKind = SetItem; |
---|
| 934 | } |
---|
| 935 | else { |
---|
[5558] | 936 | if ((*mCursor == 'x') || (*mCursor == 'o') || (*mCursor == '0')) possibleByteCodeEscape = true; |
---|
[4310] | 937 | lastCodepointItem = parse_escaped_codepoint(); |
---|
[4819] | 938 | insert(cc, lastCodepointItem); |
---|
[4310] | 939 | lastItemKind = CodepointItem; |
---|
| 940 | } |
---|
| 941 | break; |
---|
| 942 | case emptyOperator: |
---|
[5554] | 943 | lastCodepointItem = parse_literal_codepoint(); |
---|
[4819] | 944 | insert(cc, lastCodepointItem); |
---|
[4310] | 945 | lastItemKind = CodepointItem; |
---|
| 946 | break; |
---|
| 947 | } |
---|
| 948 | } |
---|
[5161] | 949 | ParseFailure("Set expression not properly terminated."); |
---|
[3850] | 950 | } |
---|
| 951 | |
---|
[4309] | 952 | |
---|
[4305] | 953 | // A backslash escape was found, and various special cases (back reference, |
---|
[4402] | 954 | // quoting with \Q, \E, sets (\p, \P, \d, \D, \w, \W, \s, \S, \b, \B), grapheme |
---|
[4305] | 955 | // cluster \X have been ruled out. |
---|
| 956 | // It may be one of several possibilities or an error sequence. |
---|
[4402] | 957 | // 1. Special control codes (\a, \e, \f, \n, \r, \t, \v) |
---|
[4305] | 958 | // 2. General control codes c[@-_a-z?] |
---|
| 959 | // 3. Restricted octal notation 0 - 0777 |
---|
| 960 | // 4. General octal notation o\{[0-7]+\} |
---|
| 961 | // 5. General hex notation x\{[0-9A-Fa-f]+\} |
---|
[4798] | 962 | // 6. An error for any unrecognized alphabetic escape |
---|
[4305] | 963 | // 7. An escaped ASCII symbol, standing for itself |
---|
| 964 | |
---|
[4311] | 965 | codepoint_t RE_Parser::parse_escaped_codepoint() { |
---|
| 966 | codepoint_t cp_value; |
---|
[4829] | 967 | switch (*mCursor) { |
---|
| 968 | case 'a': ++mCursor; return 0x07; // BEL |
---|
| 969 | case 'e': ++mCursor; return 0x1B; // ESC |
---|
| 970 | case 'f': ++mCursor; return 0x0C; // FF |
---|
| 971 | case 'n': ++mCursor; return 0x0A; // LF |
---|
| 972 | case 'r': ++mCursor; return 0x0D; // CR |
---|
| 973 | case 't': ++mCursor; return 0x09; // HT |
---|
| 974 | case 'v': ++mCursor; return 0x0B; // VT |
---|
[4310] | 975 | case 'c': // Control-escape based on next char |
---|
[4829] | 976 | ++mCursor; |
---|
[4310] | 977 | // \c@, \cA, ... \c_, or \ca, ..., \cz |
---|
[4829] | 978 | if (((*mCursor >= '@') && (*mCursor <= '_')) || ((*mCursor >= 'a') && (*mCursor <= 'z'))) { |
---|
| 979 | cp_value = static_cast<codepoint_t>(*mCursor & 0x1F); |
---|
| 980 | mCursor++; |
---|
[4310] | 981 | return cp_value; |
---|
| 982 | } |
---|
[4829] | 983 | else if (*mCursor++ == '?') return 0x7F; // \c? ==> DEL |
---|
[5161] | 984 | else ParseFailure("Illegal \\c escape sequence"); |
---|
[4310] | 985 | case '0': // Octal escape: 0 - 0377 |
---|
[4829] | 986 | ++mCursor; |
---|
[4310] | 987 | return parse_octal_codepoint(0,3); |
---|
[4798] | 988 | case 'o': |
---|
[4829] | 989 | ++mCursor; |
---|
| 990 | if (*mCursor == '{') { |
---|
| 991 | ++mCursor; |
---|
[4310] | 992 | cp_value = parse_octal_codepoint(1, 7); |
---|
[5161] | 993 | if (*mCursor++ != '}') ParseFailure("Malformed octal escape sequence"); |
---|
[4310] | 994 | return cp_value; |
---|
| 995 | } |
---|
| 996 | else { |
---|
[5161] | 997 | ParseFailure("Malformed octal escape sequence"); |
---|
[4310] | 998 | } |
---|
[4798] | 999 | case 'x': |
---|
[4829] | 1000 | ++mCursor; |
---|
| 1001 | if (*mCursor == '{') { |
---|
| 1002 | ++mCursor; |
---|
[4310] | 1003 | cp_value = parse_hex_codepoint(1, 6); |
---|
[5161] | 1004 | if (*mCursor++ != '}') ParseFailure("Malformed hex escape sequence"); |
---|
[4310] | 1005 | return cp_value; |
---|
| 1006 | } |
---|
| 1007 | else { |
---|
| 1008 | return parse_hex_codepoint(1,2); // ICU compatibility |
---|
| 1009 | } |
---|
| 1010 | case 'u': |
---|
[4829] | 1011 | ++mCursor; |
---|
| 1012 | if (*mCursor == '{') { |
---|
| 1013 | ++mCursor; |
---|
[4310] | 1014 | cp_value = parse_hex_codepoint(1, 6); |
---|
[5161] | 1015 | if (*mCursor++ != '}') ParseFailure("Malformed hex escape sequence"); |
---|
[4310] | 1016 | return cp_value; |
---|
| 1017 | } |
---|
| 1018 | else { |
---|
| 1019 | return parse_hex_codepoint(4,4); // ICU compatibility |
---|
| 1020 | } |
---|
[4798] | 1021 | case 'U': |
---|
[4829] | 1022 | ++mCursor; |
---|
[4310] | 1023 | return parse_hex_codepoint(8,8); // ICU compatibility |
---|
| 1024 | default: |
---|
| 1025 | // Escaped letters should be reserved for special functions. |
---|
[5180] | 1026 | if (((*mCursor >= 'A') && (*mCursor <= 'Z')) || ((*mCursor >= 'a') && (*mCursor <= 'z'))){ |
---|
[5554] | 1027 | //Escape unknown letter will be parse as normal letter |
---|
| 1028 | return parse_literal_codepoint(); |
---|
[5180] | 1029 | //ParseFailure("Undefined or unsupported escape sequence"); |
---|
| 1030 | } |
---|
[4829] | 1031 | else if ((*mCursor < 0x20) || (*mCursor >= 0x7F)) |
---|
[5161] | 1032 | ParseFailure("Illegal escape sequence"); |
---|
[4829] | 1033 | else return static_cast<codepoint_t>(*mCursor++); |
---|
[4310] | 1034 | } |
---|
[3850] | 1035 | } |
---|
| 1036 | |
---|
[4311] | 1037 | codepoint_t RE_Parser::parse_octal_codepoint(int mindigits, int maxdigits) { |
---|
| 1038 | codepoint_t value = 0; |
---|
[4310] | 1039 | int count = 0; |
---|
[4829] | 1040 | while (mCursor.more() && count < maxdigits) { |
---|
| 1041 | const char t = *mCursor; |
---|
[4310] | 1042 | if (t < '0' || t > '7') { |
---|
| 1043 | break; |
---|
| 1044 | } |
---|
| 1045 | value = value * 8 | (t - '0'); |
---|
[4829] | 1046 | ++mCursor; |
---|
[4310] | 1047 | ++count; |
---|
| 1048 | } |
---|
[5161] | 1049 | if (count < mindigits) ParseFailure("Octal sequence has too few digits"); |
---|
| 1050 | if (value > UCD::UNICODE_MAX) ParseFailure("Octal value too large"); |
---|
[4310] | 1051 | return value; |
---|
[4305] | 1052 | } |
---|
| 1053 | |
---|
[4311] | 1054 | codepoint_t RE_Parser::parse_hex_codepoint(int mindigits, int maxdigits) { |
---|
| 1055 | codepoint_t value = 0; |
---|
[4310] | 1056 | int count = 0; |
---|
[4829] | 1057 | while (mCursor.more() && isxdigit(*mCursor) && count < maxdigits) { |
---|
| 1058 | const char t = *mCursor; |
---|
[4310] | 1059 | if (isdigit(t)) { |
---|
| 1060 | value = (value * 16) | (t - '0'); |
---|
| 1061 | } |
---|
[4798] | 1062 | else { |
---|
[5037] | 1063 | value = ((value * 16) | ((t | 32) - 'a')) + 10; |
---|
[4310] | 1064 | } |
---|
[4829] | 1065 | ++mCursor; |
---|
[4310] | 1066 | ++count; |
---|
| 1067 | } |
---|
[5161] | 1068 | if (count < mindigits) ParseFailure("Hexadecimal sequence has too few digits"); |
---|
| 1069 | if (value > UCD::UNICODE_MAX) ParseFailure("Hexadecimal value too large"); |
---|
[4310] | 1070 | return value; |
---|
[4305] | 1071 | } |
---|
| 1072 | |
---|
[5181] | 1073 | Name * RE_Parser::createCC(const codepoint_t cp) { |
---|
[5769] | 1074 | CC * cc = makeCC(cp); |
---|
[4819] | 1075 | return mMemoizer.memoize(cc); |
---|
[4194] | 1076 | } |
---|
[4316] | 1077 | |
---|
[5182] | 1078 | void RE_Parser::insert(CC * cc, const codepoint_t cp) { |
---|
[5769] | 1079 | cc->insert(cp); |
---|
[4316] | 1080 | } |
---|
| 1081 | |
---|
[5182] | 1082 | void RE_Parser::insert_range(CC * cc, const codepoint_t lo, const codepoint_t hi) { |
---|
[5769] | 1083 | cc->insert_range(lo, hi); |
---|
[4316] | 1084 | } |
---|
[4798] | 1085 | |
---|
[4671] | 1086 | RE * RE_Parser::makeComplement(RE * s) { |
---|
| 1087 | return makeDiff(makeAny(), s); |
---|
[4316] | 1088 | } |
---|
[4671] | 1089 | |
---|
[4830] | 1090 | |
---|
[5206] | 1091 | |
---|
[4830] | 1092 | |
---|
[4829] | 1093 | RE * RE_Parser::makeWordBoundary() { |
---|
| 1094 | Name * wordC = makeWordSet(); |
---|
[5206] | 1095 | return makeReBoundary(wordC); |
---|
[4671] | 1096 | } |
---|
| 1097 | |
---|
[4829] | 1098 | RE * RE_Parser::makeWordNonBoundary() { |
---|
| 1099 | Name * wordC = makeWordSet(); |
---|
[5206] | 1100 | return makeReNonBoundary(wordC); |
---|
[4671] | 1101 | } |
---|
| 1102 | |
---|
[5206] | 1103 | inline RE * RE_Parser::makeReBoundary(RE * re) { |
---|
[5308] | 1104 | return makeBoundaryAssertion(re); |
---|
[5206] | 1105 | } |
---|
| 1106 | inline RE * RE_Parser::makeReNonBoundary(RE * re) { |
---|
[5308] | 1107 | return makeNegativeBoundaryAssertion(re); |
---|
[5206] | 1108 | } |
---|
| 1109 | |
---|
[5132] | 1110 | RE * RE_Parser::makeWordBegin() { |
---|
| 1111 | Name * wordC = makeWordSet(); |
---|
| 1112 | return makeNegativeLookBehindAssertion(wordC); |
---|
| 1113 | } |
---|
| 1114 | |
---|
| 1115 | RE * RE_Parser::makeWordEnd() { |
---|
| 1116 | Name * wordC = makeWordSet(); |
---|
| 1117 | return makeNegativeLookAheadAssertion(wordC); |
---|
| 1118 | } |
---|
| 1119 | |
---|
[5182] | 1120 | Name * RE_Parser::makeDigitSet() { |
---|
[4829] | 1121 | return mMemoizer.memoize(createName("nd")); |
---|
[4671] | 1122 | } |
---|
| 1123 | |
---|
[5182] | 1124 | Name * RE_Parser::makeAlphaNumeric() { |
---|
[4829] | 1125 | return mMemoizer.memoize(createName("alnum")); |
---|
[4671] | 1126 | } |
---|
| 1127 | |
---|
[5182] | 1128 | Name * RE_Parser::makeWhitespaceSet() { |
---|
[4829] | 1129 | return mMemoizer.memoize(createName("whitespace")); |
---|
[4671] | 1130 | } |
---|
| 1131 | |
---|
[5182] | 1132 | Name * RE_Parser::makeWordSet() { |
---|
[4829] | 1133 | return mMemoizer.memoize(createName("word")); |
---|
[4671] | 1134 | } |
---|
| 1135 | |
---|
[5241] | 1136 | Name * RE_Parser::createName(std::string value) { |
---|
[4819] | 1137 | auto key = std::make_pair("", value); |
---|
| 1138 | auto f = mNameMap.find(key); |
---|
| 1139 | if (f != mNameMap.end()) { |
---|
| 1140 | return f->second; |
---|
| 1141 | } |
---|
[4829] | 1142 | Name * const property = mMemoizer.memoize(makeName(value, Name::Type::UnicodeProperty)); |
---|
[4819] | 1143 | mNameMap.insert(std::make_pair(std::move(key), property)); |
---|
| 1144 | return property; |
---|
[5080] | 1145 | } |
---|
[4819] | 1146 | |
---|
[5241] | 1147 | Name * RE_Parser::createName(std::string prop, std::string value) { |
---|
[4819] | 1148 | auto key = std::make_pair(prop, value); |
---|
| 1149 | auto f = mNameMap.find(key); |
---|
| 1150 | if (f != mNameMap.end()) { |
---|
| 1151 | return f->second; |
---|
| 1152 | } |
---|
[4829] | 1153 | Name * const property = mMemoizer.memoize(makeName(prop, value, Name::Type::UnicodeProperty)); |
---|
[4819] | 1154 | mNameMap.insert(std::make_pair(std::move(key), property)); |
---|
| 1155 | return property; |
---|
| 1156 | } |
---|
| 1157 | |
---|
[5267] | 1158 | LLVM_ATTRIBUTE_NORETURN void RE_Parser::ParseFailure(std::string errmsg) { |
---|
| 1159 | llvm::report_fatal_error(errmsg); |
---|
[4819] | 1160 | } |
---|
[5267] | 1161 | |
---|
| 1162 | } |
---|