Changeset 5880
- Timestamp:
- Feb 24, 2018, 9:33:57 AM (12 months ago)
- Location:
- icGREP/icgrep-devel/icgrep
- Files:
-
- 8 edited
Legend:
- Unmodified
- Added
- Removed
-
icGREP/icgrep-devel/icgrep/UCD/resolve_properties.cpp
r5872 r5880 1 1 /* 2 * Copyright (c) 201 5International Characters.2 * Copyright (c) 2018 International Characters. 3 3 * This software is licensed to the public under the Open Software License 3.0. 4 4 * icgrep is a trademark of International Characters. … … 17 17 #include <re/re_parser.h> 18 18 #include <re/re_name_resolve.h> 19 #include <re/grapheme_clusters.h> 19 20 #include <re/re_compiler.h> 20 21 #include "UCD/PropertyAliases.h" … … 33 34 llvm::report_fatal_error(errmsg); 34 35 } 35 36 #define Behind(x) makeLookBehindAssertion(x)37 #define Ahead(x) makeLookAheadAssertion(x)38 36 39 37 40 38 RE * UnicodeBreakRE() { 41 39 return makeAlt({makeCC(0x0A, 0x0C), makeSeq({makeCC(0x0D), makeCC(0x0A)}), makeSeq({makeCC(0x0D), makeNegativeLookAheadAssertion(makeCC(0x0A))})}); 42 }43 44 void generateGraphemeClusterBoundaryRule(Name * const &property) {45 // 3.1.1 Grapheme Cluster Boundary Rules46 47 // RE * GCB_Control = makeName("gcb", "cn", Name::Type::UnicodeProperty);48 RE * GCB_CR = makeName("gcb", "cr", Name::Type::UnicodeProperty);49 RE * GCB_LF = makeName("gcb", "lf", Name::Type::UnicodeProperty);50 RE * GCB_Control_CR_LF = makeAlt({GCB_CR, GCB_LF});51 52 // Break at the start and end of text.53 RE * GCB_1 = makeStart();54 RE * GCB_2 = makeEnd();55 // Do not break between a CR and LF.56 RE * GCB_3 = makeSeq({Behind(GCB_CR), Ahead(GCB_LF)});57 // Otherwise, break before and after controls.58 RE * GCB_4 = Behind(GCB_Control_CR_LF);59 RE * GCB_5 = Ahead(GCB_Control_CR_LF);60 RE * GCB_1_5 = makeAlt({GCB_1, GCB_2, makeDiff(makeAlt({GCB_4, GCB_5}), GCB_3)});61 62 RE * GCB_L = makeName("gcb", "l", Name::Type::UnicodeProperty);63 RE * GCB_V = makeName("gcb", "v", Name::Type::UnicodeProperty);64 RE * GCB_LV = makeName("gcb", "lv", Name::Type::UnicodeProperty);65 RE * GCB_LVT = makeName("gcb", "lvt", Name::Type::UnicodeProperty);66 RE * GCB_T = makeName("gcb", "t", Name::Type::UnicodeProperty);67 RE * GCB_RI = makeName("gcb", "ri", Name::Type::UnicodeProperty);68 // Do not break Hangul syllable sequences.69 RE * GCB_6 = makeSeq({Behind(GCB_L), Ahead(makeAlt({GCB_L, GCB_V, GCB_LV, GCB_LVT}))});70 RE * GCB_7 = makeSeq({Behind(makeAlt({GCB_LV, GCB_V})), Ahead(makeAlt({GCB_V, GCB_T}))});71 RE * GCB_8 = makeSeq({Behind(makeAlt({GCB_LVT, GCB_T})), Ahead(GCB_T)});72 // Do not break between regional indicator symbols.73 RE * GCB_8a = makeSeq({Behind(GCB_RI), Ahead(GCB_RI)});74 // Do not break before extending characters.75 RE * GCB_9 = Ahead(makeName("gcb", "ex", Name::Type::UnicodeProperty));76 // Do not break before SpacingMarks, or after Prepend characters.77 RE * GCB_9a = Ahead(makeName("gcb", "sm", Name::Type::UnicodeProperty));78 RE * GCB_9b = Behind(makeName("gcb", "pp", Name::Type::UnicodeProperty));79 RE * GCB_6_9b = makeAlt({GCB_6, GCB_7, GCB_8, GCB_8a, GCB_9, GCB_9a, GCB_9b});80 // Otherwise, break everywhere.81 RE * GCB_10 = makeSeq({Behind(makeAny()), Ahead(makeAny())});82 83 //Name * gcb = makeName("gcb", Name::Type::UnicodeProperty);84 property->setDefinition(makeAlt({GCB_1_5, makeDiff(GCB_10, GCB_6_9b)}));85 40 } 86 41 … … 115 70 property->setDefinition(makeDiff(makeAny(), unassigned)); 116 71 return true; 117 } else if (value == "\\b{g}" || value == "\\B{g}") {72 } else if (value == "\\b{g}") { 118 73 generateGraphemeClusterBoundaryRule(property); 119 74 return true; -
icGREP/icgrep-devel/icgrep/UCD/resolve_properties.h
r5872 r5880 16 16 17 17 re::RE * UnicodeBreakRE(); 18 void generateGraphemeClusterBoundaryRule(re::Name * const &property);19 18 bool resolvePropertyDefinition(re::Name * const property); 20 19 std::string resolvePropertyFunction(re::Name * const property); -
icGREP/icgrep-devel/icgrep/re/grapheme_clusters.cpp
r5796 r5880 15 15 #include <re/re_range.h> 16 16 #include <re/printer_re.h> 17 #include <re/re_name_resolve.h> 17 18 #include <vector> // for vector, allocator 18 19 #include <llvm/Support/Casting.h> // for dyn_cast, isa … … 39 40 if (n->getType() == Name::Type::ZeroWidth) { 40 41 const std::string nameString = n->getName(); 41 return (nameString == "\\b{g}") || (nameString == "\\B{g}");42 return nameString == "\\b{g}"; 42 43 } 43 44 return false; … … 71 72 RE * resolveGraphemeMode(RE * re, bool inGraphemeMode) { 72 73 if (isa<Name>(re)) { 73 if (inGraphemeMode && (cast<Name>(re)->getName() == ".")) 74 return makeSeq({makeAny(), makeRep(makeSeq({makeZeroWidth("\\B{g}"), makeAny()}), 0, Rep::UNBOUNDED_REP), makeZeroWidth("\\b{g}")}); 74 if (inGraphemeMode && (cast<Name>(re)->getName() == ".")) { 75 RE * GCB = makeZeroWidth("\\b{g}"); 76 RE * nonGCB = makeDiff(makeSeq({}), GCB); 77 return makeSeq({makeAny(), makeRep(makeSeq({nonGCB, makeAny()}), 0, Rep::UNBOUNDED_REP), GCB}); 78 } 75 79 else return re; 76 80 } … … 122 126 } 123 127 128 129 #define Behind(x) makeLookBehindAssertion(x) 130 #define Ahead(x) makeLookAheadAssertion(x) 131 132 void generateGraphemeClusterBoundaryRule(Name * const &property) { 133 // 3.1.1 Grapheme Cluster Boundary Rules 134 135 // RE * GCB_Control = makeName("gcb", "cn", Name::Type::UnicodeProperty); 136 RE * GCB_CR = makeName("gcb", "cr", Name::Type::UnicodeProperty); 137 RE * GCB_LF = makeName("gcb", "lf", Name::Type::UnicodeProperty); 138 RE * GCB_Control_CR_LF = makeAlt({GCB_CR, GCB_LF}); 139 140 // Break at the start and end of text. 141 RE * GCB_1 = makeStart(); 142 RE * GCB_2 = makeEnd(); 143 // Do not break between a CR and LF. 144 RE * GCB_3 = makeSeq({Behind(GCB_CR), Ahead(GCB_LF)}); 145 // Otherwise, break before and after controls. 146 RE * GCB_4 = Behind(GCB_Control_CR_LF); 147 RE * GCB_5 = Ahead(GCB_Control_CR_LF); 148 RE * GCB_1_5 = makeAlt({GCB_1, GCB_2, makeDiff(makeAlt({GCB_4, GCB_5}), GCB_3)}); 149 150 RE * GCB_L = makeName("gcb", "l", Name::Type::UnicodeProperty); 151 RE * GCB_V = makeName("gcb", "v", Name::Type::UnicodeProperty); 152 RE * GCB_LV = makeName("gcb", "lv", Name::Type::UnicodeProperty); 153 RE * GCB_LVT = makeName("gcb", "lvt", Name::Type::UnicodeProperty); 154 RE * GCB_T = makeName("gcb", "t", Name::Type::UnicodeProperty); 155 RE * GCB_RI = makeName("gcb", "ri", Name::Type::UnicodeProperty); 156 // Do not break Hangul syllable sequences. 157 RE * GCB_6 = makeSeq({Behind(GCB_L), Ahead(makeAlt({GCB_L, GCB_V, GCB_LV, GCB_LVT}))}); 158 RE * GCB_7 = makeSeq({Behind(makeAlt({GCB_LV, GCB_V})), Ahead(makeAlt({GCB_V, GCB_T}))}); 159 RE * GCB_8 = makeSeq({Behind(makeAlt({GCB_LVT, GCB_T})), Ahead(GCB_T)}); 160 // Do not break between regional indicator symbols. 161 RE * GCB_8a = makeSeq({Behind(GCB_RI), Ahead(GCB_RI)}); 162 // Do not break before extending characters. 163 RE * GCB_9 = Ahead(makeName("gcb", "ex", Name::Type::UnicodeProperty)); 164 // Do not break before SpacingMarks, or after Prepend characters. 165 RE * GCB_9a = Ahead(makeName("gcb", "sm", Name::Type::UnicodeProperty)); 166 RE * GCB_9b = Behind(makeName("gcb", "pp", Name::Type::UnicodeProperty)); 167 RE * GCB_6_9b = makeAlt({GCB_6, GCB_7, GCB_8, GCB_8a, GCB_9, GCB_9a, GCB_9b}); 168 // Otherwise, break everywhere. 169 RE * GCB_10 = makeSeq({Behind(makeAny()), Ahead(makeAny())}); 170 171 //Name * gcb = makeName("gcb", Name::Type::UnicodeProperty); 172 RE * gcb = makeAlt({GCB_1_5, makeDiff(GCB_10, GCB_6_9b)}); 173 gcb = resolveUnicodeProperties(gcb); 174 property->setDefinition(gcb); 124 175 } 176 177 } -
icGREP/icgrep-devel/icgrep/re/grapheme_clusters.h
r5772 r5880 5 5 6 6 class RE; 7 7 class Name; 8 8 9 bool hasGraphemeClusterBoundary(const RE * re); 9 10 10 11 RE * resolveGraphemeMode(RE * re, bool inGraphemeMode); 12 13 void generateGraphemeClusterBoundaryRule(Name * const &property); 11 14 12 15 } -
icGREP/icgrep-devel/icgrep/re/re_compiler.cpp
r5872 r5880 160 160 AlignMarkers(marker, zero, pb); 161 161 PabloAST * ze = markerVar(zero); 162 if (nameString == "\\B{g}") {163 ze = pb.createNot(ze);164 }165 162 return makeMarker(markerPos(marker), pb.createAnd(markerVar(marker), ze, "zerowidth")); 166 163 } else { -
icGREP/icgrep-devel/icgrep/re/re_nullable.cpp
r5869 r5880 166 166 return (re_rep->getLB() == 0) || isNullable(re_rep->getRE()); 167 167 } else if (const Diff * diff = dyn_cast<const Diff>(re)) { 168 return isNullable(diff->getLH()) && !isNullable(diff->getRH()); 168 // a Diff of Seq({}) and an Assertion represents a complemented assertion. 169 return false; 169 170 } else if (const Intersect * e = dyn_cast<const Intersect>(re)) { 170 171 return isNullable(e->getLH()) && isNullable(e->getRH()); -
icGREP/icgrep-devel/icgrep/re/re_parser.cpp
r5835 r5880 1 1 /* 2 * Copyright (c) 201 7International Characters.2 * Copyright (c) 2018 International Characters. 3 3 * This software is licensed to the public under the Open Software License 3.0. 4 4 * icgrep is a trademark of International Characters. … … 350 350 351 351 RE * RE_Parser::parseEscapedSet() { 352 bool complemented = false; 352 bool complemented = atany("BDSWQP"); 353 char escapeCh = get1(); 354 if (complemented) escapeCh = tolower(escapeCh); 353 355 RE * re = nullptr; 354 switch (*mCursor) { 355 case 'B': complemented = true; 356 switch (escapeCh) { 356 357 case 'b': 357 if (*++mCursor != '{') { 358 if (accept('{')) { 359 if (accept("g}")) { 360 re = makeZeroWidth("\\b{g}"); 361 return complemented ? makeZerowidthComplement(re) : re; 362 } else if (accept("w}")) { 363 ParseFailure("\\b{w} not yet supported."); 364 //return complemented ? makeZerowidthComplement(re) : re; 365 } else if (accept("l}")) { 366 ParseFailure("\\b{l} not yet supported."); 367 //return complemented ? makeZerowidthComplement(re) : re; 368 } else if (accept("s}")) { 369 ParseFailure("\\b{s} not yet supported."); 370 //return complemented ? makeZerowidthComplement(re) : re; 371 } else { 372 re = parsePropertyExpression(); 373 require('}'); 374 return complemented ? makeReNonBoundary(re) : makeReBoundary(re); 375 } 376 } else { 358 377 return complemented ? makeWordNonBoundary() : makeWordBoundary(); 359 } else {360 ++mCursor;361 if (isCharAhead('}')) {362 switch (*mCursor) {363 case 'g':364 re = complemented ? makeZeroWidth("\\B{g}") : makeZeroWidth("\\b{g}");365 ++mCursor;366 ++mCursor;367 break;368 case 'w': ParseFailure("\\b{w} not yet supported.");369 case 'l': ParseFailure("\\b{l} not yet supported.");370 case 's': ParseFailure("\\b{s} not yet supported.");371 // default: ParseFailure("Unrecognized boundary assertion");372 }373 }374 if (!re) {375 auto propExpr = parsePropertyExpression();376 if (*mCursor++ != '}') {377 ParseFailure("Malformed boundary assertion");378 }379 re = complemented ? makeReNonBoundary(propExpr) : makeReBoundary(propExpr);380 }381 return re;382 378 } 383 379 case 'd': 384 ++mCursor; 385 return makeDigitSet(); 386 case 'D': 387 ++mCursor; 388 return makeComplement(makeDigitSet()); 380 re = makeDigitSet(); 381 return complemented ? makeComplement(re) : re; 389 382 case 's': 390 ++mCursor; 391 return makeWhitespaceSet(); 392 case 'S': 393 ++mCursor; 394 return makeComplement(makeWhitespaceSet()); 383 re = makeWhitespaceSet(); 384 return complemented ? makeComplement(re) : re; 395 385 case 'w': 396 ++mCursor; 397 return makeWordSet(); 398 case 'W': 399 ++mCursor; 400 return makeComplement(makeWordSet()); 401 case 'Q': 402 complemented = true; 386 re = makeWordSet(); 387 return complemented ? makeComplement(re) : re; 403 388 case 'q': 404 if (*++mCursor != '{') { 405 ParseFailure("Malformed grapheme cluster expression"); 406 } 407 ++mCursor; 389 require('{'); 408 390 ParseFailure("Literal grapheme cluster expressions not yet supported."); 409 if (*mCursor != '}') { 410 ParseFailure("Malformed grapheme cluster expression"); 411 } 412 ++mCursor; 391 require('}'); 413 392 return complemented ? makeComplement(re) : re; 414 case 'P':415 complemented = true;416 393 case 'p': 417 if (*++mCursor != '{') { 418 ParseFailure("Malformed property expression"); 419 } 420 ++mCursor; 394 require('{'); 421 395 re = parsePropertyExpression(); 422 if (*mCursor != '}') { 423 ParseFailure("Malformed property expression"); 424 } 425 ++mCursor; 396 require('}'); 426 397 return complemented ? makeComplement(re) : re; 427 case 'X': 398 case 'X': { 428 399 // \X is equivalent to ".+?\b{g}"; proceed the minimal number of characters (but at least one) 429 400 // to get to the next extended grapheme cluster boundary. 430 ++mCursor; 431 return makeSeq({makeAny(), makeRep(makeSeq({makeZeroWidth("\\B{g}"), makeAny()}), 0, Rep::UNBOUNDED_REP), makeZeroWidth("\\b{g}")}); 401 RE * GCB = makeZeroWidth("\\b{g}"); 402 return makeSeq({makeAny(), makeRep(makeSeq({makeZerowidthComplement(GCB), makeAny()}), 0, Rep::UNBOUNDED_REP), GCB}); 403 } 432 404 case 'N': 433 ++mCursor;434 405 re = parseNamePatternExpression(); 435 406 assert (re); 436 407 return re; 437 408 case '<': 438 ++mCursor;439 409 return makeWordBegin(); 440 410 case '>': 441 ++mCursor;442 411 return makeWordEnd(); 443 412 default: … … 507 476 } 508 477 return s.str(); 509 }510 511 bool RE_Parser::isCharAhead(char c) {512 if (mCursor.remaining() < 2) {513 return false;514 }515 auto nextCursor = mCursor.pos() + 1;516 return *nextCursor == c;517 478 } 518 479 … … 811 772 } 812 773 774 RE * RE_Parser::makeZerowidthComplement(RE * s) { 775 return makeDiff(makeSeq({}), s); 776 } 777 813 778 RE * RE_Parser::makeWordBoundary() { 814 779 Name * wordC = makeWordSet(); -
icGREP/icgrep-devel/icgrep/re/re_parser.h
r5835 r5880 1 1 /* 2 * Copyright (c) 2014- 6International Characters.2 * Copyright (c) 2014-8 International Characters. 3 3 * This software is licensed to the public under the Open Software License 3.0. 4 4 * icgrep is a trademark of International Characters. … … 204 204 205 205 RE * makeComplement(RE * s); 206 RE * makeZerowidthComplement(RE * s); 207 206 208 RE * makeWordBoundary(); 207 209 RE * makeWordNonBoundary(); … … 242 244 static std::string canonicalize(const cursor_t begin, const cursor_t end); 243 245 244 bool isCharAhead(char c);245 246 246 LLVM_ATTRIBUTE_NORETURN void InvalidUTF8Encoding(); 247 247
Note: See TracChangeset
for help on using the changeset viewer.