Changeset 5880


Ignore:
Timestamp:
Feb 24, 2018, 9:33:57 AM (12 months ago)
Author:
cameron
Message:

Grapheme cluster support: represent \B{g} using Seq{} - \b{g}; parser cleanups

Location:
icGREP/icgrep-devel/icgrep
Files:
8 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/UCD/resolve_properties.cpp

    r5872 r5880  
    11/*
    2  *  Copyright (c) 2015 International Characters.
     2 *  Copyright (c) 2018 International Characters.
    33 *  This software is licensed to the public under the Open Software License 3.0.
    44 *  icgrep is a trademark of International Characters.
     
    1717#include <re/re_parser.h>
    1818#include <re/re_name_resolve.h>
     19#include <re/grapheme_clusters.h>
    1920#include <re/re_compiler.h>
    2021#include "UCD/PropertyAliases.h"
     
    3334    llvm::report_fatal_error(errmsg);
    3435}
    35 
    36 #define Behind(x) makeLookBehindAssertion(x)
    37 #define Ahead(x) makeLookAheadAssertion(x)
    3836   
    3937   
    4038RE * UnicodeBreakRE() {
    4139    return makeAlt({makeCC(0x0A, 0x0C), makeSeq({makeCC(0x0D), makeCC(0x0A)}), makeSeq({makeCC(0x0D), makeNegativeLookAheadAssertion(makeCC(0x0A))})});
    42 }
    43 
    44 void generateGraphemeClusterBoundaryRule(Name * const &property) {
    45     // 3.1.1 Grapheme Cluster Boundary Rules
    46 
    47 //    RE * GCB_Control = makeName("gcb", "cn", Name::Type::UnicodeProperty);
    48     RE * GCB_CR = makeName("gcb", "cr", Name::Type::UnicodeProperty);
    49     RE * GCB_LF = makeName("gcb", "lf", Name::Type::UnicodeProperty);
    50     RE * GCB_Control_CR_LF = makeAlt({GCB_CR, GCB_LF});
    51 
    52     // Break at the start and end of text.
    53     RE * GCB_1 = makeStart();
    54     RE * GCB_2 = makeEnd();
    55     // Do not break between a CR and LF.
    56     RE * GCB_3 = makeSeq({Behind(GCB_CR), Ahead(GCB_LF)});
    57     // Otherwise, break before and after controls.
    58     RE * GCB_4 = Behind(GCB_Control_CR_LF);
    59     RE * GCB_5 = Ahead(GCB_Control_CR_LF);
    60     RE * GCB_1_5 = makeAlt({GCB_1, GCB_2, makeDiff(makeAlt({GCB_4, GCB_5}), GCB_3)});
    61 
    62     RE * GCB_L = makeName("gcb", "l", Name::Type::UnicodeProperty);
    63     RE * GCB_V = makeName("gcb", "v", Name::Type::UnicodeProperty);
    64     RE * GCB_LV = makeName("gcb", "lv", Name::Type::UnicodeProperty);
    65     RE * GCB_LVT = makeName("gcb", "lvt", Name::Type::UnicodeProperty);
    66     RE * GCB_T = makeName("gcb", "t", Name::Type::UnicodeProperty);
    67     RE * GCB_RI = makeName("gcb", "ri", Name::Type::UnicodeProperty);
    68     // Do not break Hangul syllable sequences.
    69     RE * GCB_6 = makeSeq({Behind(GCB_L), Ahead(makeAlt({GCB_L, GCB_V, GCB_LV, GCB_LVT}))});
    70     RE * GCB_7 = makeSeq({Behind(makeAlt({GCB_LV, GCB_V})), Ahead(makeAlt({GCB_V, GCB_T}))});
    71     RE * GCB_8 = makeSeq({Behind(makeAlt({GCB_LVT, GCB_T})), Ahead(GCB_T)});
    72     // Do not break between regional indicator symbols.
    73     RE * GCB_8a = makeSeq({Behind(GCB_RI), Ahead(GCB_RI)});
    74     // Do not break before extending characters.
    75     RE * GCB_9 = Ahead(makeName("gcb", "ex", Name::Type::UnicodeProperty));
    76     // Do not break before SpacingMarks, or after Prepend characters.
    77     RE * GCB_9a = Ahead(makeName("gcb", "sm", Name::Type::UnicodeProperty));
    78     RE * GCB_9b = Behind(makeName("gcb", "pp", Name::Type::UnicodeProperty));
    79     RE * GCB_6_9b = makeAlt({GCB_6, GCB_7, GCB_8, GCB_8a, GCB_9, GCB_9a, GCB_9b});
    80     // Otherwise, break everywhere.
    81     RE * GCB_10 = makeSeq({Behind(makeAny()), Ahead(makeAny())});
    82 
    83     //Name * gcb = makeName("gcb", Name::Type::UnicodeProperty);
    84     property->setDefinition(makeAlt({GCB_1_5, makeDiff(GCB_10, GCB_6_9b)}));
    8540}
    8641
     
    11570            property->setDefinition(makeDiff(makeAny(), unassigned));
    11671            return true;
    117         } else if (value == "\\b{g}" || value == "\\B{g}") {
     72        } else if (value == "\\b{g}") {
    11873            generateGraphemeClusterBoundaryRule(property);
    11974            return true;
  • icGREP/icgrep-devel/icgrep/UCD/resolve_properties.h

    r5872 r5880  
    1616
    1717re::RE * UnicodeBreakRE();
    18 void generateGraphemeClusterBoundaryRule(re::Name * const &property);
    1918bool resolvePropertyDefinition(re::Name * const property);
    2019std::string resolvePropertyFunction(re::Name * const property);
  • icGREP/icgrep-devel/icgrep/re/grapheme_clusters.cpp

    r5796 r5880  
    1515#include <re/re_range.h>
    1616#include <re/printer_re.h>
     17#include <re/re_name_resolve.h>
    1718#include <vector>                  // for vector, allocator
    1819#include <llvm/Support/Casting.h>  // for dyn_cast, isa
     
    3940        if (n->getType() == Name::Type::ZeroWidth) {
    4041            const std::string nameString = n->getName();
    41             return (nameString == "\\b{g}") || (nameString == "\\B{g}");
     42            return nameString == "\\b{g}";
    4243        }
    4344        return false;
     
    7172RE * resolveGraphemeMode(RE * re, bool inGraphemeMode) {
    7273    if (isa<Name>(re)) {
    73         if (inGraphemeMode && (cast<Name>(re)->getName() == "."))
    74             return makeSeq({makeAny(), makeRep(makeSeq({makeZeroWidth("\\B{g}"), makeAny()}), 0, Rep::UNBOUNDED_REP), makeZeroWidth("\\b{g}")});
     74        if (inGraphemeMode && (cast<Name>(re)->getName() == ".")) {
     75            RE * GCB = makeZeroWidth("\\b{g}");
     76            RE * nonGCB = makeDiff(makeSeq({}), GCB);
     77            return makeSeq({makeAny(), makeRep(makeSeq({nonGCB, makeAny()}), 0, Rep::UNBOUNDED_REP), GCB});
     78        }
    7579        else return re;
    7680    }
     
    122126}
    123127
     128
     129#define Behind(x) makeLookBehindAssertion(x)
     130#define Ahead(x) makeLookAheadAssertion(x)
     131
     132void generateGraphemeClusterBoundaryRule(Name * const &property) {
     133    // 3.1.1 Grapheme Cluster Boundary Rules
     134   
     135    //    RE * GCB_Control = makeName("gcb", "cn", Name::Type::UnicodeProperty);
     136    RE * GCB_CR = makeName("gcb", "cr", Name::Type::UnicodeProperty);
     137    RE * GCB_LF = makeName("gcb", "lf", Name::Type::UnicodeProperty);
     138    RE * GCB_Control_CR_LF = makeAlt({GCB_CR, GCB_LF});
     139   
     140    // Break at the start and end of text.
     141    RE * GCB_1 = makeStart();
     142    RE * GCB_2 = makeEnd();
     143    // Do not break between a CR and LF.
     144    RE * GCB_3 = makeSeq({Behind(GCB_CR), Ahead(GCB_LF)});
     145    // Otherwise, break before and after controls.
     146    RE * GCB_4 = Behind(GCB_Control_CR_LF);
     147    RE * GCB_5 = Ahead(GCB_Control_CR_LF);
     148    RE * GCB_1_5 = makeAlt({GCB_1, GCB_2, makeDiff(makeAlt({GCB_4, GCB_5}), GCB_3)});
     149   
     150    RE * GCB_L = makeName("gcb", "l", Name::Type::UnicodeProperty);
     151    RE * GCB_V = makeName("gcb", "v", Name::Type::UnicodeProperty);
     152    RE * GCB_LV = makeName("gcb", "lv", Name::Type::UnicodeProperty);
     153    RE * GCB_LVT = makeName("gcb", "lvt", Name::Type::UnicodeProperty);
     154    RE * GCB_T = makeName("gcb", "t", Name::Type::UnicodeProperty);
     155    RE * GCB_RI = makeName("gcb", "ri", Name::Type::UnicodeProperty);
     156    // Do not break Hangul syllable sequences.
     157    RE * GCB_6 = makeSeq({Behind(GCB_L), Ahead(makeAlt({GCB_L, GCB_V, GCB_LV, GCB_LVT}))});
     158    RE * GCB_7 = makeSeq({Behind(makeAlt({GCB_LV, GCB_V})), Ahead(makeAlt({GCB_V, GCB_T}))});
     159    RE * GCB_8 = makeSeq({Behind(makeAlt({GCB_LVT, GCB_T})), Ahead(GCB_T)});
     160    // Do not break between regional indicator symbols.
     161    RE * GCB_8a = makeSeq({Behind(GCB_RI), Ahead(GCB_RI)});
     162    // Do not break before extending characters.
     163    RE * GCB_9 = Ahead(makeName("gcb", "ex", Name::Type::UnicodeProperty));
     164    // Do not break before SpacingMarks, or after Prepend characters.
     165    RE * GCB_9a = Ahead(makeName("gcb", "sm", Name::Type::UnicodeProperty));
     166    RE * GCB_9b = Behind(makeName("gcb", "pp", Name::Type::UnicodeProperty));
     167    RE * GCB_6_9b = makeAlt({GCB_6, GCB_7, GCB_8, GCB_8a, GCB_9, GCB_9a, GCB_9b});
     168    // Otherwise, break everywhere.
     169    RE * GCB_10 = makeSeq({Behind(makeAny()), Ahead(makeAny())});
     170   
     171    //Name * gcb = makeName("gcb", Name::Type::UnicodeProperty);
     172    RE * gcb = makeAlt({GCB_1_5, makeDiff(GCB_10, GCB_6_9b)});
     173    gcb = resolveUnicodeProperties(gcb);
     174    property->setDefinition(gcb);
    124175}
     176
     177}
  • icGREP/icgrep-devel/icgrep/re/grapheme_clusters.h

    r5772 r5880  
    55   
    66class RE;
    7 
     7    class Name;
     8   
    89bool hasGraphemeClusterBoundary(const RE * re);
    910   
    1011RE * resolveGraphemeMode(RE * re, bool inGraphemeMode);
     12
     13void generateGraphemeClusterBoundaryRule(Name * const &property);
    1114
    1215}
  • icGREP/icgrep-devel/icgrep/re/re_compiler.cpp

    r5872 r5880  
    160160        AlignMarkers(marker, zero, pb);
    161161        PabloAST * ze = markerVar(zero);
    162         if (nameString == "\\B{g}") {
    163             ze = pb.createNot(ze);
    164         }
    165162        return makeMarker(markerPos(marker), pb.createAnd(markerVar(marker), ze, "zerowidth"));
    166163    } else {
  • icGREP/icgrep-devel/icgrep/re/re_nullable.cpp

    r5869 r5880  
    166166        return (re_rep->getLB() == 0) || isNullable(re_rep->getRE());
    167167    } else if (const Diff * diff = dyn_cast<const Diff>(re)) {
    168         return isNullable(diff->getLH()) && !isNullable(diff->getRH());
     168        // a Diff of Seq({}) and an Assertion represents a complemented assertion.
     169        return false;
    169170    } else if (const Intersect * e = dyn_cast<const Intersect>(re)) {
    170171        return isNullable(e->getLH()) && isNullable(e->getRH());
  • icGREP/icgrep-devel/icgrep/re/re_parser.cpp

    r5835 r5880  
    11/*
    2  *  Copyright (c) 2017 International Characters.
     2 *  Copyright (c) 2018 International Characters.
    33 *  This software is licensed to the public under the Open Software License 3.0.
    44 *  icgrep is a trademark of International Characters.
     
    350350   
    351351RE * RE_Parser::parseEscapedSet() {
    352     bool complemented = false;
     352    bool complemented = atany("BDSWQP");
     353    char escapeCh = get1();
     354    if (complemented) escapeCh = tolower(escapeCh);
    353355    RE * re = nullptr;
    354     switch (*mCursor) {
    355         case 'B': complemented = true;
     356    switch (escapeCh) {
    356357        case 'b':
    357             if (*++mCursor != '{') {
     358            if (accept('{')) {
     359                if (accept("g}")) {
     360                    re = makeZeroWidth("\\b{g}");
     361                    return complemented ? makeZerowidthComplement(re) : re;
     362                } else if (accept("w}")) {
     363                    ParseFailure("\\b{w} not yet supported.");
     364                    //return complemented ? makeZerowidthComplement(re) : re;
     365                } else if (accept("l}")) {
     366                    ParseFailure("\\b{l} not yet supported.");
     367                    //return complemented ? makeZerowidthComplement(re) : re;
     368                } else if (accept("s}")) {
     369                    ParseFailure("\\b{s} not yet supported.");
     370                    //return complemented ? makeZerowidthComplement(re) : re;
     371                } else {
     372                    re = parsePropertyExpression();
     373                    require('}');
     374                    return complemented ? makeReNonBoundary(re) : makeReBoundary(re);
     375                }
     376            } else {
    358377                return complemented ? makeWordNonBoundary() : makeWordBoundary();
    359             } else {
    360                 ++mCursor;
    361                 if (isCharAhead('}')) {
    362                     switch (*mCursor) {
    363                         case 'g':
    364                             re = complemented ? makeZeroWidth("\\B{g}") : makeZeroWidth("\\b{g}");
    365                             ++mCursor;
    366                             ++mCursor;
    367                             break;
    368                         case 'w': ParseFailure("\\b{w} not yet supported.");
    369                         case 'l': ParseFailure("\\b{l} not yet supported.");
    370                         case 's': ParseFailure("\\b{s} not yet supported.");
    371 //                        default: ParseFailure("Unrecognized boundary assertion");
    372                     }
    373                 }
    374                 if (!re) {
    375                     auto propExpr = parsePropertyExpression();
    376                     if (*mCursor++ != '}') {
    377                         ParseFailure("Malformed boundary assertion");
    378                     }
    379                     re = complemented ? makeReNonBoundary(propExpr) : makeReBoundary(propExpr);
    380                 }
    381                 return re;
    382378            }
    383379        case 'd':
    384             ++mCursor;
    385             return makeDigitSet();
    386         case 'D':
    387             ++mCursor;
    388             return makeComplement(makeDigitSet());
     380            re = makeDigitSet();
     381            return complemented ? makeComplement(re) : re;
    389382        case 's':
    390             ++mCursor;
    391             return makeWhitespaceSet();
    392         case 'S':
    393             ++mCursor;
    394             return makeComplement(makeWhitespaceSet());
     383            re = makeWhitespaceSet();
     384            return complemented ? makeComplement(re) : re;
    395385        case 'w':
    396             ++mCursor;
    397             return makeWordSet();
    398         case 'W':
    399             ++mCursor;
    400             return makeComplement(makeWordSet());
    401         case 'Q':
    402             complemented = true;
     386            re = makeWordSet();
     387            return complemented ? makeComplement(re) : re;
    403388        case 'q':
    404             if (*++mCursor != '{') {
    405                 ParseFailure("Malformed grapheme cluster expression");
    406             }
    407             ++mCursor;
     389            require('{');
    408390            ParseFailure("Literal grapheme cluster expressions not yet supported.");
    409             if (*mCursor != '}') {
    410                 ParseFailure("Malformed grapheme cluster expression");
    411             }
    412             ++mCursor;
     391            require('}');
    413392            return complemented ? makeComplement(re) : re;
    414         case 'P':
    415             complemented = true;
    416393        case 'p':
    417             if (*++mCursor != '{') {
    418                 ParseFailure("Malformed property expression");
    419             }
    420             ++mCursor;
     394            require('{');
    421395            re = parsePropertyExpression();
    422             if (*mCursor != '}') {
    423                 ParseFailure("Malformed property expression");
    424             }
    425             ++mCursor;
     396            require('}');
    426397            return complemented ? makeComplement(re) : re;
    427         case 'X':
     398        case 'X': {
    428399            // \X is equivalent to ".+?\b{g}"; proceed the minimal number of characters (but at least one)
    429400            // to get to the next extended grapheme cluster boundary.
    430             ++mCursor;
    431             return makeSeq({makeAny(), makeRep(makeSeq({makeZeroWidth("\\B{g}"), makeAny()}), 0, Rep::UNBOUNDED_REP), makeZeroWidth("\\b{g}")});
     401            RE * GCB = makeZeroWidth("\\b{g}");
     402            return makeSeq({makeAny(), makeRep(makeSeq({makeZerowidthComplement(GCB), makeAny()}), 0, Rep::UNBOUNDED_REP), GCB});
     403        }
    432404        case 'N':
    433             ++mCursor;
    434405            re = parseNamePatternExpression();
    435406            assert (re);
    436407            return re;
    437408        case '<':
    438             ++mCursor;
    439409            return makeWordBegin();
    440410        case '>':
    441             ++mCursor;
    442411            return makeWordEnd();
    443412        default:
     
    507476    }
    508477    return s.str();
    509 }
    510 
    511 bool RE_Parser::isCharAhead(char c) {
    512     if (mCursor.remaining() < 2) {
    513         return false;
    514     }
    515     auto nextCursor = mCursor.pos() + 1;
    516     return *nextCursor == c;
    517478}
    518479
     
    811772}
    812773
     774RE * RE_Parser::makeZerowidthComplement(RE * s) {
     775    return makeDiff(makeSeq({}), s);
     776}
     777
    813778RE * RE_Parser::makeWordBoundary() {
    814779    Name * wordC = makeWordSet();
  • icGREP/icgrep-devel/icgrep/re/re_parser.h

    r5835 r5880  
    11/*
    2  *  Copyright (c) 2014-6 International Characters.
     2 *  Copyright (c) 2014-8 International Characters.
    33 *  This software is licensed to the public under the Open Software License 3.0.
    44 *  icgrep is a trademark of International Characters.
     
    204204
    205205    RE * makeComplement(RE * s);
     206    RE * makeZerowidthComplement(RE * s);
     207
    206208    RE * makeWordBoundary();
    207209    RE * makeWordNonBoundary();
     
    242244    static std::string canonicalize(const cursor_t begin, const cursor_t end);
    243245
    244     bool isCharAhead(char c);
    245 
    246246    LLVM_ATTRIBUTE_NORETURN void InvalidUTF8Encoding();
    247247
Note: See TracChangeset for help on using the changeset viewer.