Ignore:
Timestamp:
Feb 24, 2018, 9:33:57 AM (14 months ago)
Author:
cameron
Message:

Grapheme cluster support: represent B{g} using Seq{} - b{g}; parser cleanups

File:
1 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/re/grapheme_clusters.cpp

    r5796 r5880  
    1515#include <re/re_range.h>
    1616#include <re/printer_re.h>
     17#include <re/re_name_resolve.h>
    1718#include <vector>                  // for vector, allocator
    1819#include <llvm/Support/Casting.h>  // for dyn_cast, isa
     
    3940        if (n->getType() == Name::Type::ZeroWidth) {
    4041            const std::string nameString = n->getName();
    41             return (nameString == "\\b{g}") || (nameString == "\\B{g}");
     42            return nameString == "\\b{g}";
    4243        }
    4344        return false;
     
    7172RE * resolveGraphemeMode(RE * re, bool inGraphemeMode) {
    7273    if (isa<Name>(re)) {
    73         if (inGraphemeMode && (cast<Name>(re)->getName() == "."))
    74             return makeSeq({makeAny(), makeRep(makeSeq({makeZeroWidth("\\B{g}"), makeAny()}), 0, Rep::UNBOUNDED_REP), makeZeroWidth("\\b{g}")});
     74        if (inGraphemeMode && (cast<Name>(re)->getName() == ".")) {
     75            RE * GCB = makeZeroWidth("\\b{g}");
     76            RE * nonGCB = makeDiff(makeSeq({}), GCB);
     77            return makeSeq({makeAny(), makeRep(makeSeq({nonGCB, makeAny()}), 0, Rep::UNBOUNDED_REP), GCB});
     78        }
    7579        else return re;
    7680    }
     
    122126}
    123127
     128
     // Shorthands used by the grapheme cluster boundary rules that follow:
     // Behind(x) expands to makeLookBehindAssertion(x), Ahead(x) to makeLookAheadAssertion(x).
     // NOTE(review): no matching #undef is visible in this view, so the macros
     // presumably stay defined for the rest of the translation unit -- confirm.
     129#define Behind(x) makeLookBehindAssertion(x)
     130#define Ahead(x) makeLookAheadAssertion(x)
     131
     // Builds the regular expression describing grapheme cluster boundary
     // positions and installs it as the definition of `property`.
     //
     // The expression is assembled from the numbered rules below (GCB_1 .. GCB_10;
     // presumably the GB rules of Unicode UAX #29 -- confirm), each written with
     // the Behind()/Ahead() assertion macros:
     //   - GCB_1_5  : positions where rules 1-5 place a break;
     //   - GCB_6_9b : positions where rules 6-9b suppress a break;
     //   - GCB_10   : the default "break everywhere" rule.
     // The combined expression GCB_1_5 | (GCB_10 - GCB_6_9b) is passed through
     // resolveUnicodeProperties() and then stored via property->setDefinition().
     //
     // property: the Name that receives the generated definition; it is
     //           dereferenced unconditionally, so it must not be null.
     132void generateGraphemeClusterBoundaryRule(Name * const &property) {
     133    // 3.1.1 Grapheme Cluster Boundary Rules
     134   
     135    //    RE * GCB_Control = makeName("gcb", "cn", Name::Type::UnicodeProperty);
     136    RE * GCB_CR = makeName("gcb", "cr", Name::Type::UnicodeProperty);
     137    RE * GCB_LF = makeName("gcb", "lf", Name::Type::UnicodeProperty);
            // NOTE(review): despite its name, this alternation holds only CR and LF;
            // the Control class (gcb=cn) above is commented out. Confirm that leaving
            // Control out of rules 4 and 5 is intended.
     138    RE * GCB_Control_CR_LF = makeAlt({GCB_CR, GCB_LF});
     139   
     140    // Break at the start and end of text.
     141    RE * GCB_1 = makeStart();
     142    RE * GCB_2 = makeEnd();
     143    // Do not break between a CR and LF.
     144    RE * GCB_3 = makeSeq({Behind(GCB_CR), Ahead(GCB_LF)});
     145    // Otherwise, break before and after controls.
     146    RE * GCB_4 = Behind(GCB_Control_CR_LF);
     147    RE * GCB_5 = Ahead(GCB_Control_CR_LF);
            // Rules 1-5 combined: start of text, end of text, or a position adjacent
            // to CR/LF (rules 4, 5) that is not between a CR and an LF (rule 3).
     148    RE * GCB_1_5 = makeAlt({GCB_1, GCB_2, makeDiff(makeAlt({GCB_4, GCB_5}), GCB_3)});
     149   
     150    RE * GCB_L = makeName("gcb", "l", Name::Type::UnicodeProperty);
     151    RE * GCB_V = makeName("gcb", "v", Name::Type::UnicodeProperty);
     152    RE * GCB_LV = makeName("gcb", "lv", Name::Type::UnicodeProperty);
     153    RE * GCB_LVT = makeName("gcb", "lvt", Name::Type::UnicodeProperty);
     154    RE * GCB_T = makeName("gcb", "t", Name::Type::UnicodeProperty);
     155    RE * GCB_RI = makeName("gcb", "ri", Name::Type::UnicodeProperty);
     156    // Do not break Hangul syllable sequences.
     157    RE * GCB_6 = makeSeq({Behind(GCB_L), Ahead(makeAlt({GCB_L, GCB_V, GCB_LV, GCB_LVT}))});
     158    RE * GCB_7 = makeSeq({Behind(makeAlt({GCB_LV, GCB_V})), Ahead(makeAlt({GCB_V, GCB_T}))});
     159    RE * GCB_8 = makeSeq({Behind(makeAlt({GCB_LVT, GCB_T})), Ahead(GCB_T)});
     160    // Do not break between regional indicator symbols.
     161    RE * GCB_8a = makeSeq({Behind(GCB_RI), Ahead(GCB_RI)});
     162    // Do not break before extending characters.
     163    RE * GCB_9 = Ahead(makeName("gcb", "ex", Name::Type::UnicodeProperty));
     164    // Do not break before SpacingMarks, or after Prepend characters.
     165    RE * GCB_9a = Ahead(makeName("gcb", "sm", Name::Type::UnicodeProperty));
     166    RE * GCB_9b = Behind(makeName("gcb", "pp", Name::Type::UnicodeProperty));
            // Union of every position at which rules 6-9b suppress a break.
     167    RE * GCB_6_9b = makeAlt({GCB_6, GCB_7, GCB_8, GCB_8a, GCB_9, GCB_9a, GCB_9b});
     168    // Otherwise, break everywhere.
     169    RE * GCB_10 = makeSeq({Behind(makeAny()), Ahead(makeAny())});
     170   
     171    //Name * gcb = makeName("gcb", Name::Type::UnicodeProperty);
            // Final rule: the rule 1-5 breaks, plus every default break (rule 10)
            // that is not suppressed by rules 6-9b.
     172    RE * gcb = makeAlt({GCB_1_5, makeDiff(GCB_10, GCB_6_9b)});
            // Resolve the gcb property names before storing the definition.
     173    gcb = resolveUnicodeProperties(gcb);
     174    property->setDefinition(gcb);
    124175}
     176
     177}
Note: See TracChangeset for help on using the changeset viewer.