Changeset 6191 for icGREP


Ignore:
Timestamp:
Nov 5, 2018, 10:33:39 AM (4 months ago)
Author:
cameron
Message:

Update grapheme cluster rules - CRLF and ExtendedPictographic still to be addressed

Location:
icGREP/icgrep-devel/icgrep/re
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/re/grapheme_clusters.cpp

    r6177 r6191  
    145145
    146146#define Behind(x) makeLookBehindAssertion(x)
     147#define notBehind(x) makeNegativeLookBehindAssertion(x)
    147148#define Ahead(x) makeLookAheadAssertion(x)
    148149
    149 RE * generateGraphemeClusterBoundaryRule() {
     150RE * generateGraphemeClusterBoundaryRule(bool extendedGraphemeClusters) {
    150151    // 3.1.1 Grapheme Cluster Boundary Rules
    151    
    152     //    RE * GCB_Control = makeName("gcb", "cn", Name::Type::UnicodeProperty);
     152    // Grapheme cluster boundary rules define a number of contexts where
     153    // breaks are not permitted.  In the following definitions, the points
     154    // at which breaks are not permitted are identified by the
     155    // definitions marked GCX.
     156   
     157    // Rules GB1, GB2, GB4 and GB5 define rules where breaks occur overriding
     158    // later rules (specifically GB9, GB9a, GB9b).
     159    // Rules GB9 and GB9a are overridden by GB1 and GB4, to allow breaks
     160    // at start of text or after any control|CR|LF.  This is equivalent
     161    // to stating that the lookbehind context for GB9 and GB9a is any
     162    // non-control character (any actual character not in control|CR|LF).
     163    // Similarly, the overriding of GB9b simplifies to a lookahead assertion
     164    // on a noncontrol.
     165    //
    153166    RE * GCB_CR = makeName("gcb", "cr", Name::Type::UnicodeProperty);
    154167    RE * GCB_LF = makeName("gcb", "lf", Name::Type::UnicodeProperty);
    155     RE * GCB_Control_CR_LF = makeAlt({GCB_CR, GCB_LF});
    156    
    157     // Break at the start and end of text.
    158     RE * GCB_1 = makeSOT();
    159     RE * GCB_2 = makeEOT();
    160     // Do not break between a CR and LF.
    161     RE * GCB_3 = makeSeq({Behind(GCB_CR), Ahead(GCB_LF)});
    162     // Otherwise, break before and after controls.
    163     RE * GCB_4 = Behind(GCB_Control_CR_LF);
    164     RE * GCB_5 = Ahead(GCB_Control_CR_LF);
    165     RE * GCB_1_5 = makeAlt({GCB_1, GCB_2, makeDiff(makeAlt({GCB_4, GCB_5}), GCB_3)});
    166    
     168    RE * GCB_Control = makeName("gcb", "control", Name::Type::UnicodeProperty);
     169    // Any single character that is not a control, CR or LF.
     170    RE * nonControl = makeDiff(makeAny(), makeAlt({GCB_CR, GCB_LF, GCB_Control}));
     171
     172    // Now the various rules excluding grapheme cluster breaks.
     173   
     174    // There is no break for empty text.  (Inference from rules GB1, GB2).
     175    RE * GCX_1 = makeSeq({makeSOT(), makeEOT()});
     176   
     177    // Do not break between a CR and LF.  (Rule GB 3)
     178    // RE * GCX_3 = makeSeq({Behind(GCB_CR), Ahead(GCB_LF)});
     179   
     180    // Do not break Hangul syllable sequences.
    167181    RE * GCB_L = makeName("gcb", "l", Name::Type::UnicodeProperty);
    168182    RE * GCB_V = makeName("gcb", "v", Name::Type::UnicodeProperty);
     
    170184    RE * GCB_LVT = makeName("gcb", "lvt", Name::Type::UnicodeProperty);
    171185    RE * GCB_T = makeName("gcb", "t", Name::Type::UnicodeProperty);
     186    RE * GCX_6 = makeSeq({Behind(GCB_L), Ahead(makeAlt({GCB_L, GCB_V, GCB_LV, GCB_LVT}))});
     187    RE * GCX_7 = makeSeq({Behind(makeAlt({GCB_LV, GCB_V})), Ahead(makeAlt({GCB_V, GCB_T}))});
     188    RE * GCX_8 = makeSeq({Behind(makeAlt({GCB_LVT, GCB_T})), Ahead(GCB_T)});
     189   
     190    // Do not break before extenders or zero-width joiners.
     191    RE * GCB_EX = makeName("gcb", "ex", Name::Type::UnicodeProperty);
     192    RE * GCB_ZWJ = makeName("gcb", "zwj", Name::Type::UnicodeProperty);
     193    RE * GCX_9 = makeSeq({Behind(nonControl), Ahead(makeAlt({GCB_EX, GCB_ZWJ}))});
     194
     195    if (extendedGraphemeClusters) {
     196        RE * GCB_SpacingMark = makeName("gcb", "sm", Name::Type::UnicodeProperty);
     197        RE * GCB_Prepend = makeName("gcb", "pp", Name::Type::UnicodeProperty);
     198        RE * GCX_9a = makeSeq({Behind(nonControl), Ahead(GCB_SpacingMark)});
     199        RE * GCX_9b = makeSeq({Behind(GCB_Prepend), Ahead(nonControl)});
     200        GCX_9 = makeAlt({GCX_9, GCX_9a, GCX_9b});
     201    }
     202
     203    // RE * ExtendedPictographic = makeName("Extended_Pictographic", Name::Type::UnicodeProperty));
     204    // RE * EmojiSeq = makeSeq({ExtendedPictographic, makeRep(GCB_EX, 0, Rep::UNBOUNDED_REP), GCB_ZWJ});
     205    // RE * GCX_11 = makeSeq({Behind(EmojiSeq), Ahead(ExtendedPictographic)});
     206   
    172207    RE * GCB_RI = makeName("gcb", "ri", Name::Type::UnicodeProperty);
    173     // Do not break Hangul syllable sequences.
    174     RE * GCB_6 = makeSeq({Behind(GCB_L), Ahead(makeAlt({GCB_L, GCB_V, GCB_LV, GCB_LVT}))});
    175     RE * GCB_7 = makeSeq({Behind(makeAlt({GCB_LV, GCB_V})), Ahead(makeAlt({GCB_V, GCB_T}))});
    176     RE * GCB_8 = makeSeq({Behind(makeAlt({GCB_LVT, GCB_T})), Ahead(GCB_T)});
    177     // Do not break between regional indicator symbols.
    178     RE * GCB_8a = makeSeq({Behind(GCB_RI), Ahead(GCB_RI)});
    179     // Do not break before extending characters.
    180     RE * GCB_9 = Ahead(makeName("gcb", "ex", Name::Type::UnicodeProperty));
    181     // Do not break before SpacingMarks, or after Prepend characters.
    182     RE * GCB_9a = Ahead(makeName("gcb", "sm", Name::Type::UnicodeProperty));
    183     RE * GCB_9b = Behind(makeName("gcb", "pp", Name::Type::UnicodeProperty));
    184     RE * GCB_6_9b = makeAlt({GCB_6, GCB_7, GCB_8, GCB_8a, GCB_9, GCB_9a, GCB_9b});
    185     // Otherwise, break everywhere.
    186     RE * GCB_10 = makeSeq({Behind(makeAny()), Ahead(makeAny())});
     208    // Note: notBehind(RI) == sot | [^RI]
     209    RE * odd_RI_seq = makeSeq({notBehind(GCB_RI), makeRep(makeSeq({GCB_RI, GCB_RI}), 0, Rep::UNBOUNDED_REP), GCB_RI});
     210    RE * GCX_12_13 = makeSeq({Behind(odd_RI_seq), Ahead(GCB_RI)});
    187211   
    188212    //Name * gcb = makeName("gcb", Name::Type::UnicodeProperty);
    189     RE * gcb = makeAlt({GCB_1_5, makeDiff(GCB_10, GCB_6_9b)});
     213    RE * GCX = makeAlt({GCX_1, GCX_6, GCX_7, GCX_8, GCX_9, GCX_12_13});
     214   
     215    RE * gcb = makeDiff(makeSeq(), GCX);
    190216    return gcb;
    191217}
    192218
    193 }
     219
     220}
  • icGREP/icgrep-devel/icgrep/re/grapheme_clusters.h

    r6169 r6191  
    1111RE * resolveGraphemeMode(RE * re, bool inGraphemeMode);
    1212
    13 RE * generateGraphemeClusterBoundaryRule();
     13RE * generateGraphemeClusterBoundaryRule(bool extendedGraphemeClusters = true);
    1414
    1515}
Note: See TracChangeset for help on using the changeset viewer.