Changeset 6245


Ignore:
Timestamp:
Dec 19, 2018, 2:39:18 PM (3 months ago)
Author:
cameron
Message:

Fixes for grapheme cluster breaking

File:
1 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/re/grapheme_clusters.cpp

    r6194 r6245  
    147147#define notBehind(x) makeNegativeLookBehindAssertion(x)
    148148#define Ahead(x) makeLookAheadAssertion(x)
     149#define notAhead(x) makeNegativeLookAheadAssertion(x)
    149150
    150151RE * generateGraphemeClusterBoundaryRule(bool extendedGraphemeClusters) {
     
    167168    RE * GCB_LF = makeName("gcb", "lf", Name::Type::UnicodeProperty);
    168169    RE * GCB_Control = makeName("gcb", "control", Name::Type::UnicodeProperty);
    169     // Any single character that is not a control, CR or LF.
    170     RE * nonControl = makeDiff(makeAny(), makeAlt({GCB_CR, GCB_LF, GCB_Control}));
    171 
    172     // Now the various rules excluding grapheme cluster breaks.
    173    
    174     // There is no break for empty text.  (Inference from rules GB1, GB2).
    175     RE * GCX_1 = makeSeq({makeSOT(), makeEOT()});
    176    
    177     // Do not break between a CR and LF.  (Rule GB 3)
    178     // RE * GCX_3 = makeSeq({Behind(GCB_CR), Ahead(GCB_LF)});
     170    RE * GCB_Control_CR_LF = makeAlt({GCB_Control, GCB_CR, GCB_LF});
     171   
     172    // Break at the start and end of text.
     173    RE * GCB_1 = makeSOT();
     174    RE * GCB_2 = makeEOT();
     175    // Do not break between a CR and LF.
     176    RE * GCB_3 = makeSeq({Behind(GCB_CR), Ahead(GCB_LF)});
     177    // Otherwise, break before and after controls.
     178    RE * GCB_4 = Behind(GCB_Control_CR_LF);
     179    RE * GCB_5 = Ahead(GCB_Control_CR_LF);
     180    RE * GCB_1_5 = makeAlt({GCB_1, GCB_2, makeDiff(makeAlt({GCB_4, GCB_5}), GCB_3)});
     181   
    179182   
    180183    // Do not break Hangul syllable sequences.
     
    191194    RE * GCB_EX = makeName("gcb", "ex", Name::Type::UnicodeProperty);
    192195    RE * GCB_ZWJ = makeName("gcb", "zwj", Name::Type::UnicodeProperty);
    193     RE * GCX_9 = makeSeq({Behind(nonControl), Ahead(makeAlt({GCB_EX, GCB_ZWJ}))});
     196    RE * GCX_9 = makeSeq({notBehind(GCB_Control_CR_LF), Ahead(makeAlt({GCB_EX, GCB_ZWJ}))});
    194197
    195198    if (extendedGraphemeClusters) {
    196199        RE * GCB_SpacingMark = makeName("gcb", "sm", Name::Type::UnicodeProperty);
    197200        RE * GCB_Prepend = makeName("gcb", "pp", Name::Type::UnicodeProperty);
    198         RE * GCX_9a = makeSeq({Behind(nonControl), Ahead(GCB_SpacingMark)});
    199         RE * GCX_9b = makeSeq({Behind(GCB_Prepend), Ahead(nonControl)});
     201        RE * GCX_9a = makeSeq({notBehind(GCB_Control_CR_LF), Ahead(GCB_SpacingMark)});
     202        RE * GCX_9b = makeSeq({Behind(GCB_Prepend), notAhead(GCB_Control_CR_LF)});
    200203        GCX_9 = makeAlt({GCX_9, GCX_9a, GCX_9b});
    201204    }
     
    211214   
    212215    //Name * gcb = makeName("gcb", Name::Type::UnicodeProperty);
    213     RE * GCX = makeAlt({GCX_1, GCX_6, GCX_7, GCX_8, GCX_9, GCX_11, GCX_12_13});
    214    
    215     RE * gcb = makeDiff(makeSeq(), GCX);
     216    RE * GCX = makeAlt({GCX_6, GCX_7, GCX_8, GCX_9, GCX_11, GCX_12_13});
     217   
     218    // Otherwise, break everywhere.
     219    RE * GCB_999 = makeSeq({Behind(makeAny()), Ahead(makeAny())});
     220   
     221    //Name * gcb = makeName("gcb", Name::Type::UnicodeProperty);
     222    RE * gcb = makeAlt({GCB_1_5, makeDiff(GCB_999, GCX)});
    216223    return gcb;
    217224}
Note: See TracChangeset for help on using the changeset viewer.