Ignore:
Timestamp:
Feb 24, 2018, 9:33:57 AM (16 months ago)
Author:
cameron
Message:

Grapheme cluster support: represent \B{g} using Seq{} - \b{g}; parser cleanups

Location:
icGREP/icgrep-devel/icgrep/UCD
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/UCD/resolve_properties.cpp

    r5872 r5880  
    11/*
    2  *  Copyright (c) 2015 International Characters.
     2 *  Copyright (c) 2018 International Characters.
    33 *  This software is licensed to the public under the Open Software License 3.0.
    44 *  icgrep is a trademark of International Characters.
     
    1717#include <re/re_parser.h>
    1818#include <re/re_name_resolve.h>
     19#include <re/grapheme_clusters.h>
    1920#include <re/re_compiler.h>
    2021#include "UCD/PropertyAliases.h"
     
    3334    llvm::report_fatal_error(errmsg);
    3435}
    35 
    36 #define Behind(x) makeLookBehindAssertion(x)
    37 #define Ahead(x) makeLookAheadAssertion(x)
    3836   
    3937   
    4038RE * UnicodeBreakRE() {
    4139    return makeAlt({makeCC(0x0A, 0x0C), makeSeq({makeCC(0x0D), makeCC(0x0A)}), makeSeq({makeCC(0x0D), makeNegativeLookAheadAssertion(makeCC(0x0A))})});
    42 }
    43 
    44 void generateGraphemeClusterBoundaryRule(Name * const &property) {
    45     // 3.1.1 Grapheme Cluster Boundary Rules
    46 
    47 //    RE * GCB_Control = makeName("gcb", "cn", Name::Type::UnicodeProperty);
    48     RE * GCB_CR = makeName("gcb", "cr", Name::Type::UnicodeProperty);
    49     RE * GCB_LF = makeName("gcb", "lf", Name::Type::UnicodeProperty);
    50     RE * GCB_Control_CR_LF = makeAlt({GCB_CR, GCB_LF});
    51 
    52     // Break at the start and end of text.
    53     RE * GCB_1 = makeStart();
    54     RE * GCB_2 = makeEnd();
    55     // Do not break between a CR and LF.
    56     RE * GCB_3 = makeSeq({Behind(GCB_CR), Ahead(GCB_LF)});
    57     // Otherwise, break before and after controls.
    58     RE * GCB_4 = Behind(GCB_Control_CR_LF);
    59     RE * GCB_5 = Ahead(GCB_Control_CR_LF);
    60     RE * GCB_1_5 = makeAlt({GCB_1, GCB_2, makeDiff(makeAlt({GCB_4, GCB_5}), GCB_3)});
    61 
    62     RE * GCB_L = makeName("gcb", "l", Name::Type::UnicodeProperty);
    63     RE * GCB_V = makeName("gcb", "v", Name::Type::UnicodeProperty);
    64     RE * GCB_LV = makeName("gcb", "lv", Name::Type::UnicodeProperty);
    65     RE * GCB_LVT = makeName("gcb", "lvt", Name::Type::UnicodeProperty);
    66     RE * GCB_T = makeName("gcb", "t", Name::Type::UnicodeProperty);
    67     RE * GCB_RI = makeName("gcb", "ri", Name::Type::UnicodeProperty);
    68     // Do not break Hangul syllable sequences.
    69     RE * GCB_6 = makeSeq({Behind(GCB_L), Ahead(makeAlt({GCB_L, GCB_V, GCB_LV, GCB_LVT}))});
    70     RE * GCB_7 = makeSeq({Behind(makeAlt({GCB_LV, GCB_V})), Ahead(makeAlt({GCB_V, GCB_T}))});
    71     RE * GCB_8 = makeSeq({Behind(makeAlt({GCB_LVT, GCB_T})), Ahead(GCB_T)});
    72     // Do not break between regional indicator symbols.
    73     RE * GCB_8a = makeSeq({Behind(GCB_RI), Ahead(GCB_RI)});
    74     // Do not break before extending characters.
    75     RE * GCB_9 = Ahead(makeName("gcb", "ex", Name::Type::UnicodeProperty));
    76     // Do not break before SpacingMarks, or after Prepend characters.
    77     RE * GCB_9a = Ahead(makeName("gcb", "sm", Name::Type::UnicodeProperty));
    78     RE * GCB_9b = Behind(makeName("gcb", "pp", Name::Type::UnicodeProperty));
    79     RE * GCB_6_9b = makeAlt({GCB_6, GCB_7, GCB_8, GCB_8a, GCB_9, GCB_9a, GCB_9b});
    80     // Otherwise, break everywhere.
    81     RE * GCB_10 = makeSeq({Behind(makeAny()), Ahead(makeAny())});
    82 
    83     //Name * gcb = makeName("gcb", Name::Type::UnicodeProperty);
    84     property->setDefinition(makeAlt({GCB_1_5, makeDiff(GCB_10, GCB_6_9b)}));
    8540}
    8641
     
    11570            property->setDefinition(makeDiff(makeAny(), unassigned));
    11671            return true;
    117         } else if (value == "\\b{g}" || value == "\\B{g}") {
     72        } else if (value == "\\b{g}") {
    11873            generateGraphemeClusterBoundaryRule(property);
    11974            return true;
  • icGREP/icgrep-devel/icgrep/UCD/resolve_properties.h

    r5872 r5880  
    1616
    1717re::RE * UnicodeBreakRE();
    18 void generateGraphemeClusterBoundaryRule(re::Name * const &property);
    1918bool resolvePropertyDefinition(re::Name * const property);
    2019std::string resolvePropertyFunction(re::Name * const property);
Note: See TracChangeset for help on using the changeset viewer.