Changeset 5672


Ignore:
Timestamp:
Oct 6, 2017, 11:36:55 AM (20 months ago)
Author:
cameron
Message:

StringOverride properties (simple case conversion vs. full case conversion)

Location:
icGREP/icgrep-devel
Files:
8 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/UCD-scripts/UCD_parser.py

    r5671 r5672  
    6767            property_object_map[property_code] = EnumeratedPropertyObject()
    6868        elif property_kind == "String":
    69             property_object_map[property_code] = StringPropertyObject()
     69            if property_code in ["uc", "lc", "tc", "cf"]:
     70                property_object_map[property_code] = StringOverridePropertyObject("s" + property_code)
     71            else:
     72                property_object_map[property_code] = StringPropertyObject()
    7073        elif property_kind == "Numeric":
    7174            property_object_map[property_code] = NumericPropertyObject()
     
    398401
    399402def parse_SpecialCasing_txt(property_object_map):
    400     data_records = []
    401403    f = open(UCD_config.UCD_src_dir + "/SpecialCasing.txt")
    402404    lines = f.readlines()
     
    418420    property_object_map['tc'].finalizeProperty()
    419421
     422
     423# CaseFolding.txt has four types of fold entries:
     424# S, C, F, T:  Simple, Common, Full and Turkic.
     425# The SimpleCaseFold property is the set of mappings S+C.
     426# The FullCaseFold property is the set of mappings F+C.
     427# There may be multiple entries per codepoint (e.g. both an S and an F entry).
     428
     429def parse_CaseFolding_txt():
     430    fold_map = {}
     431    f = open(UCD_config.UCD_src_dir + "/" + 'CaseFolding.txt')
     432    lines = f.readlines()
     433    for t in lines:
     434        if UCD_skip.match(t): continue  # skip comment and blank lines
     435        (cp, cp_hi, fields) = parse_data_record(t)
     436        (fold_type, fold_val) = (fields[0], fields[1])
     437        if not fold_type in fold_map: fold_map[fold_type] = {}
     438        if fold_type == 'S' or fold_type == 'C':
     439            # fold value is guaranteed to be a single codepoint
     440            fold_val = int(fold_val, 16)
     441        else:
     442            fold_val = [int(x, 16) for x in fold_val.split(" ")]
     443        fold_map[fold_type][cp] = fold_val
     444    return fold_map
     445
  • icGREP/icgrep-devel/UCD-scripts/UCD_properties.py

    r5671 r5672  
    8181    reflexive_set_ranges = cformat.multiline_fill(['[%04x, %04x]' % (lo, hi) for (lo, hi) in uset_to_range_list(reflexive_set)], ',', 8),
    8282    reflexive_set_value = reflexive_set.showC(12),
     83    explicitly_defined_cp_count = len(cps),
     84    explicitly_defined_cps = cformat.multiline_fill(['0x%04x' % cp for cp in cps], ',', 8)
     85    ))
     86
     87def emit_string_override_property(f, property_code, overridden_code, override_set, cp_value_map):
     88    s = string.Template(r"""    namespace ${prop_enum_up}_ns {
     89        /** Code Point Ranges for ${prop_enum} overriding values from ${overridden}
     90        ${overridden_set_ranges}**/
     91
     92        const UnicodeSet overridden_set
     93        ${overridden_set_value};
     94
     95        const unsigned buffer_length = ${buffer_length};
     96        const static char __attribute__ ((aligned (32))) string_buffer[${allocation_length}] = u8R"__(${string_buffer})__";
     97
     98        const static std::vector<codepoint_t> defined_cps = {
     99        ${explicitly_defined_cps}};
     100        static StringOverridePropertyObject property_object(${prop_enum},
     101                                                    ${overridden}_ns::property_object,
     102                                                    overridden_set,
     103                                                    static_cast<const char *>(string_buffer),
     104                                                    buffer_length,
     105                                                    defined_cps);
     106    }
     107""")
     108    cps = sorted(cp_value_map.keys())
     109    string_buffer = ""
     110    for cp in cps:
     111        string_buffer += cp_value_map[cp] + "\n"
     112    buffer_length = len(string_buffer.encode("utf-8"))
     113    f.write(s.substitute(prop_enum = property_code,
     114    prop_enum_up = property_code.upper(),
     115    overridden = overridden_code.upper(),
     116    string_buffer = string_buffer,
     117    buffer_length = buffer_length,
     118    allocation_length = (buffer_length + 255) & -256,
     119    overridden_set_ranges = cformat.multiline_fill(['[%04x, %04x]' % (lo, hi) for (lo, hi) in uset_to_range_list(override_set)], ',', 8),
     120    overridden_set_value = override_set.showC(12),
    83121    explicitly_defined_cp_count = len(cps),
    84122    explicitly_defined_cps = cformat.multiline_fill(['0x%04x' % cp for cp in cps], ',', 8)
     
    239277        elif isinstance(property_object, StringPropertyObject):
    240278            emit_string_property(f, property_code, property_object.null_str_set, property_object.reflexive_set, property_object.cp_value_map)
     279        elif isinstance(property_object, StringOverridePropertyObject):
     280            emit_string_override_property(f, property_code, property_object.overridden_code, property_object.overridden_set, property_object.cp_value_map)
    241281        elif isinstance(property_object, NumericPropertyObject):
    242282            emit_numeric_property(f, property_code, property_object.NaN_set, property_object.cp_value_map)
     
    305345        parse_SpecialCasing_txt(self.property_object_map)
    306346        f = cformat.open_header_file_for_write(basename)
    307         cformat.write_imports(f, ['"PropertyAliases.h"', '"PropertyObjects.h"', '"PropertyValueAliases.h"', '"unicode_set.h"'])
     347        cformat.write_imports(f, ['"PropertyAliases.h"', '"PropertyObjects.h"', '"PropertyValueAliases.h"', '"UnicodeData.h"', '"unicode_set.h"'])
    308348        f.write("\nnamespace UCD {\n")
    309349        for p in ['lc', 'uc', 'tc']:
  • icGREP/icgrep-devel/UCD-scripts/UCD_property_objects.py

    r5670 r5672  
    231231            self.null_str_set = uset_union(self.null_str_set, uset_complement(uset_union(explicitly_defined_cps, self.reflexive_set)))
    232232
     233class StringOverridePropertyObject(PropertyObject):
     234    def __init__(self, overridden_code):
     235        PropertyObject.__init__(self)
     236        self.cp_value_map = {}
     237        self.overridden_code = overridden_code
     238        self.overridden_set = empty_uset()
     239       
     240    def getPropertyKind(self):
     241        return "StringOverride"
     242
     243    def addDataRecord(self, cp_lo, cp_hi, stringValue):
     244        if codepoint_String_regexp.match(stringValue):
     245            s = ""
     246            for cp in [int(x, 16) for x in stringValue.split(' ')]:
     247                s += chr(cp)
     248            stringValue = s
     249        else:
     250            raise Exception("Expecting codepoint string, but got " + stringValue)
     251        self.cp_value_map[cp] = stringValue
     252
     253    def finalizeProperty(self):
     254        explicitly_defined_cps = empty_uset()
     255        for cp in self.cp_value_map.keys():
     256            explicitly_defined_cps = uset_union(explicitly_defined_cps, singleton_uset(cp))
     257        self.overridden_set = explicitly_defined_cps
     258
    233259class ObsoletePropertyObject(PropertyObject):
    234260    def __init__(self):
  • icGREP/icgrep-devel/UCD-scripts/casefold.py

    r5653 r5672  
    1313import UCD_config
    1414from unicode_set import *
    15 
    16 
    17 
    18 #
    19 #  Processing files of the UCD
    20 #
    21 #  General format for skippable comments, blank lines
    22 UCD_skip = re.compile("^#.*$|^\s*$")
    23 
    24 #
    25 #  UCD Property File Format 4: property aliases
    26 #  PropertyAliases.txt
    27 #
    28 UCD_case_fold_regexp = re.compile("^([0-9A-F]{4,6})\s*;\s*([CSFT]);\s*((?:[-A-Za-z0-9_]+\s+)*[-A-Za-z0-9_]+)\s*(?:[;#]|$)")
    29 
    30 def parse_CaseFolding_txt():
    31    fold_type = {}
    32    fold_value = {}
    33    f = open(UCD_config.UCD_src_dir + "/" + 'CaseFolding.txt')
    34    lines = f.readlines()
    35    for t in lines:
    36       if UCD_skip.match(t): continue  # skip comment and blank lines
    37       m = UCD_case_fold_regexp.match(t)
    38       if not m: raise Exception("Unknown case fold syntax: %s" % t)
    39       codepoint = int(m.group(1), 16)
    40       fold_t = m.group(2)
    41       fold_type[codepoint] = fold_t
    42       fold_val = m.group(3)
    43       if fold_t == 'T':
    44          print("Skipping Turkic entry")
    45          continue  # skip Turkic
    46       if fold_t == 'F':
    47           fold_val = [int(x, 16) for x in fold_val.split(" ")]
    48       else:
    49           fold_val = int(fold_val, 16)
    50       if codepoint in fold_value: fold_value[codepoint].append(fold_val)
    51       else: fold_value[codepoint] = [fold_val]
    52    return (fold_type, fold_value)
    53 
     15from UCD_parser import parse_CaseFolding_txt
    5416
    5517def simple_CaseFolding_BitSets(fold_map):
     
    7638   return BitDiffSet
    7739
    78 def simple_CaseClosure_map(fold_map):
     40def simple_CaseClosure_map(fold_data):
     41   simpleFoldMap = {}
     42   for k in fold_data['S'].keys(): simpleFoldMap[k] = fold_data['S'][k]
     43   for k in fold_data['C'].keys(): simpleFoldMap[k] = fold_data['C'][k]
    7944   cl_map = {}
    80    for k in fold_map.keys():
    81       folds = fold_map[k]
    82       for v in folds:
    83         if not isinstance(v, int): continue # skip nonsimple case folds
    84         if not v in cl_map: cl_map[v] = [k]
    85         else: cl_map[v].append(k)
    86         if not k in cl_map: cl_map[k] = [v]
    87         else: cl_map[k].append(v)
     45   for k in simpleFoldMap.keys():
     46      v = simpleFoldMap[k]
     47      if not v in cl_map: cl_map[v] = [k]
     48      else: cl_map[v].append(k)
     49      if not k in cl_map: cl_map[k] = [v]
     50      else: cl_map[k].append(v)
    8851   newEntries = True
    8952   while newEntries:
     
    188151
    189152def genCaseFolding_txt_h():
    190    (ft, fv) = parse_CaseFolding_txt()
    191    cm = simple_CaseClosure_map(fv)
     153   fold_data = parse_CaseFolding_txt()
     154   cm = simple_CaseClosure_map(fold_data)
    192155   f = cformat.open_header_file_for_write('CaseFolding_txt', 'casefold.py')
    193156   cformat.write_imports(f, ["<vector>", '"re/re_cc.h"'])
  • icGREP/icgrep-devel/icgrep/UCD/PropertyObjectTable.h

    r5671 r5672  
    5050    new UnsupportedPropertyObject(cjkPrimaryNumeric, PropertyObject::ClassTypeId::NumericProperty),
    5151    &NV_ns::property_object,
    52     new UnsupportedPropertyObject(cf, PropertyObject::ClassTypeId::StringProperty),
     52    new UnsupportedPropertyObject(cf, PropertyObject::ClassTypeId::StringOverrideProperty),
    5353    new UnsupportedPropertyObject(cjkCompatibilityVariant, PropertyObject::ClassTypeId::StringProperty),
    5454    &DM_ns::property_object,
  • icGREP/icgrep-devel/icgrep/UCD/PropertyObjects.cpp

    r5670 r5672  
    244244}
    245245
     246const UnicodeSet StringOverridePropertyObject::GetCodepointSet(const std::string & value_spec) {
     247    // First step: get the codepoints from the base object and then remove any overridden ones.
     248    UnicodeSet result_set = mBaseObject.GetCodepointSet(value_spec) - mOverriddenSet;
     249    // Now search for additional entries.
     250    unsigned val_bytes = value_spec.length();
     251    const char * value_str = value_spec.c_str();
     252    const char * search_str = mStringBuffer;
     253    unsigned buffer_line = 0;
     254    while (buffer_line < mExplicitCps.size()) {
     255        const char * eol = strchr(search_str, '\n');
     256        unsigned len = eol - search_str;
     257        if ((len == val_bytes) && (memcmp(search_str, value_str, len) == 0)) {
     258            result_set.insert(mExplicitCps[buffer_line]);
     259        }
     260        buffer_line++;
     261        search_str = eol+1;
     262    }
     263    return result_set;
     264}
     265   
    246266const std::string & ObsoletePropertyObject::GetPropertyValueGrepString() {
    247267    llvm::report_fatal_error("Property " + UCD::property_full_name[the_property] + " is obsolete.");
  • icGREP/icgrep-devel/icgrep/UCD/PropertyObjects.h

    r5670 r5672  
    2727        NumericProperty,
    2828        StringProperty,
     29        StringOverrideProperty,
    2930        ObsoleteProperty,
    3031        UnsupportedProperty
     
    155156};
    156157
     158class NumericPropertyObject : public PropertyObject {
     159public:
     160    static inline bool classof(const PropertyObject * p) {
     161        return p->getClassTypeId() == ClassTypeId::NumericProperty;
     162    }
     163    static inline bool classof(const void *) {
     164        return false;
     165    }
     166   
     167    NumericPropertyObject(UCD::property_t p, UnicodeSet NaN_Set, const char * string_buffer, unsigned bufsize, const std::vector<UCD::codepoint_t> & cps)
     168    : PropertyObject(p, ClassTypeId::NumericProperty)
     169    , mNaNCodepointSet(NaN_Set)
     170    , mStringBuffer(string_buffer)
     171    , mBufSize(bufsize)
     172    , mExplicitCps(cps)
     173    {
     174       
     175    }
     176    const UnicodeSet GetCodepointSet(const std::string & numeric_spec) override;
     177   
     178private:
     179    UnicodeSet mNaNCodepointSet;  // codepoints for which the property value is NaN (not a number).
     180    const char * mStringBuffer;  // buffer holding all string values for other codepoints, in sorted order.
     181    unsigned mBufSize;
     182    const std::vector<UCD::codepoint_t> & mExplicitCps;
     183};
     184
    157185class StringPropertyObject : public PropertyObject {
    158186public:
     
    184212};
    185213   
    186 class NumericPropertyObject : public PropertyObject {
    187 public:
    188     static inline bool classof(const PropertyObject * p) {
    189         return p->getClassTypeId() == ClassTypeId::NumericProperty;
    190     }
    191     static inline bool classof(const void *) {
    192         return false;
    193     }
    194    
    195     NumericPropertyObject(UCD::property_t p, UnicodeSet NaN_Set, const char * string_buffer, unsigned bufsize, const std::vector<UCD::codepoint_t> & cps)
    196     : PropertyObject(p, ClassTypeId::NumericProperty)
    197     , mNaNCodepointSet(NaN_Set)
     214class StringOverridePropertyObject : public PropertyObject {
     215public:
     216    static inline bool classof(const PropertyObject * p) {
     217        return p->getClassTypeId() == ClassTypeId::StringOverrideProperty;
     218    }
     219    static inline bool classof(const void *) {
     220        return false;
     221    }
     222    StringOverridePropertyObject(UCD::property_t p, PropertyObject & baseObj, UnicodeSet overridden, const char * string_buffer, unsigned bufsize, const std::vector<UCD::codepoint_t> & cps)
     223    : PropertyObject(p, ClassTypeId::StringOverrideProperty)
     224    , mBaseObject(baseObj)
     225    , mOverriddenSet(overridden)
    198226    , mStringBuffer(string_buffer)
    199227    , mBufSize(bufsize)
     
    202230       
    203231    }
    204     const UnicodeSet GetCodepointSet(const std::string & numeric_spec) override;
    205 
    206 private:
    207     UnicodeSet mNaNCodepointSet;  // codepoints for which the property value is NaN (not a number).
    208     const char * mStringBuffer;  // buffer holding all string values for other codepoints, in sorted order.
     232    const UnicodeSet GetCodepointSet(const std::string & value_spec) override;
     233   
     234private:
     235    PropertyObject & mBaseObject;  // the base object that provides default values for this property unless overridden.
     236    UnicodeSet mOverriddenSet;   // codepoints for which the baseObject value is overridden.
     237    const char * mStringBuffer;  // buffer holding all string values for overridden codepoints, in sorted order.
    209238    unsigned mBufSize;
    210239    const std::vector<UCD::codepoint_t> & mExplicitCps;
    211 };
    212 
     240   
     241};
     242   
    213243class ObsoletePropertyObject : public PropertyObject {
    214244public:
  • icGREP/icgrep-devel/icgrep/UCD/SpecialCasing.h

    r5670 r5672  
    1212#include "PropertyObjects.h"
    1313#include "PropertyValueAliases.h"
     14#include "UnicodeData.h"
    1415#include "unicode_set.h"
    1516
    1617namespace UCD {
    1718    namespace LC_ns {
    18         /** Code Point Ranges for lc mapping to <none>
    19         **/
    20 
    21         const UnicodeSet null_codepoint_set
    22                     {{{Empty, 34816}},
    23              {}};
    24 
    25         /** Code Point Ranges for lc mapping to <codepoint>
    26         [0000, 012f], [0131, 1f87], [1f90, 1f97], [1fa0, 1fa7],
    27         [1fb0, 1fbb], [1fbd, 1fcb], [1fcd, 1ffb], [1ffd, 10ffff]**/
    28         const UnicodeSet reflexive_set
    29                     {{{Full, 9}, {Mixed, 1}, {Full, 242}, {Mixed, 4}, {Full, 34560}},
    30              {0xfffeffff, 0x00ff00ff, 0xefff00ff, 0xffffefff, 0xefffffff}};
    31 
    32         const unsigned buffer_length = 112;
    33         const static char __attribute__ ((aligned (32))) string_buffer[256] = u8R"__(i̇
     19        /** Code Point Ranges for lc overriding values from SLC
     20        [00df, 00df], [0149, 0149], [01f0, 01f0], [0307, 0307],
     21        [0390, 0390], [03b0, 03b0], [0587, 0587], [1e96, 1e9a],
     22        [1f50, 1f50], [1f52, 1f52], [1f54, 1f54], [1f56, 1f56],
     23        [1f80, 1f87], [1f90, 1f97], [1fa0, 1fa7], [1fb2, 1fb4],
     24        [1fb6, 1fb7], [1fc2, 1fc4], [1fc6, 1fc7], [1fd2, 1fd3],
     25        [1fd6, 1fd7], [1fe2, 1fe4], [1fe6, 1fe7], [1ff2, 1ff4],
     26        [1ff6, 1ff7], [fb00, fb06], [fb13, fb17]**/
     27
     28        const UnicodeSet overridden_set
     29                    {{{Empty, 6}, {Mixed, 1}, {Empty, 3}, {Mixed, 1}, {Empty, 4},
     30              {Mixed, 1}, {Empty, 8}, {Mixed, 1}, {Empty, 3}, {Mixed, 2},
     31              {Empty, 14}, {Mixed, 1}, {Empty, 199}, {Mixed, 1}, {Empty, 5},
     32              {Mixed, 1}, {Empty, 1}, {Mixed, 4}, {Empty, 1752}, {Mixed, 1},
     33              {Empty, 32807}},
     34             {0x80000000, 0x00000200, 0x00010000, 0x00000080, 0x00010000,
     35              0x00010000, 0x00000080, 0x07c00000, 0x00550000, 0x00ff00ff,
     36              0x00dc00ff, 0x00cc00dc, 0x00dc00dc, 0x00f8007f}};
     37
     38        const unsigned buffer_length = 298;
     39        const static char __attribute__ ((aligned (32))) string_buffer[512] = u8R"__(ß
     40ʼn
     41Ç°
     42i̇
     43ΐ
     44ΰ
     45և
     46ẖ
     47ẗ
     48ẘ
     49ẙ
     50ẚ
     51ᜐ
     52ᜒ
     53᜔
     54᜖
    3455ៀ
    3556េ
     
    5778៊
    5879៧
     80៲
    5981៳
     82៎
     83៶
     84៷
     85ῂ
    6086ῃ
     87ῄ
     88ῆ
     89ῇ
     90ῒ
     91ΐ
     92ῖ
     93ῗ
     94á¿¢
     95á¿£
     96á¿€
     97á¿Š
     98ῧ
     99ῲ
    61100ῳ
     101á¿Ž
     102ῶ
     103á¿·
     104ff
     105fi
     106fl
     107ffi
     108ffl
     109ï¬
     110
     111st
     112ﬓ
     113ﬔ
     114ﬕ
     115ﬖ
     116ﬗ
    62117)__";
    63118
    64119        const static std::vector<codepoint_t> defined_cps = {
    65         0x0130, 0x1f88, 0x1f89, 0x1f8a, 0x1f8b, 0x1f8c, 0x1f8d, 0x1f8e,
    66         0x1f8f, 0x1f98, 0x1f99, 0x1f9a, 0x1f9b, 0x1f9c, 0x1f9d, 0x1f9e,
    67         0x1f9f, 0x1fa8, 0x1fa9, 0x1faa, 0x1fab, 0x1fac, 0x1fad, 0x1fae,
    68         0x1faf, 0x1fbc, 0x1fcc, 0x1ffc};
    69         static StringPropertyObject property_object(lc,
    70                                                     null_codepoint_set,
    71                                                     reflexive_set,
     120        0x00df, 0x0149, 0x01f0, 0x0307, 0x0390, 0x03b0, 0x0587, 0x1e96,
     121        0x1e97, 0x1e98, 0x1e99, 0x1e9a, 0x1f50, 0x1f52, 0x1f54, 0x1f56,
     122        0x1f80, 0x1f81, 0x1f82, 0x1f83, 0x1f84, 0x1f85, 0x1f86, 0x1f87,
     123        0x1f90, 0x1f91, 0x1f92, 0x1f93, 0x1f94, 0x1f95, 0x1f96, 0x1f97,
     124        0x1fa0, 0x1fa1, 0x1fa2, 0x1fa3, 0x1fa4, 0x1fa5, 0x1fa6, 0x1fa7,
     125        0x1fb2, 0x1fb3, 0x1fb4, 0x1fb6, 0x1fb7, 0x1fc2, 0x1fc3, 0x1fc4,
     126        0x1fc6, 0x1fc7, 0x1fd2, 0x1fd3, 0x1fd6, 0x1fd7, 0x1fe2, 0x1fe3,
     127        0x1fe4, 0x1fe6, 0x1fe7, 0x1ff2, 0x1ff3, 0x1ff4, 0x1ff6, 0x1ff7,
     128        0xfb00, 0xfb01, 0xfb02, 0xfb03, 0xfb04, 0xfb05, 0xfb06, 0xfb13,
     129        0xfb14, 0xfb15, 0xfb16, 0xfb17};
     130        static StringOverridePropertyObject property_object(lc,
     131                                                    SLC_ns::property_object,
     132                                                    overridden_set,
    72133                                                    static_cast<const char *>(string_buffer),
    73134                                                    buffer_length,
     
    75136    }
    76137    namespace UC_ns {
    77         /** Code Point Ranges for uc mapping to <none>
    78         **/
    79 
    80         const UnicodeSet null_codepoint_set
    81                     {{{Empty, 34816}},
    82              {}};
    83 
    84         /** Code Point Ranges for uc mapping to <codepoint>
    85         [0000, 00de], [00e0, 0148], [014a, 01ef], [01f1, 038f],
    86         [0391, 03af], [03b1, 0586], [0588, 1e95], [1e9b, 1f4f],
    87         [1f51, 1f51], [1f53, 1f53], [1f55, 1f55], [1f57, 1f7f],
    88         [1f88, 1f8f], [1f98, 1f9f], [1fa8, 1fb1], [1fb5, 1fb5],
    89         [1fb8, 1fc1], [1fc5, 1fc5], [1fc8, 1fd1], [1fd4, 1fd5],
    90         [1fd8, 1fe1], [1fe5, 1fe5], [1fe8, 1ff1], [1ff5, 1ff5],
    91         [1ff8, faff], [fb07, fb12], [fb18, 10ffff]**/
    92         const UnicodeSet reflexive_set
    93                     {{{Full, 6}, {Mixed, 1}, {Full, 3}, {Mixed, 1}, {Full, 4},
    94               {Mixed, 1}, {Full, 12}, {Mixed, 2}, {Full, 14}, {Mixed, 1},
    95               {Full, 199}, {Mixed, 1}, {Full, 5}, {Mixed, 1}, {Full, 1},
    96               {Mixed, 4}, {Full, 1752}, {Mixed, 1}, {Full, 32807}},
    97              {0x7fffffff, 0xfffffdff, 0xfffeffff, 0xfffeffff, 0xfffeffff,
    98               0xffffff7f, 0xf83fffff, 0xffaaffff, 0xff00ff00, 0xff23ff00,
    99               0xff33ff23, 0xff23ff23, 0xff07ff80}};
    100 
    101         const unsigned buffer_length = 358;
    102         const static char __attribute__ ((aligned (32))) string_buffer[512] = u8R"__(Ss
    103 ÊŒN
     138        /** Code Point Ranges for uc overriding values from SUC
     139        [004e, 004e], [0066, 0066], [0069, 0069], [006c, 006c],
     140        [0073, 0074], [0130, 0130], [02be, 02be], [0300, 0301],
     141        [0308, 0308], [030a, 030a], [030c, 030c], [0313, 0313],
     142        [0331, 0331], [0342, 0342], [0345, 0345], [0565, 0565],
     143        [056b, 056b], [056d, 056d], [0576, 0576], [0582, 0582],
     144        [1f88, 1f8f], [1f98, 1f9f], [1fa8, 1faf], [1fbc, 1fbc],
     145        [1fcc, 1fcc], [1ffc, 1ffc]**/
     146
     147        const UnicodeSet overridden_set
     148                    {{{Empty, 2}, {Mixed, 2}, {Empty, 5}, {Mixed, 1}, {Empty, 11},
     149              {Mixed, 1}, {Empty, 2}, {Mixed, 3}, {Empty, 16}, {Mixed, 2},
     150              {Empty, 207}, {Mixed, 4}, {Empty, 34560}},
     151             {0x00004000, 0x00181240, 0x00010000, 0x40000000, 0x00081503,
     152              0x00020000, 0x00000024, 0x00402820, 0x00000004, 0xff00ff00,
     153              0x1000ff00, 0x00001000, 0x10000000}};
     154
     155        const unsigned buffer_length = 208;
     156        const static char __attribute__ ((aligned (32))) string_buffer[256] = u8R"__(ÊŒN
     157Ff
     158Ffi
     159Ffl
     160Ss
     161St
     162Ä°
     163AÊŸ
     164Ϋ̀
     165Ϋ́
     166T̈
     167Y̊
    104168J̌
    105 Î™ÌˆÌ
    106 Î¥ÌˆÌ
     169Ρ̓
     170H̱
     171Ω͂
     172Ω͂Í
     173
     174Մե
     175Մի
     176Մխ
     177Վն
    107178Եւ
    108 H̱
    109 T̈
    110 W̊
    111 Y̊
    112 AÊŸ
    113 Î¥Ì“
    114 Î¥Ì“Ì€
    115 Î¥Ì“́
    116 Î¥Ì“Í‚
    117179ៈ
    118180៉
     
    139201៮
    140202៯
    141 áŸºÍ
    142 
    143203៌
    144 Î†Í
    145 
    146 Î‘Í‚
    147 Î‘Í‚Í
    148 
    149 á¿ŠÍ
    150 
    151204ῌ
    152 Î‰Í
    153 
    154 Î—Í‚
    155 Î—Í‚Í
    156 
    157 Î™ÌˆÌ€
    158 Î™ÌˆÌ
    159 Î™Í‚
    160 Î™ÌˆÍ‚
    161 Î¥ÌˆÌ€
    162 Î¥ÌˆÌ
    163 Î¡Ì“
    164 Î¥Í‚
    165 Î¥ÌˆÍ‚
    166 á¿ºÍ
    167 
    168205ῌ
    169 ÎÍ
    170 
    171 Î©Í‚
    172 Î©Í‚Í
    173 
    174 Ff
    175 Fi
    176 Fl
    177 Ffi
    178 Ffl
    179 St
    180 St
    181 Õ„Õ¶
    182 Õ„Õ¥
    183 Õ„Õ«
    184 ÕŽÕ¶
    185 Õ„Õ­
    186206)__";
    187207
    188208        const static std::vector<codepoint_t> defined_cps = {
    189         0x00df, 0x0149, 0x01f0, 0x0390, 0x03b0, 0x0587, 0x1e96, 0x1e97,
    190         0x1e98, 0x1e99, 0x1e9a, 0x1f50, 0x1f52, 0x1f54, 0x1f56, 0x1f80,
    191         0x1f81, 0x1f82, 0x1f83, 0x1f84, 0x1f85, 0x1f86, 0x1f87, 0x1f90,
    192         0x1f91, 0x1f92, 0x1f93, 0x1f94, 0x1f95, 0x1f96, 0x1f97, 0x1fa0,
    193         0x1fa1, 0x1fa2, 0x1fa3, 0x1fa4, 0x1fa5, 0x1fa6, 0x1fa7, 0x1fb2,
    194         0x1fb3, 0x1fb4, 0x1fb6, 0x1fb7, 0x1fc2, 0x1fc3, 0x1fc4, 0x1fc6,
    195         0x1fc7, 0x1fd2, 0x1fd3, 0x1fd6, 0x1fd7, 0x1fe2, 0x1fe3, 0x1fe4,
    196         0x1fe6, 0x1fe7, 0x1ff2, 0x1ff3, 0x1ff4, 0x1ff6, 0x1ff7, 0xfb00,
    197         0xfb01, 0xfb02, 0xfb03, 0xfb04, 0xfb05, 0xfb06, 0xfb13, 0xfb14,
    198         0xfb15, 0xfb16, 0xfb17};
    199         static StringPropertyObject property_object(uc,
    200                                                     null_codepoint_set,
    201                                                     reflexive_set,
     209        0x004e, 0x0066, 0x0069, 0x006c, 0x0073, 0x0074, 0x0130, 0x02be,
     210        0x0300, 0x0301, 0x0308, 0x030a, 0x030c, 0x0313, 0x0331, 0x0342,
     211        0x0345, 0x0565, 0x056b, 0x056d, 0x0576, 0x0582, 0x1f88, 0x1f89,
     212        0x1f8a, 0x1f8b, 0x1f8c, 0x1f8d, 0x1f8e, 0x1f8f, 0x1f98, 0x1f99,
     213        0x1f9a, 0x1f9b, 0x1f9c, 0x1f9d, 0x1f9e, 0x1f9f, 0x1fa8, 0x1fa9,
     214        0x1faa, 0x1fab, 0x1fac, 0x1fad, 0x1fae, 0x1faf, 0x1fbc, 0x1fcc,
     215        0x1ffc};
     216        static StringOverridePropertyObject property_object(uc,
     217                                                    SUC_ns::property_object,
     218                                                    overridden_set,
    202219                                                    static_cast<const char *>(string_buffer),
    203220                                                    buffer_length,
     
    205222    }
    206223    namespace TC_ns {
    207         /** Code Point Ranges for tc mapping to <none>
    208         **/
    209 
    210         const UnicodeSet null_codepoint_set
    211                     {{{Empty, 34816}},
    212              {}};
    213 
    214         /** Code Point Ranges for tc mapping to <codepoint>
    215         [0000, 00de], [00e0, 0148], [014a, 01ef], [01f1, 038f],
    216         [0391, 03af], [03b1, 0586], [0588, 1e95], [1e9b, 1f4f],
    217         [1f51, 1f51], [1f53, 1f53], [1f55, 1f55], [1f57, 1f7f],
    218         [1fb0, 1fb1], [1fb5, 1fb5], [1fb8, 1fbb], [1fbd, 1fc1],
    219         [1fc5, 1fc5], [1fc8, 1fcb], [1fcd, 1fd1], [1fd4, 1fd5],
    220         [1fd8, 1fe1], [1fe5, 1fe5], [1fe8, 1ff1], [1ff5, 1ff5],
    221         [1ff8, 1ffb], [1ffd, faff], [fb07, fb12], [fb18, 10ffff]**/
    222         const UnicodeSet reflexive_set
    223                     {{{Full, 6}, {Mixed, 1}, {Full, 3}, {Mixed, 1}, {Full, 4},
    224               {Mixed, 1}, {Full, 12}, {Mixed, 2}, {Full, 14}, {Mixed, 1},
    225               {Full, 199}, {Mixed, 1}, {Full, 5}, {Mixed, 1}, {Full, 1},
    226               {Empty, 1}, {Mixed, 3}, {Full, 1752}, {Mixed, 1},
    227               {Full, 32807}},
    228              {0x7fffffff, 0xfffffdff, 0xfffeffff, 0xfffeffff, 0xfffeffff,
    229               0xffffff7f, 0xf83fffff, 0xffaaffff, 0xef230000, 0xff33ef23,
    230               0xef23ff23, 0xff07ff80}};
    231 
    232         const unsigned buffer_length = 568;
    233         const static char __attribute__ ((aligned (32))) string_buffer[768] = u8R"__(SS
     224        /** Code Point Ranges for tc overriding values from STC
     225        [0046, 0046], [0049, 0049], [004c, 004c], [004e, 004e],
     226        [0053, 0054], [0130, 0130], [02be, 02be], [0300, 0301],
     227        [0308, 0308], [030a, 030a], [030c, 030c], [0313, 0313],
     228        [0331, 0331], [0342, 0342], [0399, 0399], [0535, 0535],
     229        [053b, 053b], [053d, 053d], [0546, 0546], [0552, 0552]**/
     230
     231        const UnicodeSet overridden_set
     232                    {{{Empty, 2}, {Mixed, 1}, {Empty, 6}, {Mixed, 1}, {Empty, 11},
     233              {Mixed, 1}, {Empty, 2}, {Mixed, 3}, {Empty, 1}, {Mixed, 1},
     234              {Empty, 12}, {Mixed, 2}, {Empty, 34773}},
     235             {0x00185240, 0x00010000, 0x40000000, 0x00081503, 0x00020000,
     236              0x00000004, 0x02000000, 0x28200000, 0x00040040}};
     237
     238        const unsigned buffer_length = 100;
     239        const static char __attribute__ ((aligned (32))) string_buffer[256] = u8R"__(FF
     240FFI
     241FFL
    234242ÊŒN
    235 J̌
    236 Î™ÌˆÌ
    237 Î¥ÌˆÌ
    238 ÔµÕ’
    239 H̱
    240 T̈
    241 W̊
    242 Y̊
     243SS
     244ST
     245Ä°
    243246AÊŸ
    244 Î¥Ì“
    245 Î¥Ì“Ì€
    246 Î¥Ì“́
    247 Î¥Ì“Í‚
    248 áŒˆÎ™
    249 áŒ‰Î™
    250 áŒŠÎ™
    251 áŒ‹Î™
    252 áŒŒÎ™
    253 áŒÎ™
    254 áŒŽÎ™
    255 áŒÎ™
    256 áŒˆÎ™
    257 áŒ‰Î™
    258 áŒŠÎ™
    259 áŒ‹Î™
    260 áŒŒÎ™
    261 áŒÎ™
    262 áŒŽÎ™
    263 áŒÎ™
    264 áŒšÎ™
    265 áŒ©Î™
    266 áŒªÎ™
    267 áŒ«Î™
    268 áŒ¬Î™
    269 áŒ­Î™
    270 áŒ®Î™
    271 áŒ¯Î™
    272 áŒšÎ™
    273 áŒ©Î™
    274 áŒªÎ™
    275 áŒ«Î™
    276 áŒ¬Î™
    277 áŒ­Î™
    278 áŒ®Î™
    279 áŒ¯Î™
    280 áœšÎ™
    281 áœ©Î™
    282 áœªÎ™
    283 áœ«Î™
    284 áœ¬Î™
    285 áœ­Î™
    286 áœ®Î™
    287 áœ¯Î™
    288 áœšÎ™
    289 áœ©Î™
    290 áœªÎ™
    291 áœ«Î™
    292 áœ¬Î™
    293 áœ­Î™
    294 áœ®Î™
    295 áœ¯Î™
    296 áŸºÎ™
    297 Î‘Ι
    298 Î†Î™
    299 Î‘Í‚
    300 Î‘͂Ι
    301 Î‘Ι
    302 á¿ŠÎ™
    303 Î—Ι
    304 Î‰Î™
    305 Î—Í‚
    306 Î—͂Ι
    307 Î—Ι
    308 Î™ÌˆÌ€
    309 Î™ÌˆÌ
    310 Î™Í‚
    311 Î™ÌˆÍ‚
    312247Ϋ̀
    313248Ϋ́
     249T̈
     250Y̊
     251J̌
    314252Ρ̓
    315 Î¥Í‚
    316 Î¥ÌˆÍ‚
    317 á¿ºÎ™
    318 Î©Î™
    319 ÎÎ™
     253H̱
    320254Ω͂
    321255Ω͂Ι
    322 Î©Î™
    323 FF
    324 FI
    325 FL
    326 FFI
    327 FFL
    328 ST
    329 ST
    330 Õ„Õ†
    331256ՄԵ
    332257ՄԻ
     258Õ„Ôœ
    333259ՎՆ
    334 Õ„Ôœ
     260ԵՒ
    335261)__";
    336262
    337263        const static std::vector<codepoint_t> defined_cps = {
    338         0x00df, 0x0149, 0x01f0, 0x0390, 0x03b0, 0x0587, 0x1e96, 0x1e97,
    339         0x1e98, 0x1e99, 0x1e9a, 0x1f50, 0x1f52, 0x1f54, 0x1f56, 0x1f80,
    340         0x1f81, 0x1f82, 0x1f83, 0x1f84, 0x1f85, 0x1f86, 0x1f87, 0x1f88,
    341         0x1f89, 0x1f8a, 0x1f8b, 0x1f8c, 0x1f8d, 0x1f8e, 0x1f8f, 0x1f90,
    342         0x1f91, 0x1f92, 0x1f93, 0x1f94, 0x1f95, 0x1f96, 0x1f97, 0x1f98,
    343         0x1f99, 0x1f9a, 0x1f9b, 0x1f9c, 0x1f9d, 0x1f9e, 0x1f9f, 0x1fa0,
    344         0x1fa1, 0x1fa2, 0x1fa3, 0x1fa4, 0x1fa5, 0x1fa6, 0x1fa7, 0x1fa8,
    345         0x1fa9, 0x1faa, 0x1fab, 0x1fac, 0x1fad, 0x1fae, 0x1faf, 0x1fb2,
    346         0x1fb3, 0x1fb4, 0x1fb6, 0x1fb7, 0x1fbc, 0x1fc2, 0x1fc3, 0x1fc4,
    347         0x1fc6, 0x1fc7, 0x1fcc, 0x1fd2, 0x1fd3, 0x1fd6, 0x1fd7, 0x1fe2,
    348         0x1fe3, 0x1fe4, 0x1fe6, 0x1fe7, 0x1ff2, 0x1ff3, 0x1ff4, 0x1ff6,
    349         0x1ff7, 0x1ffc, 0xfb00, 0xfb01, 0xfb02, 0xfb03, 0xfb04, 0xfb05,
    350         0xfb06, 0xfb13, 0xfb14, 0xfb15, 0xfb16, 0xfb17};
    351         static StringPropertyObject property_object(tc,
    352                                                     null_codepoint_set,
    353                                                     reflexive_set,
     264        0x0046, 0x0049, 0x004c, 0x004e, 0x0053, 0x0054, 0x0130, 0x02be,
     265        0x0300, 0x0301, 0x0308, 0x030a, 0x030c, 0x0313, 0x0331, 0x0342,
     266        0x0399, 0x0535, 0x053b, 0x053d, 0x0546, 0x0552};
     267        static StringOverridePropertyObject property_object(tc,
     268                                                    STC_ns::property_object,
     269                                                    overridden_set,
    354270                                                    static_cast<const char *>(string_buffer),
    355271                                                    buffer_length,
Note: See TracChangeset for help on using the changeset viewer.