source: icGREP/icgrep-devel/icgrep/UCD/PropertyObjects.cpp @ 5656

Last change on this file since 5656 was 5648, checked in by cameron, 22 months ago

Regular expressions for property values: allow aliases, do not canonicalize (Unicode TR 18 - RL2.6)

File size: 6.9 KB
Line 
1/*
2 *  Copyright (c) 2014 International Characters, Inc.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters, Inc.
5 *
6 */
7
8#include "PropertyObjects.h"
9#include "PropertyObjectTable.h"
10#include <llvm/Support/Casting.h>
11#include <algorithm>
12#include <assert.h>
13#include <sstream>
14
15using namespace llvm;
16
17namespace UCD {
18
// Build the loose-matching form of a property or property-value name:
// the separator characters '_', ' ' and '-' are removed and every other
// character is lowercased with a default-constructed std::locale.
std::string canonicalize_value_name(const std::string & prop_or_val) {
    const std::locale loc;
    std::string canonical;
    for (const char ch : prop_or_val) {
        const bool is_separator = (ch == '_') || (ch == ' ') || (ch == '-');
        if (is_separator) {
            continue;
        }
        canonical += std::tolower(ch, loc);
    }
    return canonical;
}
30
31int PropertyObject::GetPropertyValueEnumCode(const std::string & value_spec) {
32    throw std::runtime_error("Property " + value_spec + " unsupported.");
33}
34const std::string & PropertyObject::GetPropertyValueGrepString() {
35    throw std::runtime_error("Property Value Grep String unsupported.");
36}
37
38UnicodeSet UnsupportedPropertyObject::GetCodepointSet(const std::string &) {
39    throw std::runtime_error("Property " + UCD::property_full_name[the_property] + " unsupported.");
40}
41
42UnicodeSet UnsupportedPropertyObject::GetCodepointSet(const int) {
43    throw std::runtime_error("Property " + UCD::property_full_name[the_property] + " unsupported.");
44}
45
46const UnicodeSet & EnumeratedPropertyObject::GetCodepointSet(const std::string & value_spec) {
47    const int property_enum_val = GetPropertyValueEnumCode(value_spec);
48    if (property_enum_val < 0) {
49        throw std::runtime_error("Enumerated Property " + UCD::property_full_name[the_property] + ": unknown value: " + value_spec);
50    }
51    return GetCodepointSet(property_enum_val);
52}
53
54const UnicodeSet & EnumeratedPropertyObject::GetCodepointSet(const int property_enum_val) const {
55    assert (property_enum_val >= 0);
56    return *(property_value_sets[property_enum_val]);
57}
58
59std::vector<UnicodeSet> & EnumeratedPropertyObject::GetEnumerationBasisSets() {
60    // Return the previously computed vector of basis sets, if it exists.
61    if (LLVM_UNLIKELY(enumeration_basis_sets.empty())) {
62        // Otherwise compute and return.
63        // Basis set i is the set of all codepoints whose numerical enumeration code e
64        // has bit i set, i.e., (e >> i) & 1 == 1.
65        unsigned basis_count = 1;
66        while ((1UL << basis_count) < independent_enum_count) {
67            basis_count++;
68        }
69        for (unsigned i = 0; i < basis_count; i++) {
70            enumeration_basis_sets.push_back(UnicodeSet());
71            for (unsigned e = 0; e < independent_enum_count; e++) {
72                if (((e >> i) & 1UL) == 0) {
73                    enumeration_basis_sets[i] = enumeration_basis_sets[i] + *property_value_sets[e];
74                }
75            }
76        }
77    }
78    return enumeration_basis_sets;
79}
80
81const std::string & EnumeratedPropertyObject::GetPropertyValueGrepString() {
82    if (LLVM_LIKELY(mPropertyValueGrepString.empty())) {
83        std::stringstream buffer;
84        for (unsigned i = 0; i != property_value_full_names.size(); i++) {
85            buffer << property_value_full_names[i] + "\n";
86        }
87        for (unsigned i = 0; i != property_value_enum_names.size(); i++) {
88            if (property_value_enum_names[i] == property_value_full_names[i]) continue;
89            buffer << property_value_enum_names[i] + "\n";
90        }
91        for (auto & a : property_value_aliases) {
92            buffer << a.first + "\n";
93        }
94        mPropertyValueGrepString = buffer.str();
95    }
96    return mPropertyValueGrepString;
97}
98
99int EnumeratedPropertyObject::GetPropertyValueEnumCode(const std::string & value_spec) {
100    // The canonical full names are not stored in the precomputed alias map,
101    // to save space in the executable.   Add them if the property is used.
102    if (uninitialized) {
103        for (unsigned i = 0; i != property_value_full_names.size(); i++) {
104            property_value_aliases.insert({canonicalize_value_name(property_value_full_names[i]), i});
105        }
106        for (unsigned i = 0; i != property_value_enum_names.size(); i++) {
107            property_value_aliases.insert({canonicalize_value_name(property_value_enum_names[i]), i});
108        }
109        uninitialized = false;
110    }
111    const auto valit = property_value_aliases.find(value_spec);
112    if (valit == property_value_aliases.end())
113        return -1;
114    return valit->second;
115}
116
117PropertyObject::iterator ExtensionPropertyObject::begin() const {
118    if (const auto * obj = dyn_cast<EnumeratedPropertyObject>(property_object_table[base_property])) {
119        return obj->begin();
120    }
121    throw std::runtime_error("Iterators unsupported for this type of PropertyObject.");
122}
123
124PropertyObject::iterator ExtensionPropertyObject::end() const {
125    if (const auto * obj = dyn_cast<EnumeratedPropertyObject>(property_object_table[base_property])) {
126        return obj->end();
127    }
128    throw std::runtime_error("Iterators unsupported for this type of PropertyObject.");
129}
130
131const UnicodeSet & ExtensionPropertyObject::GetCodepointSet(const std::string & value_spec) {
132    int property_enum_val = GetPropertyValueEnumCode(value_spec);
133    if (property_enum_val == -1) {
134        throw std::runtime_error("Extension Property " + UCD::property_full_name[the_property] +  ": unknown value: " + value_spec);
135    }
136    return GetCodepointSet(property_enum_val);
137}
138
139const UnicodeSet & ExtensionPropertyObject::GetCodepointSet(const int property_enum_val) const {
140    assert (property_enum_val >= 0);
141    return *(property_value_sets[property_enum_val]);
142}
143
144int ExtensionPropertyObject::GetPropertyValueEnumCode(const std::string & value_spec) {
145    return property_object_table[base_property]->GetPropertyValueEnumCode(value_spec);
146}
147
148const std::string & ExtensionPropertyObject::GetPropertyValueGrepString() {
149    return property_object_table[base_property]->GetPropertyValueGrepString();
150}
151
152const UnicodeSet & BinaryPropertyObject::GetCodepointSet(const std::string & value_spec) {
153    int property_enum_val = Binary_ns::Y;
154    if (value_spec.length() != 0) {
155        auto valit = Binary_ns::aliases_only_map.find(value_spec);
156        if (valit == Binary_ns::aliases_only_map.end()) {
157            throw std::runtime_error("Binary Property " + UCD::property_full_name[the_property] +  ": bad value: " + value_spec);
158        }
159        property_enum_val = valit->second;
160    }
161    return GetCodepointSet(property_enum_val);
162}
163
164const UnicodeSet & BinaryPropertyObject::GetCodepointSet(const int property_enum_val) {
165    if (property_enum_val == Binary_ns::Y) {
166        return mY;
167    }
168    if (mNoUninitialized) {
169        mN = ~mY;
170        mNoUninitialized = false;
171    }
172    return mN;
173}
174
175const std::string & BinaryPropertyObject::GetPropertyValueGrepString() {
176    if (mPropertyValueGrepString.empty()) {
177        std::stringstream buffer;
178        for (const auto & prop : Binary_ns::aliases_only_map) {
179            buffer << std::get<0>(prop) + "\n";
180        }
181        mPropertyValueGrepString = buffer.str();
182    }
183    return mPropertyValueGrepString;
184}
185
186}
Note: See TracBrowser for help on using the repository browser.