Changeset 4388


Ignore:
Timestamp:
Jan 2, 2015, 9:28:01 AM (4 years ago)
Author:
cameron
Message:

Add derived core properties such as math, alpha; refine the property resolver

Location:
icGREP/icgrep-devel/icgrep
Files:
1 added
14 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/UCD/Blocks.h

    r4380 r4388  
    22#define BLOCKS_H
    33/*
    4  *  Copyright (c) 2014 International Characters, Inc.
     4 *  Copyright (c) 2015 International Characters, Inc.
    55 *  This software is licensed to the public under the Open Software License 3.0.
    66 *  icgrep is a trademark of International Characters, Inc.
  • icGREP/icgrep-devel/icgrep/UCD/DerivedCoreProperties.h

    r4380 r4388  
    22#define DERIVEDCOREPROPERTIES_H
    33/*
    4  *  Copyright (c) 2014 International Characters, Inc.
     4 *  Copyright (c) 2015 International Characters, Inc.
    55 *  This software is licensed to the public under the Open Software License 3.0.
    66 *  icgrep is a trademark of International Characters, Inc.
  • icGREP/icgrep-devel/icgrep/UCD/DerivedGeneralCategory.h

    r4380 r4388  
    22#define DERIVEDGENERALCATEGORY_H
    33/*
    4  *  Copyright (c) 2014 International Characters, Inc.
     4 *  Copyright (c) 2015 International Characters, Inc.
    55 *  This software is licensed to the public under the Open Software License 3.0.
    66 *  icgrep is a trademark of International Characters, Inc.
  • icGREP/icgrep-devel/icgrep/UCD/EastAsianWidth.h

    r4380 r4388  
    22#define EASTASIANWIDTH_H
    33/*
    4  *  Copyright (c) 2014 International Characters, Inc.
     4 *  Copyright (c) 2015 International Characters, Inc.
    55 *  This software is licensed to the public under the Open Software License 3.0.
    66 *  icgrep is a trademark of International Characters, Inc.
  • icGREP/icgrep-devel/icgrep/UCD/HangulSyllableType.h

    r4380 r4388  
    22#define HANGULSYLLABLETYPE_H
    33/*
    4  *  Copyright (c) 2014 International Characters, Inc.
     4 *  Copyright (c) 2015 International Characters, Inc.
    55 *  This software is licensed to the public under the Open Software License 3.0.
    66 *  icgrep is a trademark of International Characters, Inc.
  • icGREP/icgrep-devel/icgrep/UCD/LineBreak.h

    r4380 r4388  
    22#define LINEBREAK_H
    33/*
    4  *  Copyright (c) 2014 International Characters, Inc.
     4 *  Copyright (c) 2015 International Characters, Inc.
    55 *  This software is licensed to the public under the Open Software License 3.0.
    66 *  icgrep is a trademark of International Characters, Inc.
  • icGREP/icgrep-devel/icgrep/UCD/PropList.h

    r4380 r4388  
    22#define PROPLIST_H
    33/*
    4  *  Copyright (c) 2014 International Characters, Inc.
     4 *  Copyright (c) 2015 International Characters, Inc.
    55 *  This software is licensed to the public under the Open Software License 3.0.
    66 *  icgrep is a trademark of International Characters, Inc.
  • icGREP/icgrep-devel/icgrep/UCD/PropertyAliases.h

    r4380 r4388  
    22#define PROPERTYALIASES_H
    33/*
    4  *  Copyright (c) 2014 International Characters, Inc.
     4 *  Copyright (c) 2015 International Characters, Inc.
    55 *  This software is licensed to the public under the Open Software License 3.0.
    66 *  icgrep is a trademark of International Characters, Inc.
     
    2828    Pat_WS, QMark, Radical, SD, STerm, Term, UIdeo, Upper, VS, WSpace, XIDC,
    2929    XIDS, XO_NFC, XO_NFD, XO_NFKC, XO_NFKD
     30  };
     31  const std::vector<std::string> property_enum_name = {
     32"cjkAccountingNumeric", "cjkOtherNumeric", "cjkPrimaryNumeric", "nv",
     33    "cf", "cjkCompatibilityVariant", "dm", "FC_NFKC", "lc", "NFKC_CF",
     34    "scf", "slc", "stc", "suc", "tc", "uc", "bmg", "bpb", "cjkIICore",
     35    "cjkIRG_GSource", "cjkIRG_HSource", "cjkIRG_JSource", "cjkIRG_KPSource",
     36    "cjkIRG_KSource", "cjkIRG_MSource", "cjkIRG_TSource", "cjkIRG_USource",
     37    "cjkIRG_VSource", "cjkRSUnicode", "isc", "JSN", "na", "na1",
     38    "Name_Alias", "scx", "age", "blk", "sc", "bc", "bpt", "ccc", "dt", "ea",
     39    "gc", "GCB", "hst", "InMC", "InSC", "jg", "jt", "lb", "NFC_QC",
     40    "NFD_QC", "NFKC_QC", "NFKD_QC", "nt", "SB", "WB", "AHex", "Alpha",
     41    "Bidi_C", "Bidi_M", "Cased", "CE", "CI", "Comp_Ex", "CWCF", "CWCM",
     42    "CWKCF", "CWL", "CWT", "CWU", "Dash", "Dep", "DI", "Dia", "Ext",
     43    "Gr_Base", "Gr_Ext", "Gr_Link", "Hex", "Hyphen", "IDC", "Ideo", "IDS",
     44    "IDSB", "IDST", "Join_C", "LOE", "Lower", "Math", "NChar", "OAlpha",
     45    "ODI", "OGr_Ext", "OIDC", "OIDS", "OLower", "OMath", "OUpper",
     46    "Pat_Syn", "Pat_WS", "QMark", "Radical", "SD", "STerm", "Term", "UIdeo",
     47    "Upper", "VS", "WSpace", "XIDC", "XIDS", "XO_NFC", "XO_NFD", "XO_NFKC",
     48    "XO_NFKD"
    3049  };
    3150  const std::vector<std::string> property_full_name = {
  • icGREP/icgrep-devel/icgrep/UCD/PropertyObjectTable.h

    r4380 r4388  
    22#define PROPERTYOBJECTTABLE_H
    33/*
    4  *  Copyright (c) 2014 International Characters, Inc.
     4 *  Copyright (c) 2015 International Characters, Inc.
    55 *  This software is licensed to the public under the Open Software License 3.0.
    66 *  icgrep is a trademark of International Characters, Inc.
  • icGREP/icgrep-devel/icgrep/UCD/PropertyValueAliases.h

    r4383 r4388  
    22#define PROPERTYVALUEALIASES_H
    33/*
    4  *  Copyright (c) 2014 International Characters, Inc.
     4 *  Copyright (c) 2015 International Characters, Inc.
    55 *  This software is licensed to the public under the Open Software License 3.0.
    66 *  icgrep is a trademark of International Characters, Inc.
     
    4545    const std::unordered_map<std::string, int> aliases_only_map = std::unordered_map<std::string, int> {
    4646      };
     47  }
     48
     49  namespace SCX_ns {
     50    enum value_t {
     51      Aghb, Arab, Armi, Armn, Avst, Bali, Bamu, Bass, Batk, Beng, Bopo,
     52      Brah, Brai, Bugi, Buhd, Cakm, Cans, Cari, Cham, Cher, Copt, Cprt,
     53      Cyrl, Deva, Dsrt, Dupl, Egyp, Elba, Ethi, Geor, Glag, Goth, Gran,
     54      Grek, Gujr, Guru, Hang, Hani, Hano, Hebr, Hira, Hmng, Hrkt, Ital,
     55      Java, Kali, Kana, Khar, Khmr, Khoj, Knda, Kthi, Lana, Laoo, Latn,
     56      Lepc, Limb, Lina, Linb, Lisu, Lyci, Lydi, Mahj, Mand, Mani, Mend,
     57      Merc, Mero, Mlym, Modi, Mong, Mroo, Mtei, Mymr, Narb, Nbat, Nkoo,
     58      Ogam, Olck, Orkh, Orya, Osma, Palm, Pauc, Perm, Phag, Phli, Phlp,
     59      Phnx, Plrd, Prti, Rjng, Runr, Samr, Sarb, Saur, Shaw, Shrd, Sidd,
     60      Sind, Sinh, Sora, Sund, Sylo, Syrc, Tagb, Takr, Tale, Talu, Taml,
     61      Tavt, Telu, Tfng, Tglg, Thaa, Thai, Tibt, Tirh, Ugar, Vaii, Wara,
     62      Xpeo, Xsux, Yiii, Zinh, Zyyy, Zzzz};
     63    const std::vector<std::string> enum_names = {
     64      "Aghb", "Arab", "Armi", "Armn", "Avst", "Bali", "Bamu", "Bass",
     65      "Batk", "Beng", "Bopo", "Brah", "Brai", "Bugi", "Buhd", "Cakm",
     66      "Cans", "Cari", "Cham", "Cher", "Copt", "Cprt", "Cyrl", "Deva",
     67      "Dsrt", "Dupl", "Egyp", "Elba", "Ethi", "Geor", "Glag", "Goth",
     68      "Gran", "Grek", "Gujr", "Guru", "Hang", "Hani", "Hano", "Hebr",
     69      "Hira", "Hmng", "Hrkt", "Ital", "Java", "Kali", "Kana", "Khar",
     70      "Khmr", "Khoj", "Knda", "Kthi", "Lana", "Laoo", "Latn", "Lepc",
     71      "Limb", "Lina", "Linb", "Lisu", "Lyci", "Lydi", "Mahj", "Mand",
     72      "Mani", "Mend", "Merc", "Mero", "Mlym", "Modi", "Mong", "Mroo",
     73      "Mtei", "Mymr", "Narb", "Nbat", "Nkoo", "Ogam", "Olck", "Orkh",
     74      "Orya", "Osma", "Palm", "Pauc", "Perm", "Phag", "Phli", "Phlp",
     75      "Phnx", "Plrd", "Prti", "Rjng", "Runr", "Samr", "Sarb", "Saur",
     76      "Shaw", "Shrd", "Sidd", "Sind", "Sinh", "Sora", "Sund", "Sylo",
     77      "Syrc", "Tagb", "Takr", "Tale", "Talu", "Taml", "Tavt", "Telu",
     78      "Tfng", "Tglg", "Thaa", "Thai", "Tibt", "Tirh", "Ugar", "Vaii",
     79      "Wara", "Xpeo", "Xsux", "Yiii", "Zinh", "Zyyy", "Zzzz"};
     80    const std::vector<std::string> value_names = {
     81      "Caucasian_Albanian", "Arabic", "Imperial_Aramaic", "Armenian",
     82      "Avestan", "Balinese", "Bamum", "Bassa_Vah", "Batak", "Bengali",
     83      "Bopomofo", "Brahmi", "Braille", "Buginese", "Buhid", "Chakma",
     84      "Canadian_Aboriginal", "Carian", "Cham", "Cherokee", "Coptic",
     85      "Cypriot", "Cyrillic", "Devanagari", "Deseret", "Duployan",
     86      "Egyptian_Hieroglyphs", "Elbasan", "Ethiopic", "Georgian",
     87      "Glagolitic", "Gothic", "Grantha", "Greek", "Gujarati", "Gurmukhi",
     88      "Hangul", "Han", "Hanunoo", "Hebrew", "Hiragana", "Pahawh_Hmong",
     89      "Katakana_Or_Hiragana", "Old_Italic", "Javanese", "Kayah_Li",
     90      "Katakana", "Kharoshthi", "Khmer", "Khojki", "Kannada", "Kaithi",
     91      "Tai_Tham", "Lao", "Latin", "Lepcha", "Limbu", "Linear_A", "Linear_B",
     92      "Lisu", "Lycian", "Lydian", "Mahajani", "Mandaic", "Manichaean",
     93      "Mende_Kikakui", "Meroitic_Cursive", "Meroitic_Hieroglyphs",
     94      "Malayalam", "Modi", "Mongolian", "Mro", "Meetei_Mayek", "Myanmar",
     95      "Old_North_Arabian", "Nabataean", "Nko", "Ogham", "Ol_Chiki",
     96      "Old_Turkic", "Oriya", "Osmanya", "Palmyrene", "Pau_Cin_Hau",
     97      "Old_Permic", "Phags_Pa", "Inscriptional_Pahlavi", "Psalter_Pahlavi",
     98      "Phoenician", "Miao", "Inscriptional_Parthian", "Rejang", "Runic",
     99      "Samaritan", "Old_South_Arabian", "Saurashtra", "Shavian", "Sharada",
     100      "Siddham", "Khudawadi", "Sinhala", "Sora_Sompeng", "Sundanese",
     101      "Syloti_Nagri", "Syriac", "Tagbanwa", "Takri", "Tai_Le",
     102      "New_Tai_Lue", "Tamil", "Tai_Viet", "Telugu", "Tifinagh", "Tagalog",
     103      "Thaana", "Thai", "Tibetan", "Tirhuta", "Ugaritic", "Vai",
     104      "Warang_Citi", "Old_Persian", "Cuneiform", "Yi", "Inherited",
     105      "Common", "Unknown"};
     106    const std::unordered_map<std::string, int> aliases_only_map = std::unordered_map<std::string, int> {
     107      {"qaac", SCX_ns::Copt}, {"qaai", SCX_ns::Zinh}};
    47108  }
    48109
  • icGREP/icgrep-devel/icgrep/UCD/ScriptExtensions.h

    r4380 r4388  
    22#define SCRIPTEXTENSIONS_H
    33/*
    4  *  Copyright (c) 2014 International Characters, Inc.
     4 *  Copyright (c) 2015 International Characters, Inc.
    55 *  This software is licensed to the public under the Open Software License 3.0.
    66 *  icgrep is a trademark of International Characters, Inc.
  • icGREP/icgrep-devel/icgrep/UCD/Scripts.h

    r4380 r4388  
    22#define SCRIPTS_H
    33/*
    4  *  Copyright (c) 2014 International Characters, Inc.
     4 *  Copyright (c) 2015 International Characters, Inc.
    55 *  This software is licensed to the public under the Open Software License 3.0.
    66 *  icgrep is a trademark of International Characters, Inc.
  • icGREP/icgrep-devel/icgrep/compiler.cpp

    r4386 r4388  
    11/*
    2  *  Copyright (c) 2014 International Characters.
    3  *  This software is licensed to the public under the Open Software License 3.0.
    4  *  icgrep is a trademark of International Characters.
    5  */
    6 
    7 /*
    8  *  Copyright (c) 2014 International Characters.
     2 *  Copyright (c) 2015 International Characters.
    93 *  This software is licensed to the public under the Open Software License 3.0.
    104 *  icgrep is a trademark of International Characters.
     
    2923#include "UCD/precompiled_scx.h"
    3024#include "UCD/precompiled_blk.h"
     25#include "UCD/precompiled_derivedcoreproperties.h"
    3126
    3227#include "resolve_properties.cpp"
     
    160155    install_property_scx_fn_ptrs(pablo_compiler);
    161156    install_property_blk_fn_ptrs(pablo_compiler);
     157    install_property_DerivedCoreProperties_fn_ptrs(pablo_compiler);
    162158
    163159    LLVM_Gen_RetVal retVal = pablo_compiler.compile(main);
  • icGREP/icgrep-devel/icgrep/resolve_properties.cpp

    r4386 r4388  
    11/*
    2  *  Copyright (c) 2014 International Characters.
     2 *  Copyright (c) 2015 International Characters.
    33 *  This software is licensed to the public under the Open Software License 3.0.
    44 *  icgrep is a trademark of International Characters.
     
    2323#include "UCD/PropertyValueAliases.h"
    2424
    25 
     25class UnicodePropertyExpressionError : public std::exception {
     26public:
     27    UnicodePropertyExpressionError(const std::string && msg) noexcept : _msg(msg) {};
     28    const char* what() const noexcept { return _msg.c_str();};
     29private:
     30    inline UnicodePropertyExpressionError() noexcept {}
     31    const std::string _msg;
     32};
    2633
    2734std::string canonicalize(std::string prop_or_val) {
     
    3340            s += std::tolower(c, loc);
    3441        }
     42    }
     43    return s;
     44}
     45
     46std::string lowercase(std::string prop_or_val) {
     47    std::locale loc;
     48    std::string s = "";
     49    for (unsigned int i = 0; i < prop_or_val.length(); ++i) {
     50        char c = prop_or_val.at(i);
     51        s += std::tolower(c, loc);
    3552    }
    3653    return s;
     
    7087                auto propit = UCD::alias_map.find(prop);
    7188                if (propit == UCD::alias_map.end()) {
    72                     throw std::runtime_error("Unknown property value: " + prop);
     89                    throw UnicodePropertyExpressionError("Expected a property name, but '" + name->getNamespace() + "' found instead");
    7390                }
    7491                theprop = propit->second;
     
    7996                        name->setName("__get_gc_" + UCD::GC_ns::enum_names[valcode]);
    8097                    }
     98                    else throw UnicodePropertyExpressionError("Erroneous property value for general_category property");
    8199                }
    82100                else if (theprop == UCD::sc) {
     
    86104                        name->setName("__get_sc_" + UCD::SC_ns::enum_names[valcode]);
    87105                    }
     106                    else throw UnicodePropertyExpressionError("Erroneous property value for script property");
    88107                }
    89108                else if (theprop == UCD::scx) {
     
    93112                        name->setName("__get_scx_" + UCD::SC_ns::enum_names[valcode]);
    94113                    }
     114                    else throw UnicodePropertyExpressionError("Erroneous property value for script_extension property");
    95115                }
    96116                else if (theprop == UCD::blk) {
     
    100120                        name->setName("__get_blk_" + UCD::BLK_ns::enum_names[valcode]);
    101121                    }
     122                    else throw UnicodePropertyExpressionError("Erroneous property value for block property");
     123                }
     124                else if (UCD::property_object_table[theprop]->the_kind == UCD::BinaryProperty){
     125                    auto valit = UCD::Binary_ns::aliases_only_map.find(v);
     126                    if (valit == UCD::Binary_ns::aliases_only_map.end()) {
     127                        throw UnicodePropertyExpressionError("Erroneous property value for binary property " + UCD::property_full_name[theprop]);
     128                    }
     129                    if (valit->second == UCD::Binary_ns::Y) {
     130                        name->setName("__get_" + lowercase(UCD::property_enum_name[theprop]) + "_Y");
     131                        return;
     132                    }
     133                    else {
     134                        throw UnicodePropertyExpressionError("Negated binary property " + UCD::property_full_name[theprop] + " recognized, but not supported");
     135                    }
    102136                }
    103137                else {
    104                     throw std::runtime_error("Property " + UCD::property_full_name[theprop] + " recognized, but not supported in icgrep 1.0");
     138                    throw UnicodePropertyExpressionError("Property " + UCD::property_full_name[theprop] + " recognized, but not supported in icgrep 1.0");
    105139                }
    106140            }
     
    119153                    return;
    120154                }
     155                // Try as a binary property.
     156                auto propit = UCD::alias_map.find(v);
     157                if (propit == UCD::alias_map.end()) {
     158                    throw UnicodePropertyExpressionError("Expected a general category, script or binary property name, but '" + name->getName() + "' found instead");
     159                }
     160                theprop = propit->second;
     161                if (UCD::property_object_table[theprop]->the_kind == UCD::BinaryProperty) {
     162                    name->setName("__get_" + lowercase(UCD::property_enum_name[theprop]) + "_Y");
     163                    return;
     164                }
    121165                else {
    122                     throw std::runtime_error("Unknown property, aborting\n");
     166                    throw UnicodePropertyExpressionError("Error: property " + UCD::property_full_name[theprop] + " specified without a value");
    123167                }
    124168            }
     
    128172    }
    129173    else if (!isa<CC>(re) && !isa<Start>(re) && !isa<End>(re) && !isa<Any>(re)) {
    130         throw std::runtime_error("Unknown RE type in resolveProperties.");
     174        throw UnicodePropertyExpressionError("Unknown RE type in resolveProperties.");
    131175    }
    132176}
Note: See TracChangeset for help on using the changeset viewer.