Ignore:
Timestamp:
Jul 14, 2015, 10:46:10 PM (4 years ago)
Author:
nmedfort
Message:

Moved responsibility of handling 'special cases of Unicode TR #18' and 'compatibility properties of UTR #18 Annex C' into RE_Parser.

Location:
icGREP/icgrep-devel/icgrep
Files:
7 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/UCD/resolve_properties.cpp

    r4661 r4671  
    132132            }
    133133        }
    134         // Now try special cases of Unicode TR #18
    135         else if (value == "any") {
    136             name->setDefinition(makeAny());
    137         }
    138         else if (value == "assigned") {
    139             Name * Cn = makeName("Cn", Name::Type::UnicodeProperty);
    140             name->setDefinition(makeDiff(makeAny(), Cn));
    141         }
    142         else if (value == "ascii") {
    143             name->setFunctionName("__get_blk_ASCII");
    144         }
    145         // Now compatibility properties of UTR #18 Annex C
    146         else if (value == "xdigit") {
    147             Name * Nd = makeName("Nd", Name::Type::UnicodeProperty);
    148             Name * hexdigit = makeName("Hex_digit", Name::Type::UnicodeProperty);
    149             name->setDefinition(makeAlt({Nd, hexdigit}));
    150         }
    151         else if (value == "alnum") {
    152             Name * digit = makeName("Nd", Name::Type::UnicodeProperty);
    153             Name * alpha = makeName("alphabetic", Name::Type::UnicodeProperty);
    154             name->setDefinition(makeAlt({digit, alpha}));
    155         }
    156         else if (value == "blank") {
    157             Name * space_sep = makeName("space_separator", Name::Type::UnicodeProperty);
    158             CC * tab = makeCC(0x09);
    159             name->setDefinition(makeAlt({space_sep, tab}));
    160         }
    161         else if (value == "graph") {
    162             Name * space = makeName("space", Name::Type::UnicodeProperty);
    163             Name * ctrl = makeName("control", Name::Type::UnicodeProperty);
    164             Name * surr = makeName("surrogate", Name::Type::UnicodeProperty);
    165             Name * unassigned = makeName("Cn", Name::Type::UnicodeProperty);
    166             Name * nongraph = makeName("[^graph]", Name::Type::UnicodeProperty);
    167             nongraph->setDefinition(makeAlt({space, ctrl, surr, unassigned}));
    168             name->setDefinition(makeDiff(makeAny(), nongraph));
    169         }
    170         else if (value == "print") {
    171             Name * graph = makeName("graph", Name::Type::UnicodeProperty);
    172             Name * space_sep = makeName("space_separator", Name::Type::UnicodeProperty);
    173             name->setDefinition(makeAlt({graph, space_sep}));
    174         }
    175         else if (value == "word") {
    176             Name * alnum = makeName("alnum", Name::Type::UnicodeProperty);
    177             Name * mark = makeName("mark", Name::Type::UnicodeProperty);
    178             Name * conn = makeName("Connector_Punctuation", Name::Type::UnicodeProperty);
    179             Name * join = makeName("Join_Control", Name::Type::UnicodeProperty);
    180             name->setDefinition(makeAlt({alnum, mark, conn, join}));
    181         }
    182134        else {
    183135            throw UnicodePropertyExpressionError("Expected a general category, script or binary property name, but '" + name->getName() + "' found instead");
  • icGREP/icgrep-devel/icgrep/cc/cc_compiler.cpp

    r4657 r4671  
    7474    else if (Name * name = dyn_cast<Name>(re)) {
    7575        RE * def = name->getDefinition();
    76         if (LLVM_LIKELY(def != nullptr)) {
     76        if (def) {
    7777            if (!isa<CC>(def)) {
    7878                compileByteClasses(def);
  • icGREP/icgrep-devel/icgrep/cc/cc_namemap.cpp

    r4665 r4671  
    4141    else if (Name * name = dyn_cast<Name>(re)) {
    4242        RE * def = name->getDefinition();
    43         if (def && !isa<CC>(def)) {
    44             name->setDefinition(process(def, type));
     43        if (def) {
     44            if (!isa<CC>(def)) {
     45                name->setDefinition(process(def, type));
     46            }
    4547        }
    46         std::string classname = name->getName();
    47         auto f = mNameMap.find(classname);
    48         if (f == mNameMap.end()) {
     48        else {
     49
     50            std::string classname = name->getName();
     51            auto f = mNameMap.find(classname);
     52            if (f != mNameMap.end()) {
     53                return f->second;
     54            }
     55            insert(std::move(classname), name);
    4956            if (name->getType() == Name::Type::UnicodeProperty) {
    5057                resolveProperty(name);
    5158                RE * def = name->getDefinition();
    5259                if (def) {
    53                     name->setDefinition(process(def, type));
     60                    name->setDefinition(process(def, CC_type::UnicodeClass));
    5461                }
    5562            }
    56             return insert(std::move(classname), name);
     63
     64            return name;
    5765        }
    58         return f->second;
    5966    }
    6067    else if (CC * cc = dyn_cast<CC>(re)) {
    6168        std::string classname = cc->canonicalName(type);
    6269        auto f = mNameMap.find(classname);
    63         if (f == mNameMap.end()) {
    64             return insert(std::move(classname), (type == ByteClass) ? makeByteName(classname, cc) : makeName(classname, cc));
     70        if (f != mNameMap.end()) {
     71            return f->second;
    6572        }
    66         return f->second;
     73        return insert(std::move(classname), (type == ByteClass) ? makeByteName(classname, cc) : makeName(classname, cc));
    6774    }
    6875    return re;
  • icGREP/icgrep-devel/icgrep/cc/cc_namemap.hpp

    r4665 r4671  
    1414public:
    1515
    16     typedef std::unordered_map<std::string, re::Name*>          NameMap;
     16    typedef std::unordered_map<std::string, re::Name *>          NameMap;
    1717    typedef std::vector<re::Name*>                              NameVector;
    1818    typedef NameVector::const_iterator                          iterator;
  • icGREP/icgrep-devel/icgrep/generate_predefined_ucd_functions.cpp

    r4669 r4671  
    5959#endif
    6060
     61using property_list = std::vector<std::pair<std::string, size_t>>;
     62
    6163/** ------------------------------------------------------------------------------------------------------------- *
    6264 * @brief compileUnicodeSet
    6365 ** ------------------------------------------------------------------------------------------------------------- */
    64 void compileUnicodeSet(std::string name, const UnicodeSet & set, PabloCompiler & pc, Module * module, raw_ostream & out) {
     66size_t compileUnicodeSet(std::string name, const UnicodeSet & set, PabloCompiler & pc, Module * module) {
    6567    PabloFunction function = PabloFunction::Create(std::move(name));
    6668    Encoding encoding(Encoding::Type::UTF_8, 8);
     
    8082    // Now compile the function ...
    8183    auto func = pc.compile(function, module);
    82     out << "    p.InstallExternalFunction(\"" + name + "\", &" + name + ", " + std::to_string(func.second) + ");\n";
    8384    releaseSlabAllocatorMemory();
    84 }
    85 
    86 /** ------------------------------------------------------------------------------------------------------------- *
    87  * @brief generateUCDModule
    88  ** ------------------------------------------------------------------------------------------------------------- */
    89 Module * generateUCDModule() {
     85
     86    return func.second;
     87}
     88
     89/** ------------------------------------------------------------------------------------------------------------- *
     90 * @brief writePropertyInstaller
     91 ** ------------------------------------------------------------------------------------------------------------- */
     92
     93void writePropertyInstaller(property_list && properties) {
    9094
    9195    #ifdef USE_LLVM_3_5
     
    105109    out << "#ifndef PROPERTYINSTALL\n";
    106110    out << "#define PROPERTYINSTALL\n\n";
     111    out << "#include <include/simd-lib/bitblock.hpp>\n";
    107112    out << "#include <pablo/pablo_compiler.h>\n\n";
    108     out << "void install_properties(pablo::PabloCompiler & p) {\n";
     113    out << "namespace UCD {\n\n";
     114    out << "struct Input {\n    BitBlock bit[8];\n};\n\n";
     115    out << "struct Output {\n    BitBlock bit[1];\n};\n\n";
     116    for (auto prop : properties) {
     117        out << "extern \"C\" void " + prop.first + "(const Input &, BitBlock *, Output &);\n";
     118    }
     119    out << "\nvoid install_properties(pablo::PabloCompiler & p) {\n";
     120    for (auto prop : properties) {
     121        out << "    p.InstallExternalFunction(\"" + prop.first + "\", reinterpret_cast<void *>(&" + prop.first + "), " + std::to_string(prop.second) + ");\n";
     122    }
     123    out << "}\n}\n\n#endif\n";
     124    out.close();
     125}
     126
     127
     128/** ------------------------------------------------------------------------------------------------------------- *
     129 * @brief generateUCDModule
     130 ** ------------------------------------------------------------------------------------------------------------- */
     131Module * generateUCDModule() {
     132
     133    property_list properties;
    109134
    110135    PabloCompiler pc;
     
    115140                const UnicodeSet & set = enumObj->GetCodepointSet(canonicalize_value_name(value));
    116141                std::string name = "__get_" + property_enum_name[enumObj->getPropertyCode()] + "_" + value;
    117                 compileUnicodeSet(name, set, pc, module, out);
     142                properties.emplace_back(name, compileUnicodeSet(name, set, pc, module));
    118143            }
    119144        }
     
    122147                const UnicodeSet & set = extObj->GetCodepointSet(canonicalize_value_name(value));
    123148                std::string name = "__get_" + property_enum_name[extObj->getPropertyCode()] + "_" + value;
    124                 compileUnicodeSet(name, set, pc, module, out);
     149                properties.emplace_back(name, compileUnicodeSet(name, set, pc, module));
    125150            }
    126151        }
     
    128153            const UnicodeSet & set = binObj->GetCodepointSet(Binary_ns::Y);
    129154            std::string name = "__get_" + property_enum_name[binObj->getPropertyCode()] + "_Y";
    130             compileUnicodeSet(name, set, pc, module, out);
     155            properties.emplace_back(name, compileUnicodeSet(name, set, pc, module));
    131156        }
    132157    }
    133 
    134     out << "}\n\n#endif\n"; out.close();
    135158
    136159    // Print an error message if our module is malformed in any way.
    137160    verifyModule(*module, &dbgs());
     161
     162    writePropertyInstaller(std::move(properties));
    138163
    139164    return module;
     
    214239    // Ask the target to add backend passes as necessary.
    215240    if (Target->addPassesToEmitFile(PM, FOS, TargetMachine::CGFT_ObjectFile)) {
    216         throw std::runtime_error("Target does not support generation of this file type!\n");
     241        throw std::runtime_error("Target does not support generation of object file type!\n");
    217242    }
    218243
     
    231256    }
    232257    if (ObjectFilename.empty()) {
    233         ObjectFilename = "ucd.o";
     258        ObjectFilename = "pregenerated_properties.o";
    234259    }
    235260    Module * module = generateUCDModule();
  • icGREP/icgrep-devel/icgrep/re/re_parser.cpp

    r4660 r4671  
    1717#include <re/parsefailure.h>
    1818#include <UCD/CaseFolding_txt.h>
     19#include <sstream>
    1920#include <algorithm>
    2021
     
    366367    throw_incomplete_expression_error_if_end_of_stream();
    367368    if (isSetEscapeChar(*_cursor))
    368       return parse_escaped_set();
     369      return parseEscapedSet();
    369370    else
    370371      return build_CC(parse_escaped_codepoint());
    371372}
    372373
    373 RE * makeDigitSet() {
    374     return makeName("Nd", Name::Type::UnicodeProperty);
    375 }
    376 
    377 RE * makeWhitespaceSet() {
    378     return makeName("Whitespace", Name::Type::UnicodeProperty);
    379 }
    380 
    381 RE * makeWordSet() {
    382     return makeName("word", Name::Type::UnicodeProperty);
    383 }
    384 
    385 RE * makeComplement(RE * s) {
    386   return makeDiff(makeAny(), s);
    387 }
    388 
    389 RE * makeWordBoundary () {
    390     RE * wordC = makeWordSet();
    391     return makeAlt({makeSeq({makeNegativeLookBehindAssertion(wordC), makeLookAheadAssertion(wordC)}),
    392                     makeSeq({makeLookBehindAssertion(wordC), makeNegativeLookAheadAssertion(wordC)})});
    393 }
    394 
    395 RE * makeWordNonBoundary () {
    396     RE * wordC = makeWordSet();
    397     return makeAlt({makeSeq({makeNegativeLookBehindAssertion(wordC), makeNegativeLookAheadAssertion(wordC)}),
    398                     makeSeq({makeLookBehindAssertion(wordC), makeLookAheadAssertion(wordC)})});
    399 }
    400 
    401 RE * RE_Parser::parse_escaped_set() {
     374RE * RE_Parser::parseEscapedSet() {
    402375    bool complemented = false;
    403376    RE * s;
     
    433406            if (_cursor == _end || *_cursor != '{') throw ParseFailure("Malformed property expression");
    434407            ++_cursor;
    435             s = parse_property_expression();
     408            s = parsePropertyExpression();
    436409            if (_cursor == _end || *_cursor != '}') throw ParseFailure("Malformed property expression");
    437410            ++_cursor;
    438             if (complemented) return makeComplement(s);
    439             else return s;
     411            return complemented ? makeComplement(s) : s;
    440412        default:
    441413            throw ParseFailure("Internal error");
     
    487459}
    488460
    489 Name * RE_Parser::parse_property_expression() {
     461std::string RE_Parser::canonicalize(const cursor_t begin, const cursor_t end) {
     462    std::locale loc;
     463    std::stringstream s;
     464    for (auto i = begin; i != end; ++i) {
     465        switch (*i) {
     466            case '_': case ' ': case '-':
     467                break;
     468            default:
     469                s << std::tolower(*i, loc);
     470        }
     471    }
     472    return s.str();
     473}
     474
     475Name * RE_Parser::parsePropertyExpression() {
    490476    const cursor_t start = _cursor;
    491477    while (_cursor != _end && *_cursor != '}' and *_cursor != ':' and *_cursor != '=') {
     
    500486        }
    501487        // We have a property-name = value expression
    502         return makeName(std::string(start, prop_end), std::string(val_start, _cursor), Name::Type::UnicodeProperty);
    503     }
    504     return makeName(std::string(start, _cursor), Name::Type::UnicodeProperty);
     488        return resolvePropertyExpression(canonicalize(start, prop_end), canonicalize(val_start, _cursor));
     489    }
     490    return resolvePropertyExpression(canonicalize(start, _cursor));
     491}
     492
     493Name * RE_Parser::resolvePropertyExpression(std::string value) {
     494
     495    auto key = std::make_pair("", value);
     496    auto f = mNameMap.find(key);
     497    if (f != mNameMap.end()) {
     498        return f->second;
     499    }
     500
     501    Name * property = makeName(value, Name::Type::UnicodeProperty);
     502
     503    // Try special cases of Unicode TR #18
     504    if (value == "any") {
     505        property->setDefinition(makeAny());
     506    }
     507    else if (value == "ascii") {
     508        property->setDefinition(resolvePropertyExpression("blk", "ascii"));
     509    }
     510    else if (value == "assigned") {
     511        Name * unassigned = resolvePropertyExpression("cn");
     512        property->setDefinition(makeDiff(makeAny(), unassigned));
     513    }
     514    // Now compatibility properties of UTR #18 Annex C
     515    else if (value == "xdigit") {
     516        Name * digit = resolvePropertyExpression("nd");
     517        Name * hexdigit = resolvePropertyExpression("hexdigit");
     518        property->setDefinition(makeAlt({digit, hexdigit}));
     519    }
     520    else if (value == "alnum") {
     521        Name * digit = resolvePropertyExpression("nd");
     522        Name * alpha = resolvePropertyExpression("alphabetic");
     523        property->setDefinition(makeAlt({digit, alpha}));
     524    }
     525    else if (value == "blank") {
     526        Name * space_sep = resolvePropertyExpression("space_separator");
     527        CC * tab = makeCC(0x09);
     528        property->setDefinition(makeAlt({space_sep, tab}));
     529    }
     530    else if (value == "graph") {
     531        Name * space = resolvePropertyExpression("space");
     532        Name * ctrl = resolvePropertyExpression("control");
     533        Name * surr = resolvePropertyExpression("surrogate");
     534        Name * unassigned = resolvePropertyExpression("cn");
     535        property->setDefinition(makeDiff(makeAny(), makeAlt({space, ctrl, surr, unassigned})));
     536    }
     537    else if (value == "print") {
     538        Name * graph = resolvePropertyExpression("graph");
     539        Name * space_sep = resolvePropertyExpression("space_separator");
     540        property->setDefinition(makeAlt({graph, space_sep}));
     541    }
     542    else if (value == "word") {
     543        Name * alnum = resolvePropertyExpression("alnum");
     544        Name * mark = resolvePropertyExpression("mark");
     545        Name * conn = resolvePropertyExpression("connectorpunctuation");
     546        Name * join = resolvePropertyExpression("joincontrol");
     547        property->setDefinition(makeAlt({alnum, mark, conn, join}));
     548    }
     549
     550    mNameMap.emplace(std::move(key), property);
     551
     552    return property;
     553}
     554
     555Name * RE_Parser::resolvePropertyExpression(std::string namespaceValue, std::string nameValue) {
     556
     557    auto key = std::make_pair(namespaceValue, nameValue);
     558
     559    auto f = mNameMap.find(key);
     560    if (f != mNameMap.end()) {
     561        return f->second;
     562    }
     563
     564
     565
     566    Name * property = makeName(namespaceValue, nameValue, Name::Type::UnicodeProperty);
     567
     568    mNameMap.emplace(std::move(key), property);
     569
     570    return property;
    505571}
    506572
     
    675741                        _cursor++;
    676742                    }
    677                     RE * posixSet = parse_property_expression();
     743                    RE * posixSet = parsePropertyExpression();
    678744                    if (negated) posixSet = makeComplement(posixSet);
    679745                    subexprs.push_back(posixSet);
     
    704770                throw_incomplete_expression_error_if_end_of_stream();
    705771                if (isSetEscapeChar(*_cursor)) {
    706                     subexprs.push_back(parse_escaped_set());
     772                    subexprs.push_back(parseEscapedSet());
    707773                    lastItemKind = SetItem;
    708774                }
     
    886952}
    887953   
    888    
    889 }
     954RE * RE_Parser::makeComplement(RE * s) {
     955  return makeDiff(makeAny(), s);
     956}
     957
     958RE * RE_Parser::makeWordBoundary () {
     959    RE * wordC = makeWordSet();
     960    return makeAlt({makeSeq({makeNegativeLookBehindAssertion(wordC), makeLookAheadAssertion(wordC)}),
     961                    makeSeq({makeLookBehindAssertion(wordC), makeNegativeLookAheadAssertion(wordC)})});
     962}
     963
     964RE * RE_Parser::makeWordNonBoundary () {
     965    RE * wordC = makeWordSet();
     966    return makeAlt({makeSeq({makeNegativeLookBehindAssertion(wordC), makeNegativeLookAheadAssertion(wordC)}),
     967                    makeSeq({makeLookBehindAssertion(wordC), makeLookAheadAssertion(wordC)})});
     968}
     969
     970inline Name * RE_Parser::makeDigitSet() {
     971    return resolvePropertyExpression("nd");
     972}
     973
     974inline Name * RE_Parser::makeAlphaNumeric() {
     975    return resolvePropertyExpression("alnum");
     976}
     977
     978inline Name * RE_Parser::makeWhitespaceSet() {
     979    return resolvePropertyExpression("whitespace");
     980}
     981
     982inline Name * RE_Parser::makeWordSet() {
     983    return resolvePropertyExpression("word");
     984}
     985
     986}
  • icGREP/icgrep-devel/icgrep/re/re_parser.h

    r4614 r4671  
    1515#include <list>
    1616#include <memory>
     17#include <map>
    1718
    1819namespace re {
     
    4142private:
    4243
     44    using NameMap = std::map<std::pair<std::string, std::string>, re::Name *>;
     45
    4346    typedef std::string::const_iterator cursor_t;
    4447
     
    6568    RE * parse_escaped();
    6669
    67     RE * parse_escaped_set();
     70    RE * parseEscapedSet();
    6871
    6972    codepoint_t parse_utf8_codepoint();
    7073
    71     Name * parse_property_expression();
     74    Name * parsePropertyExpression();
    7275       
     76    RE * makeComplement(RE * s);
     77    RE * makeWordBoundary ();
     78    RE * makeWordNonBoundary ();
     79    Name * makeDigitSet();
     80    Name * makeAlphaNumeric();
     81    Name * makeWhitespaceSet();
     82    Name * makeWordSet();
     83    Name * resolvePropertyExpression(std::string nameValue);
     84
     85    Name * resolvePropertyExpression(std::string namespaceValue, std::string nameValue);
     86
    7387        CharsetOperatorKind getCharsetOperator();
    7488
     
    92106    void CC_add_range(CC * cc, codepoint_t lo, codepoint_t hi);
    93107
     108    static std::string canonicalize(const cursor_t begin, const cursor_t end);
     109
    94110private:
    95111
    96112    cursor_t                    _cursor;
    97113    const cursor_t              _end;
    98     ModeFlagSet fModeFlagSet;
     114    ModeFlagSet                 fModeFlagSet;
     115    NameMap                     mNameMap;
    99116};
    100117
Note: See TracChangeset for help on using the changeset viewer.