Changeset 6161


Ignore:
Timestamp:
Sep 15, 2018, 5:56:24 PM (7 days ago)
Author:
cameron
Message:

Simplify Unicode name and anchor resolution, excludeCC

Location:
icGREP/icgrep-devel/icgrep
Files:
7 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/UCD/resolve_properties.cpp

    r5998 r6161  
    6767        } else if (value == "\\b{g}") {
    6868            RE * gcb = generateGraphemeClusterBoundaryRule();
    69             property->setDefinition(resolveUnicodeProperties(gcb));
     69            property->setDefinition(resolveUnicodeNames(gcb));
    7070            return true;
    7171        } else if (value == "^s") {  // "start anchor (^) in single-line mode"
     
    109109                // resolve a regular expression
    110110                re::RE * propValueRe = RE_Parser::parse(value.substr(1), re::DEFAULT_MODE, re::PCRE, false);
    111                 propValueRe = re::resolveNames(propValueRe);  // Recursive name resolution may be required.
     111                propValueRe = re::resolveUnicodeNames(propValueRe);  // Recursive name resolution may be required.
    112112                return propObj->GetCodepointSetMatchingPattern(propValueRe);
    113113            }
  • icGREP/icgrep-devel/icgrep/kernels/grapheme_kernel.cpp

    r5888 r6161  
    4141        nameMap.emplace(name, nullptr);
    4242    }
    43     GCB = resolveUnicodeProperties(GCB);
     43    GCB = resolveUnicodeNames(GCB);
    4444    ucdCompiler.generateWithDefaultIfHierarchy(nameMap, pb);
    4545    re_compiler.addPrecompiled("UTF8_nonfinal", pb.createExtract(getInputStreamVar("nonFinal"), pb.getInteger(0)));
  • icGREP/icgrep-devel/icgrep/re/exclude_CC.cpp

    r5929 r6161  
    1919#include <re/re_intersect.h>
    2020#include <re/re_assertion.h>
     21#include <re/re_utility.h>
    2122#include <llvm/Support/Casting.h>
    2223#include <llvm/Support/ErrorHandling.h>
     
    2526
    2627namespace re {
     28 
     29class CC_Remover : public RE_Transformer {
     30public:
     31    CC_Remover(CC * toExclude) : RE_Transformer(), mExcludedCC(toExclude) {}
     32    RE * transformCC (CC * cc) override;
     33    RE * transformName (Name * name) override;
     34private:
     35    CC * mExcludedCC;
     36};
     37   
     38RE * CC_Remover::transformCC(CC * cc) {
     39    if (intersects(mExcludedCC, cc)) return subtractCC(cc, mExcludedCC);
     40    else return cc;
     41}
    2742
    28 bool mayMatchCC(RE * re, CC * cc) {
    29     if (CC * cc0 = dyn_cast<CC>(re)) {
    30         return intersects(cc0, cc);
    31     } else if (Seq * seq = dyn_cast<Seq>(re)) {
    32         for (auto s : * seq) {
    33             if (mayMatchCC(s, cc)) return true;
    34         }
    35         return false;
    36     } else if (Alt * alt = dyn_cast<Alt>(re)) {
    37         for (auto a : * alt) {
    38             if (mayMatchCC(a, cc)) return true;
    39         }
    40         return false;
    41     } else if (Rep * rep = dyn_cast<Rep>(re)) {
    42         return mayMatchCC(rep->getRE(), cc);
    43     } else if (Group * g = dyn_cast<Group>(re)) {
    44         return mayMatchCC(g->getRE(), cc);
    45     } else if (Diff * diff = dyn_cast<Diff>(re)) {
    46         // We only need exclude from the LH operand.
    47         return mayMatchCC(diff->getLH(), cc);
    48     } else if (Intersect * e = dyn_cast<Intersect>(re)) {
    49         // We only need to check one of the operands.
    50         return mayMatchCC(e->getLH(), cc);
    51     } else if (isa<Start>(re) || isa<End>(re) || isa<Assertion>(re)) {
    52         return false;
    53     } else if (Name * n = dyn_cast<Name>(re)) {
    54         if (n->getType() ==  Name::Type::ZeroWidth) {
    55             return false;
    56         }
    57         RE * defn = n->getDefinition();
    58         return mayMatchCC(defn, cc);
    59     } else {
    60         report_fatal_error("exclude_CC: unhandled regexp type");
     43RE * CC_Remover::transformName(Name * n) {
     44    switch (n->getType()) {
     45        case Name::Type::Reference:
     46        case Name::Type::ZeroWidth:
     47            return n;
     48        case Name::Type::Capture:
     49            return makeCapture(n->getName(), transform(n->getDefinition()));
     50        default:
     51            RE * defn = n->getDefinition();
     52            if (const CC * cc0 = dyn_cast<CC>(defn)) {
     53                if (!intersects(mExcludedCC, cc0)) return n;
     54            }
     55            std::string cc_name = n->getName() + "--" + mExcludedCC->canonicalName();
     56            return makeName(cc_name, Name::Type::Unicode, transform(defn));
     57            /*
     58             return transform(defn);
     59             */
    6160    }
    6261}
    63  
    6462   
    6563RE * exclude_CC(RE * re, CC * cc) {
    66     if (!mayMatchCC(re, cc)) return re;
    67     if (CC * cc0 = dyn_cast<CC>(re)) {
    68         if (intersects(cc0, cc)) return subtractCC(cc0, cc);
    69         else return cc0;
    70     } else if (Seq * seq = dyn_cast<Seq>(re)) {
    71         std::vector<RE*> list;
    72         for (auto s : * seq) {
    73             list.push_back(exclude_CC(s, cc));
    74         }
    75         return makeSeq(list.begin(), list.end());
    76     } else if (Alt * alt = dyn_cast<Alt>(re)) {
    77         std::vector<RE*> list;
    78         for (auto a : * alt) {
    79             list.push_back(exclude_CC(a, cc));
    80         }
    81         return makeAlt(list.begin(), list.end());
    82     } else if (Rep * rep = dyn_cast<Rep>(re)) {
    83         return makeRep(exclude_CC(rep->getRE(), cc), rep->getLB(), rep->getUB());
    84     } else if (Group * g = dyn_cast<Group>(re)) {
    85         return makeGroup(g->getMode(), exclude_CC(g->getRE(), cc), g->getSense());
    86     } else if (Diff * diff = dyn_cast<Diff>(re)) {
    87         // We only need exclude from the LH operand.
    88         return makeDiff(exclude_CC(diff->getLH(), cc), diff->getRH());
    89     } else if (Intersect * e = dyn_cast<Intersect>(re)) {
    90         // We only need to exclude from one of the operands.
    91         return makeIntersect(exclude_CC(e->getLH(), cc), e->getRH());
    92     } else if (isa<Start>(re) || isa<End>(re) || isa<Assertion>(re)) {
    93         return re;
    94     } else if (Name * n = dyn_cast<Name>(re)) {
    95         switch (n->getType()) {
    96             case Name::Type::Reference:
    97             case Name::Type::ZeroWidth:
    98                 return re;
    99             case Name::Type::Capture:
    100                 return makeCapture(n->getName(), exclude_CC(n->getDefinition(), cc));
    101             default:
    102                 RE * defn = n->getDefinition();
    103                 if (const CC * cc0 = dyn_cast<CC>(defn)) {
    104                     if (!intersects(cc0, cc)) return re;
    105                 }
    106                 std::string cc_name = n->getName() + "--" + cc->canonicalName();
    107                 return makeName(cc_name, Name::Type::Unicode, exclude_CC(defn, cc));
    108                 /*
    109                 return exclude_CC(defn, cc);
    110                 */
    111         }
    112     } else {
    113         report_fatal_error("exclude_CC: unhandled regexp type");
    114     }
     64    return CC_Remover(cc).transform(re);
    11565}
    11666}
  • icGREP/icgrep-devel/icgrep/re/exclude_CC.h

    r5929 r6161  
    1212    class CC;
    1313   
    14     /*  Return true if a string matched by r may contain a character in cc. */
    15     bool mayMatchCC(RE * r, CC * cc);
    16    
    1714    /* Transform a regular expression r so that matched strings do not include
    1815       matches to any character within the given character class cc.
  • icGREP/icgrep-devel/icgrep/re/re_name_resolve.cpp

    r6160 r6161  
    2828namespace re {
    2929 
    30 struct NameResolver {
    31     RE * resolveUnicodeProperties(RE * re) {
    32         if (Name * name = dyn_cast<Name>(re)) {
    33             auto f = mMemoizer.find(name);
    34             if (f == mMemoizer.end()) {
    35                 if (LLVM_LIKELY(name->getDefinition() != nullptr)) {
    36                     name->setDefinition(resolveUnicodeProperties(name->getDefinition()));
    37                 } else if (LLVM_LIKELY(name->getType() == Name::Type::UnicodeProperty || name->getType() == Name::Type::ZeroWidth)) {
    38                     if (UCD::resolvePropertyDefinition(name)) {
    39                         name->setDefinition(resolveUnicodeProperties(name->getDefinition()));
    40                     } else {
    41                         name->setDefinition(makeCC(UCD::resolveUnicodeSet(name), &cc::Unicode));
    42                     }
    43                 } else {
    44                     UndefinedNameError(name);
    45                 }
    46                 re = mMemoizer.memoize(name);
    47             } else {
    48                 return *f;
    49             }
    50         } else if (Vector * vec = dyn_cast<Vector>(re)) {
    51             for (RE *& re : *vec) {
    52                 re = resolveUnicodeProperties(re);
    53             }
    54         } else if (Rep * rep = dyn_cast<Rep>(re)) {
    55             rep->setRE(resolveUnicodeProperties(rep->getRE()));
    56         } else if (Assertion * a = dyn_cast<Assertion>(re)) {
    57             a->setAsserted(resolveUnicodeProperties(a->getAsserted()));
    58         } else if (Range * rg = dyn_cast<Range>(re)) {
    59             return makeRange(resolveUnicodeProperties(rg->getLo()),
    60                              resolveUnicodeProperties(rg->getHi()));
    61         } else if (Diff * diff = dyn_cast<Diff>(re)) {
    62             diff->setLH(resolveUnicodeProperties(diff->getLH()));
    63             diff->setRH(resolveUnicodeProperties(diff->getRH()));
    64         } else if (Intersect * ix = dyn_cast<Intersect>(re)) {
    65             ix->setLH(resolveUnicodeProperties(ix->getLH()));
    66             ix->setRH(resolveUnicodeProperties(ix->getRH()));
    67         } else if (Group * g = dyn_cast<Group>(re)) {
    68             g->setRE(resolveUnicodeProperties(g->getRE()));
    69         }
    70         return re;
    71     }
    72    
    73     RE * resolve(RE * re) {
    74         if (Name * name = dyn_cast<Name>(re)) {
    75             auto f = mMemoizer.find(name);
    76             if (f == mMemoizer.end()) {
    77                 if (LLVM_LIKELY(name->getDefinition() != nullptr)) {
    78                     name->setDefinition(resolve(name->getDefinition()));
    79                 } else {
    80                     UndefinedNameError(name);
    81                 }
    82                 re = mMemoizer.memoize(name);
    83             } else {
    84                 return *f;
    85             }
    86         } else if (Vector * vec = dyn_cast<Vector>(re)) {
    87             for (RE *& re : *vec) {
    88                 re = resolve(re);
    89             }
    90         } else if (Rep * rep = dyn_cast<Rep>(re)) {
    91             rep->setRE(resolve(rep->getRE()));
    92         } else if (Assertion * a = dyn_cast<Assertion>(re)) {
    93             a->setAsserted(resolve(a->getAsserted()));
    94         } else if (Range * rg = dyn_cast<Range>(re)) {
    95             return makeRange(resolve(rg->getLo()), resolve(rg->getHi()));
    96         } else if (Diff * diff = dyn_cast<Diff>(re)) {
    97             diff->setLH(resolve(diff->getLH()));
    98             diff->setRH(resolve(diff->getRH()));
    99         } else if (Intersect * ix = dyn_cast<Intersect>(re)) {
    100             ix->setLH(resolve(ix->getLH()));
    101             ix->setRH(resolve(ix->getRH()));
    102         } else if (Group * g = dyn_cast<Group>(re)) {
    103             g->setRE(resolve(g->getRE()));
    104         }
    105         return re;
    106     }
    107    
     30class UnicodeNameResolver : public RE_Transformer {
     31public:
     32    UnicodeNameResolver() : RE_Transformer() {}
     33    RE * transformName(Name * name) override;
    10834private:
    109     Memoizer                mMemoizer;
     35    Memoizer mMemoizer;
    11036};
    11137   
    112     RE * resolveUnicodeProperties(RE * re) {
    113         NameResolver nameResolver;
    114         return nameResolver.resolveUnicodeProperties(re);
     38RE * UnicodeNameResolver::transformName(Name * name) {
     39    auto f = mMemoizer.find(name);
     40    if (f == mMemoizer.end()) {
     41        if (LLVM_LIKELY(name->getDefinition() != nullptr)) {
     42            name->setDefinition(transform(name->getDefinition()));
     43        } else if (LLVM_LIKELY(name->getType() == Name::Type::UnicodeProperty || name->getType() == Name::Type::ZeroWidth)) {
     44            if (UCD::resolvePropertyDefinition(name)) {
     45                name->setDefinition(transform(name->getDefinition()));
     46            } else {
     47                name->setDefinition(makeCC(UCD::resolveUnicodeSet(name), &cc::Unicode));
     48            }
     49        } else {
     50            UndefinedNameError(name);
     51        }
     52        return mMemoizer.memoize(name);
     53    } else {
     54        return *f;
    11555    }
    116    
    117     RE * resolveNames(RE * re) {
    118         NameResolver nameResolver;
    119         return nameResolver.resolve(re);
    120     }
    121    
    122    
    123    
    124 bool hasAnchor(const RE * re) {
    125     if (const Alt * alt = dyn_cast<Alt>(re)) {
    126         for (const RE * re : *alt) {
    127             if (hasAnchor(re)) {
    128                 return true;
    129             }
    130         }
    131         return false;
    132     } else if (const Seq * seq = dyn_cast<Seq>(re)) {
    133         for (const RE * re : *seq) {
    134             if (hasAnchor(re)) {
    135                 return true;
    136             }
    137         }
    138         return false;
    139     } else if (const Rep * rep = dyn_cast<Rep>(re)) {
    140         return hasAnchor(rep->getRE());
    141     } else if (isa<Start>(re)) {
    142         return true;
    143     } else if (isa<End>(re)) {
    144         return true;
    145     } else if (const Assertion * a = dyn_cast<Assertion>(re)) {
    146         return hasAnchor(a->getAsserted());
    147     } else if (const Diff * diff = dyn_cast<Diff>(re)) {
    148         return hasAnchor(diff->getLH()) || hasAnchor(diff->getRH());
    149     } else if (const Intersect * e = dyn_cast<Intersect>(re)) {
    150         return hasAnchor(e->getLH()) || hasAnchor(e->getRH());
    151     } else if (isa<Any>(re)) {
    152         return false;
    153     } else if (isa<CC>(re)) {
    154         return false;
    155     } else if (const Group * g = dyn_cast<Group>(re)) {
    156         return hasAnchor(g->getRE());
    157     } else if (const Name * n = dyn_cast<Name>(re)) {
    158         return hasAnchor(n->getDefinition());
    159     }
    160     return false; // otherwise
    16156}
    16257
    163 struct AnchorResolution {
     58RE * resolveUnicodeNames(RE * re) {
     59    return UnicodeNameResolver().transform(re);
     60}
     61
     62 
     63class AnchorResolution : public RE_Transformer {
     64public:
     65    AnchorResolution(RE * anchorRE);
     66    RE * transformStart(Start * s) override;
     67    RE * transformEnd(End * s) override;
     68
     69private:
    16470    RE * mAnchorRE;
    16571    bool mIsNegated;
    166     RE * resolve(RE * r);
    16772};
    168    
    169 RE * AnchorResolution::resolve(RE * r) {
    170     if (hasAnchor(r)) {
    171         if (const Alt * alt = dyn_cast<Alt>(r)) {
    172             std::vector<RE *> list;
    173             list.reserve(alt->size());
    174             for (RE * item : *alt) {
    175                 item = resolve(item);
    176                 list.push_back(item);
    177             }
    178             return makeAlt(list.begin(), list.end());
    179         } else if (const Seq * seq = dyn_cast<Seq>(r)) {
    180             std::vector<RE *> list;
    181             list.reserve(seq->size());
    182             for (RE * item : *seq) {
    183                 item = resolve(item);
    184                 list.push_back(item);
    185             }
    186             return makeSeq(list.begin(), list.end());
    187         } else if (Assertion * a = dyn_cast<Assertion>(r)) {
    188             return makeAssertion(resolve(a->getAsserted()), a->getKind(), a->getSense());
    189         } else if (Rep * rep = dyn_cast<Rep>(r)) {
    190             return makeRep(resolve(rep->getRE()), rep->getLB(), rep->getUB());
    191         } else if (Diff * diff = dyn_cast<Diff>(r)) {
    192             return makeDiff(resolve(diff->getLH()), resolve(diff->getRH()));
    193         } else if (Intersect * e = dyn_cast<Intersect>(r)) {
    194             return makeIntersect(resolve(e->getLH()), resolve(e->getRH()));
    195         } else if (isa<Start>(r)) {
    196             if (mIsNegated) {
    197                 return makeNegativeLookBehindAssertion(mAnchorRE);
    198             } else {
    199                 return makeAlt({makeSOT(), makeLookBehindAssertion(mAnchorRE)});
    200             }
    201         } else if (isa<End>(r)) {
    202             if (mIsNegated) {
    203                 return makeNegativeLookAheadAssertion(mAnchorRE);
    204             } else {
    205                 return makeAlt({makeEOT(), makeLookAheadAssertion(mAnchorRE)});
    206             }
    207         }
    208     }
    209     return r;
    210 }
    211 
    212 RE * resolveAnchors(RE * r, RE * breakRE) {
    213     AnchorResolution a;
     73 
     74AnchorResolution::AnchorResolution(RE * breakRE)
     75: RE_Transformer() {
    21476    if (const CC * cc = dyn_cast<CC>(breakRE)) {
    215         a.mIsNegated = true;
     77        mIsNegated = true;
    21678        if (cc->getAlphabet() == &cc::Unicode) {
    217             a.mAnchorRE = makeDiff(makeCC(0, 0x10FFFF), breakRE);
     79            mAnchorRE = makeDiff(makeCC(0, 0x10FFFF), breakRE);
    21880        } else if (cc->getAlphabet() == &cc::Byte) {
    219             a.mAnchorRE = makeDiff(makeByte(0, 0xFF), breakRE);
     81            mAnchorRE = makeDiff(makeByte(0, 0xFF), breakRE);
    22082        } else {
    22183            llvm::report_fatal_error("resolveAnchors: unexpected alphabet " + cc->getAlphabet()->getName());
    22284        }
    22385    } else {
    224         a.mIsNegated = false;
    225         a.mAnchorRE = breakRE;
     86        mIsNegated = false;
     87        mAnchorRE = breakRE;
    22688    }
    227     return a.resolve(r);
     89}
     90
     91RE * AnchorResolution::transformStart(Start * s) {
     92    if (mIsNegated) return makeNegativeLookBehindAssertion(mAnchorRE);
     93    return makeAlt({makeSOT(), makeLookBehindAssertion(mAnchorRE)});
     94}
     95
     96RE * AnchorResolution::transformEnd(End * e) {
     97    if (mIsNegated) return makeNegativeLookAheadAssertion(mAnchorRE);
     98    return makeAlt({makeEOT(), makeLookAheadAssertion(mAnchorRE)});
     99}
     100
     101RE * resolveAnchors(RE * r, RE * breakRE) {
     102    return AnchorResolution(breakRE).transform(r);
    228103}
    229104                                                       
  • icGREP/icgrep-devel/icgrep/re/re_name_resolve.h

    r5872 r6161  
    77    class Name;
    88
    9     RE * resolveUnicodeProperties(RE * re);
    10     RE * resolveNames(RE * re);
     9    RE * resolveUnicodeNames(RE * re);
    1110    RE * resolveAnchors(RE * r, RE * breakRE);
    1211
  • icGREP/icgrep-devel/icgrep/re/re_toolchain.cpp

    r6160 r6161  
    7171        errs() << "resolveGraphemeMode:\n" << Printer_RE::PrintRE(r) << '\n';
    7272    }
    73     r = re::resolveUnicodeProperties(r);
     73    r = re::resolveUnicodeNames(r);
    7474    if (PrintOptions.isSet(ShowAllREs)) {
    75         errs() << "resolveUnicodeProperties:\n" << Printer_RE::PrintRE(r) << '\n';
     75        errs() << "resolveUnicodeNames:\n" << Printer_RE::PrintRE(r) << '\n';
    7676    }
    7777    r = resolveCaseInsensitiveMode(r, globallyCaseInsensitive);
     
    106106        errs() << "Star_Normal_Form:\n" << Printer_RE::PrintRE(r) << '\n';
    107107    }
    108     r = re::resolveNames(r);
    109     if (PrintOptions.isSet(ShowAllREs)) {
    110         errs() << "Resolve Names:\n" << Printer_RE::PrintRE(r) << '\n';
    111     }
    112108    if (codegen::OptLevel > 1) {
    113109        r = RE_Minimizer::minimize(r);
Note: See TracChangeset for help on using the changeset viewer.