Changeset 5951 for icGREP/icgrep-devel


Ignore:
Timestamp:
Apr 8, 2018, 12:33:46 PM (16 months ago)
Author:
cameron
Message:

Back reference analysis in support of future back reference compilation mode

Location:
icGREP/icgrep-devel/icgrep/re
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/re/re_analysis.cpp

    r5912 r5951  
    2121#include <llvm/Support/ErrorHandling.h>
    2222#include <llvm/Support/raw_ostream.h>
    23 
     23#include <set>
    2424using namespace llvm;
    2525
     
    236236        return std::make_pair(0, 0);
    237237    } else if (const Diff * diff = dyn_cast<Diff>(re)) {
    238         const auto r1 = getUnicodeUnitLengthRange(diff->getLH());
    239         const auto r2 = getUnicodeUnitLengthRange(diff->getRH());
    240         return std::make_pair(std::min(r1.first, r2.first), std::max(r1.second, r2.second));
     238        // The range is determined by the first operand only.
     239        return getUnicodeUnitLengthRange(diff->getLH());
    241240    } else if (const Intersect * i = dyn_cast<Intersect>(re)) {
    242241        const auto r1 = getUnicodeUnitLengthRange(i->getLH());
    243242        const auto r2 = getUnicodeUnitLengthRange(i->getRH());
    244         return std::make_pair(std::min(r1.first, r2.first), std::max(r1.second, r2.second));
     243        // The matched string cannot be shorter than the largest of the min lengths
     244        // nor can it be longer than the smallest of the max lengths.
     245        return std::make_pair(std::max(r1.first, r2.first), std::min(r1.second, r2.second));
    245246    } else if (isa<CC>(re)) {
    246247        return std::make_pair(1, 1);
     
    262263    return std::make_pair(1, 1);
    263264}
     265   
     266bool isFixedLength(const RE * re) {
     267    if (isa<Alt>(re)) {
     268        auto range = getUnicodeUnitLengthRange(re);
     269        return range.first == range.second;
     270    } else if (const Seq * seq = dyn_cast<Seq>(re)) {
     271        for (const RE * e : *seq) {
     272            if (!isFixedLength(e)) return false;
     273        }
     274        return true;
     275    } else if (const Rep * rep = dyn_cast<Rep>(re)) {
     276        return (rep->getLB() == rep->getUB()) && isFixedLength(rep->getRE());
     277    } else if (const Diff * diff = dyn_cast<Diff>(re)) {
     278        return isFixedLength(diff->getLH());
     279    } else if (const Intersect * e = dyn_cast<Intersect>(re)) {
     280        return isFixedLength(e->getLH()) || isFixedLength(e->getRH());
     281    } else if (const Group * g = dyn_cast<Group>(re)) {
     282        return isFixedLength(g->getRE());
     283    } else if (const Name * n = dyn_cast<Name>(re)) {
     284        return isFixedLength(n->getDefinition());
     285    }
     286    return true; // otherwise = CC, Any, Start, End, Range, Assertion
     287}
     288
    264289   
    265 int minMatchLength(RE * re) {
    266     if (Alt * alt = dyn_cast<Alt>(re)) {
     290int minMatchLength(const RE * re) {
     291    if (const Alt * alt = dyn_cast<Alt>(re)) {
    267292        int minAltLength = INT_MAX;
    268293        for (RE * re : *alt) {
     
    270295        }
    271296        return minAltLength;
    272     } else if (Seq * seq = dyn_cast<Seq>(re)) {
     297    } else if (const Seq * seq = dyn_cast<Seq>(re)) {
    273298        int minSeqLength = 0;
    274299        for (RE * re : *seq) {
     
    276301        }
    277302        return minSeqLength;
    278     } else if (Rep * rep = dyn_cast<Rep>(re)) {
     303    } else if (const Rep * rep = dyn_cast<Rep>(re)) {
    279304        if (rep->getLB() == 0) return 0;
    280305        else return (rep->getLB()) * minMatchLength(rep->getRE());
    281306    } else if (isa<Assertion>(re)) {
    282307        return 0;
    283     } else if (Diff * diff = dyn_cast<Diff>(re)) {
     308    } else if (const Diff * diff = dyn_cast<Diff>(re)) {
    284309        return minMatchLength(diff->getLH());
    285     } else if (Intersect * e = dyn_cast<Intersect>(re)) {
     310    } else if (const Intersect * e = dyn_cast<Intersect>(re)) {
    286311        return std::min(minMatchLength(e->getLH()), minMatchLength(e->getRH()));
    287312    } else if (isa<Any>(re)) {
     
    420445
    421446void ByteTestComplexity::gatherTests(RE * re) {
    422     if (CC * cc = dyn_cast<CC>(re)) {
     447    if (const CC * cc = dyn_cast<CC>(re)) {
    423448        if (cc->getAlphabet() == &cc::Unicode) {
    424449            gatherTests(toUTF8(re));
     
    451476    } else if (const Name * n = dyn_cast<Name>(re)) {
    452477        gatherTests(n->getDefinition());
    453     } else if (Alt * alt = dyn_cast<Alt>(re)) {
     478    } else if (const Alt * alt = dyn_cast<Alt>(re)) {
    454479        for (RE * item : *alt) {
    455480            gatherTests(item);
    456481        }
    457     } else if (Seq * seq = dyn_cast<Seq>(re)) {
     482    } else if (const Seq * seq = dyn_cast<Seq>(re)) {
    458483        for (RE * item : *seq) {
    459484            gatherTests(item);
    460485        }
    461     } else if (Assertion * a = dyn_cast<Assertion>(re)) {
     486    } else if (const Assertion * a = dyn_cast<Assertion>(re)) {
    462487        gatherTests(a->getAsserted());
    463     } else if (Rep * rep = dyn_cast<Rep>(re)) {
     488    } else if (const Rep * rep = dyn_cast<Rep>(re)) {
    464489        gatherTests(rep->getRE());
    465     } else if (Diff * diff = dyn_cast<Diff>(re)) {
     490    } else if (const Diff * diff = dyn_cast<Diff>(re)) {
    466491        gatherTests(diff->getLH());
    467492        gatherTests(diff->getRH());
    468     } else if (Intersect * e = dyn_cast<Intersect>(re)) {
     493    } else if (const Intersect * e = dyn_cast<Intersect>(re)) {
    469494        gatherTests(e->getLH());
    470495        gatherTests(e->getRH());
    471     } else if (Group * g = dyn_cast<Group>(re)) {
     496    } else if (const Group * g = dyn_cast<Group>(re)) {
    472497        gatherTests(g->getRE());
    473498    }
     
    483508
    484509bool hasTriCCwithinLimit(RE * r, unsigned byteCClimit, RE * & prefixRE, RE * & suffixRE) {
    485     if (Seq * seq = dyn_cast<Seq>(r)) {
     510    if (const Seq * seq = dyn_cast<Seq>(r)) {
    486511        if (seq->size() < 4) return false;
    487512        if (!isa<CC>(*seq->begin())) return false;
     
    523548}
    524549   
     550//
     551//  Back Reference Analysis
     552//
     553//  The definite-length back-reference implementation strategy requires that
     554//  each capture that has a back-reference be fixed in length and that
     555//  that each back-reference is a fixed length from its corresponding capture.
     556//
     557//   In analyzing a sequences of regular expression elements
     558//   e_0, e_1, ..., e_i, ..., e_j, ..., e_n
     559//   we say that e_j is within fixed-length range of e_i if
     560//   for all k: i < k < j, e_k is fixed length.
     561//
     562
     563bool DefiniteLengthBackReferencesOnly(const RE * re) {
     564    if (const Alt * alt = dyn_cast<Alt>(re)) {
     565        for (const RE * a : *alt) {
     566            if (!DefiniteLengthBackReferencesOnly(a)) return false;
     567        }
     568        return true;
     569    } else if (const Seq * seq = dyn_cast<Seq>(re)) {
     570        // As we iterate through sequence elements, we keep track of captures
     571        // that are encountered and are still reachable through a series of
     572        // fixed length elements back from the current position.  As soon as
     573        // a variable length element is encounterd, the list of available_captures
     574        // is cleared.
     575        //
     576        std::set<const Name *> available_captures;
     577        for (const RE * e : *seq) {
     578            if (const Name * n = dyn_cast<Name>(e)) {
     579                auto T = n->getType();
     580                auto defn = n->getDefinition();
     581                if (T == Name::Type::Reference) {
     582                    if (available_captures.find(cast<Name>(defn)) == available_captures.end()) {
     583                        // Capture is not available.
     584                        return false;
     585                    } else {
     586                        continue;
     587                    }
     588                } else {
     589                    if (!DefiniteLengthBackReferencesOnly(defn)) return false;
     590                    if (isFixedLength(defn)) {
     591                        if (T == Name::Type::Capture) {
     592                            available_captures.emplace(n);
     593                        }
     594                    } else {
     595                        available_captures.clear();
     596                    }
     597                }
     598            }
     599            if (!DefiniteLengthBackReferencesOnly(e)) return false;
     600            if (!isFixedLength(e)) available_captures.clear();
     601        }
     602        return true;
     603    } else if (const Rep * rep = dyn_cast<Rep>(re)) {
     604        return DefiniteLengthBackReferencesOnly(rep->getRE());
     605    } else if (const Diff * diff = dyn_cast<Diff>(re)) {
     606        return DefiniteLengthBackReferencesOnly(diff->getLH()) && DefiniteLengthBackReferencesOnly(diff->getRH());
     607    } else if (const Intersect * e = dyn_cast<Intersect>(re)) {
     608        return DefiniteLengthBackReferencesOnly(e->getLH()) && DefiniteLengthBackReferencesOnly(e->getRH());
     609    } else if (const Assertion * a = dyn_cast<Assertion>(re)) {
     610        return DefiniteLengthBackReferencesOnly(a->getAsserted());
     611    } else if (const Group * g = dyn_cast<Group>(re)) {
     612        return DefiniteLengthBackReferencesOnly(g->getRE());
     613    } else if (const Name * n = dyn_cast<Name>(re)) {
     614        if (n->getType() == Name::Type::Reference) return false;
     615        return DefiniteLengthBackReferencesOnly(n->getDefinition());
     616    }
     617    return true; // otherwise = CC, Any, Start, End, Range
     618}
    525619
    526620void UndefinedNameError(const Name * n) {
  • icGREP/icgrep-devel/icgrep/re/re_analysis.h

    r5910 r5951  
    2222std::pair<int, int> getUnicodeUnitLengthRange(const RE * re);
    2323
    24 int minMatchLength(RE * re);
     24bool isFixedLength(const RE * re);
     25
     26int minMatchLength(const RE * re);
    2527
    2628bool unitBoundedRep(const RE * re);
     
    3638bool hasEndAnchor(const RE * r);
    3739   
     40bool DefiniteLengthBackReferencesOnly(const RE * re);
     41   
    3842void UndefinedNameError (const Name * n);
    3943}
  • icGREP/icgrep-devel/icgrep/re/re_toolchain.cpp

    r5945 r5951  
    2424#include <re/grapheme_clusters.h>
    2525#include <llvm/Support/raw_ostream.h>
     26#include <llvm/Support/ErrorHandling.h>
    2627#include <toolchain/toolchain.h>
    2728
     
    119120    }
    120121
     122    if (!DefiniteLengthBackReferencesOnly(r)) {
     123        llvm::report_fatal_error("Future back reference support: references must be within a fixed distance from a fixed-length capture.");
     124    }
    121125    return r;
    122126}
Note: See TracChangeset for help on using the changeset viewer.