Changeset 5682 for icGREP


Ignore:
Timestamp:
Oct 9, 2017, 3:24:00 PM (17 months ago)
Author:
cameron
Message:

String property regular expression support, including special cases for null and reflexive sets

Location:
icGREP/icgrep-devel/icgrep
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/UCD/PropertyObjects.cpp

    r5679 r5682  
    1616#include <toolchain/grep_pipeline.h>
    1717#include <util/aligned_allocator.h>
    18 #include <re/re_nullable.h>
     18#include <re/re_analysis.h>
    1919using namespace llvm;
    2020
     
    293293const UnicodeSet NumericPropertyObject::GetCodepointSetMatchingPattern(re::RE * pattern) {
    294294    UnicodeSet matched;
    295     llvm::report_fatal_error("NumericPropertyObject NaN matching issue!");
     295    // TODO:  Should we allow matches to NaN???
    296296    SetByLineNumberAccumulator accum(mExplicitCps);
    297297    grepBuffer(pattern, mStringBuffer, mBufSize, & accum);
     
    328328
    329329const UnicodeSet StringPropertyObject::GetCodepointSetMatchingPattern(re::RE * pattern) {
    330     UnicodeSet matched;
    331     if (re::RE_Nullable::isNullable(pattern)) {
     330    UnicodeSet matched = *cast<UnicodeSet>(matchableCodepoints(pattern)) & mSelfCodepointSet;
     331    if (re::matchesEmptyString(pattern)) {
    332332        matched = matched + mNullCodepointSet;
    333333    }
    334     //llvm::report_fatal_error("StringPropertyObject reflexive set issue!");
    335334    SetByLineNumberAccumulator accum(mExplicitCps);
    336335    grepBuffer(pattern, mStringBuffer, mBufSize, & accum);
  • icGREP/icgrep-devel/icgrep/re/re_analysis.cpp

    r5649 r5682  
    11#include "re_analysis.h"
     2#include <UCD/unicode_set.h>
    23#include <re/re_cc.h>
    34#include <re/re_name.h>
     
    1920
    2021namespace re {
     22   
     23bool matchesEmptyString(const RE * re) {
     24    if (const Alt * alt = dyn_cast<Alt>(re)) {
     25        for (const RE * re : *alt) {
     26            if (matchesEmptyString(re)) {
     27                return true;
     28            }
     29        }
     30        return false;
     31    } else if (const Seq * seq = dyn_cast<Seq>(re)) {
     32        for (const RE * re : *seq) {
     33            if (!matchesEmptyString(re)) {
     34                return false;
     35            }
     36        }
     37        return true;
     38    } else if (const Rep * rep = dyn_cast<Rep>(re)) {
     39        return (rep->getLB() == 0) || matchesEmptyString(rep->getRE());
     40    } else if (isa<Start>(re)) {
     41        return true;
     42    } else if (isa<End>(re)) {
     43        return true;
     44    } else if (isa<Assertion>(re)) {
     45        return false;
     46    } else if (const Diff * diff = dyn_cast<Diff>(re)) {
     47        return matchesEmptyString(diff->getLH()) && !matchesEmptyString(diff->getRH());
     48    } else if (const Intersect * e = dyn_cast<Intersect>(re)) {
     49        return matchesEmptyString(e->getLH()) && matchesEmptyString(e->getRH());
     50    } else if (isa<Any>(re)) {
     51        return false;
     52    } else if (isa<CC>(re)) {
     53        return false;
     54    } else if (const Name * n = dyn_cast<Name>(re)) {
     55        return matchesEmptyString(n->getDefinition());
     56    }
     57    return false; // otherwise
     58}
     59
     60const CC* matchableCodepoints(const RE * re) {
     61    if (const CC * cc = dyn_cast<CC>(re)) {
     62        return cc;
     63    } else if (const Alt * alt = dyn_cast<Alt>(re)) {
     64        CC * matchable = makeCC();
     65        for (const RE * re : *alt) {
     66            matchable = makeCC(matchable, matchableCodepoints(re));
     67        }
     68        return matchable;
     69    } else if (const Seq * seq = dyn_cast<Seq>(re)) {
     70        CC * matchable = makeCC();
     71        bool pastCC = false;
     72        for (const RE * re : *seq) {
     73            if (pastCC) {
     74                if (!(isa<End>(re) || matchesEmptyString(re))) return makeCC();
     75            }
     76            else if (isa<End>(re)) return makeCC();
     77            else {
     78                matchable = makeCC(matchable, matchableCodepoints(re));
     79                pastCC = !matchesEmptyString(re);
     80            }
     81        }
     82        return matchable;
     83    } else if (const Rep * rep = dyn_cast<Rep>(re)) {
     84        if ((rep->getLB() <= 1) || matchesEmptyString(rep->getRE())) {
     85            return matchableCodepoints(rep->getRE());
     86        }
     87        else return makeCC();
     88    } else if (const Diff * diff = dyn_cast<Diff>(re)) {
     89        return subtractCC(matchableCodepoints(diff->getLH()), matchableCodepoints(diff->getRH()));
     90    } else if (const Intersect * e = dyn_cast<Intersect>(re)) {
     91        return intersectCC(matchableCodepoints(diff->getLH()), matchableCodepoints(diff->getRH()));
     92    } else if (isa<Any>(re)) {
     93        return makeCC(0, 0x10FFFF);
     94    } else if (const Name * n = dyn_cast<Name>(re)) {
     95        return matchableCodepoints(n->getDefinition());
     96    }
     97    return makeCC(); // otherwise = Start, End, Assertion
     98}
     99
     100
    21101
    22102bool isByteLength(const RE * re) {
  • icGREP/icgrep-devel/icgrep/re/re_analysis.h

    r5649 r5682  
    33
    44#include <utility>
    5 namespace re { class RE; class Name;}
     5namespace re { class RE; class Name; class CC;}
    66
    77namespace re {
     8
     9// Does the RE match the empty string, considering that ^ and $ each
     10// do match an empty string.
     11bool matchesEmptyString(const RE * re);
     12
     13//  Determine the set of all codepoints cp such that the given RE
     14//  matches a string consisting of a single Unicode character whose
     15//  codepoint value is cp.
     16const CC * matchableCodepoints(const RE * re);
    817
    918bool isByteLength(const RE * re);
Note: See TracChangeset for help on using the changeset viewer.