Changeset 4393 for icGREP


Ignore:
Timestamp:
Jan 3, 2015, 6:17:16 AM (5 years ago)
Author:
cameron
Message:

Fixed the byte-length and Unicode-unit-length analyzers; abstracted/generalized the compiler's special cases

Location:
icGREP/icgrep-devel/icgrep
Files:
2 added
3 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/CMakeLists.txt

    r4390 r4393  
    5454
    5555add_library(PabloADT pablo/pe_and.cpp pablo/pe_not.cpp pablo/pe_or.cpp  pablo/pabloAST.cpp  pablo/pe_sel.cpp  pablo/pe_xor.cpp pablo/codegenstate.cpp  pablo/symbol_generator.cpp pablo/analysis/useanalysis.cpp pablo/printer_pablos.cpp pablo/pablo_compiler.cpp)
    56 add_library(RegExpADT re/re_re.cpp re/re_cc.cpp re/re_parser.cpp re/re_rep.cpp re/parsefailure.cpp re/re_nullable.cpp re/re_simplifier.cpp re/re_compiler.cpp re/printer_re.cpp re/re_diff.cpp re/re_intersect.cpp)
     56add_library(RegExpADT re/re_re.cpp re/re_cc.cpp re/re_parser.cpp re/re_rep.cpp re/parsefailure.cpp re/re_nullable.cpp re/re_simplifier.cpp re/re_compiler.cpp re/printer_re.cpp re/re_diff.cpp re/re_intersect.cpp re/re_analysis.cpp)
    5757add_library(CCADT cc/cc_namemap.cpp cc/cc_compiler.cpp utf_encoding.cpp utf8_encoder.cpp UCD/CaseFolding_txt.cpp)
    5858add_library(UCDlib UCD/unicode_set.cpp)
  • icGREP/icgrep-devel/icgrep/re/re_compiler.cpp

    r4390 r4393  
    1717#include <re/re_diff.h>
    1818#include <re/re_intersect.h>
     19#include <re/re_analysis.h>
    1920#include <cc/cc_namemap.hpp>
    2021#include <pablo/codegenstate.h>
     
    267268    RE * lh = diff->getLH();
    268269    RE * rh = diff->getRH();
    269     if ((isa<Any>(lh) || isa<Name>(lh)) && (isa<Any>(rh) || isa<Name>(rh))) {
     270    if (isUnicodeUnitLength(lh) && isUnicodeUnitLength(rh)) {
    270271        MarkerType t1 = process(lh, marker, pb);
    271272        MarkerType t2 = process(rh, marker, pb);
     
    279280    RE * lh = x->getLH();
    280281    RE * rh = x->getRH();
    281     if ((isa<Any>(lh) || isa<Name>(lh)) && (isa<Any>(rh) || isa<Name>(rh))) {
     282    if (isUnicodeUnitLength(lh) && isUnicodeUnitLength(rh)) {
    282283        MarkerType t1 = process(lh, marker, pb);
    283284        MarkerType t2 = process(rh, marker, pb);
     
    324325}
    325326               
    326 inline bool RE_Compiler::isFixedLength(RE * regexp) {
    327     return isa<Name>(regexp) && ((cast<Name>(regexp)->getType()) == Name::Type::Byte);
    328 }
    329 
    330327MarkerType RE_Compiler::processLowerBound(RE * repeated, int lb, MarkerType marker, PabloBlock & pb) {
    331     if (isFixedLength(repeated)) {
    332         Name * name = cast<Name>(repeated);
    333         Assign * cc_lb = consecutive(pb.createAssign("repeated", pb.createAdvance(name->getCompiled(),1)), 1, lb, pb);
     328    if (isByteLength(repeated)) {
     329        PabloAST * cc = markerVar(compile(repeated, pb), pb);
     330        Assign * cc_lb = consecutive(pb.createAssign("repeated", pb.createAdvance(cc,1)), 1, lb, pb);
    334331        PabloAST * marker_fwd = pb.createAdvance(markerVar(marker, pb), isFinalPositionMarker(marker) ? lb+ 1 : lb);
    335332        return makePostPositionMarker("lowerbound", pb.createAnd(marker_fwd, pb.createVar(cc_lb)), pb);
     
    343340
    344341MarkerType RE_Compiler::processBoundedRep(RE * repeated, int ub, MarkerType marker, PabloBlock & pb) {
    345     if (isFixedLength(repeated)) {
     342    if (isByteLength(repeated)) {
    346343        // log2 upper bound for fixed length (=1) class
    347344        // Mask out any positions that are more than ub positions from a current match.
     
    349346        Assign * nonMatch = pb.createAssign("nonmatch", pb.createNot(postPositionVar(marker, pb)));
    350347        PabloAST * upperLimitMask = pb.createNot(pb.createVar(consecutive(nonMatch, 1, ub + 1, pb)));
    351         PabloAST * rep_class_var = cast<Name>(repeated)->getCompiled();
     348        PabloAST * rep_class_var = markerVar(compile(repeated, pb), pb);
    352349        return makePostPositionMarker("bounded", pb.createAnd(pb.createMatchStar(postPositionVar(marker, pb), rep_class_var), upperLimitMask), pb);
    353350    }
     
    369366    PabloAST * base = postPositionVar(marker, pb);
    370367   
    371     if (isa<Name>(repeated)) {
    372         Name * name = cast<Name>(repeated);
    373         PabloAST * cc = character_class_strm(name, pb);
    374         if (name->getType() == Name::Type::Byte) {
    375             return makePostPositionMarker("unbounded", pb.createMatchStar(base, cc), pb);
    376         }
    377         else { // Name::Unicode and Name::UnicodeProperty
    378             return makePostPositionMarker("unbounded", pb.createAnd(pb.createMatchStar(base, pb.createOr(mNonFinal, cc)), mInitial), pb);
    379         }       
    380     }
    381     else if (isa<Any>(repeated)) {
    382         PabloAST * dot = pb.createNot(UNICODE_LINE_BREAK ? mUnicodeLineBreak : mLineFeed);
    383         return makePostPositionMarker("unbounded", pb.createAnd(pb.createMatchStar(base, pb.createOr(mNonFinal, dot)), mInitial), pb);
    384     }
    385     else if (isa<Diff>(repeated) && isa<Any>(cast<Diff>(repeated)->getLH()) && isa<Name>(cast<Diff>(repeated)->getRH())) {
    386         Name * name = cast<Name>(cast<Diff>(repeated)->getRH());
    387         PabloAST * cc = pb.createNot(pb.createOr(character_class_strm(name, pb), mLineFeed));
     368    if (isByteLength(repeated)) {
     369        PabloAST * cc = markerVar(compile(repeated, pb), pb);
     370        return makePostPositionMarker("unbounded", pb.createMatchStar(base, cc), pb);
     371    }
     372    else if (isUnicodeUnitLength(repeated)) {
     373        PabloAST * cc = markerVar(compile(repeated, pb), pb);
    388374        return makePostPositionMarker("unbounded", pb.createAnd(pb.createMatchStar(base, pb.createOr(mNonFinal, cc)), mInitial), pb);
    389375    }
  • icGREP/icgrep-devel/icgrep/resolve_properties.cpp

    r4391 r4393  
    182182                    return;
    183183                }
     184                // Now compatibility properties of UTR #18 Annex C
     185                else if (v == "xdigit") {
     186                    re::Name * Nd = re::makeName("Nd", Name::Type::UnicodeProperty);
     187                    resolveProperties(Nd);
     188                    re::Name * hexdigit = re::makeName("Hex_digit", Name::Type::UnicodeProperty);
     189                    resolveProperties(hexdigit);
     190                    std::vector<RE *> alts = {Nd, hexdigit};
     191                    name->setDefinition(re::makeAlt(alts.begin(), alts.end()));
     192                    return;
     193                }
     194                else if (v == "alnum") {
     195                    re::Name * digit = re::makeName("Nd", Name::Type::UnicodeProperty);
     196                    resolveProperties(digit);
     197                    re::Name * alpha = re::makeName("alphabetic", Name::Type::UnicodeProperty);
     198                    resolveProperties(alpha);
     199                    std::vector<RE *> alts = {digit, alpha};
     200                    name->setDefinition(re::makeAlt(alts.begin(), alts.end()));
     201                    return;
     202                }
     203                else if (v == "blank") {
     204                    re::Name * space_sep = re::makeName("space_separator", Name::Type::UnicodeProperty);
     205                    resolveProperties(space_sep);
     206                    re::CC * tab = re::makeCC(0x09);
     207                    std::vector<RE *> alts = {space_sep, tab};
     208                    name->setDefinition(re::makeAlt(alts.begin(), alts.end()));
     209                    return;
     210                }
     211                else if (v == "graph") {
     212                    re::Name * space = re::makeName("space", Name::Type::UnicodeProperty);
     213                    resolveProperties(space);
     214                    re::Name * ctrl = re::makeName("control", Name::Type::UnicodeProperty);
     215                    resolveProperties(ctrl);
     216                    re::Name * surr = re::makeName("surrogate", Name::Type::UnicodeProperty);
     217                    resolveProperties(surr);
     218                    re::Name * unassigned = re::makeName("Cn", Name::Type::UnicodeProperty);
     219                    resolveProperties(unassigned);
     220                    std::vector<RE *> alts = {space, ctrl, surr, unassigned};
     221                    re::Name * nongraph = re::makeName("[^graph]", Name::Type::UnicodeProperty);
     222                    nongraph->setDefinition(re::makeAlt(alts.begin(), alts.end()));
     223                    name->setDefinition(re::makeDiff(re::makeAny(), nongraph));
     224                    return;
     225                }
     226                else if (v == "print") {
     227                    re::Name * graph = re::makeName("graph", Name::Type::UnicodeProperty);
     228                    resolveProperties(graph);
     229                    re::Name * space_sep = re::makeName("space_separator", Name::Type::UnicodeProperty);
     230                    resolveProperties(space_sep);
     231                    std::vector<RE *> alts = {graph, space_sep};
     232                    name->setDefinition(re::makeAlt(alts.begin(), alts.end()));
     233                    return;
     234                }
     235                else if (v == "word") {
     236                    re::Name * alnum = re::makeName("alnum", Name::Type::UnicodeProperty);
     237                    resolveProperties(alnum);
     238                    re::Name * mark = re::makeName("mark", Name::Type::UnicodeProperty);
     239                    resolveProperties(mark);
     240                    re::Name * conn = re::makeName("Connector_Punctuation", Name::Type::UnicodeProperty);
     241                    resolveProperties(conn);
     242                    re::Name * join = re::makeName("Join_Control", Name::Type::UnicodeProperty);
     243                    resolveProperties(join);
     244                    std::vector<RE *> alts = {alnum,mark,conn,join};
     245                    name->setDefinition(re::makeAlt(alts.begin(), alts.end()));
     246                    return;
     247                }
    184248                else {
    185249                    throw UnicodePropertyExpressionError("Expected a general category, script or binary property name, but '" + name->getName() + "' found instead");
Note: See TracChangeset for help on using the changeset viewer.