Changeset 4841


Ignore:
Timestamp:
Oct 17, 2015, 4:25:05 PM (2 years ago)
Author:
nmedfort
Message:

Update for grapheme cluster mode and boundaries.

Location:
icGREP/icgrep-devel/icgrep
Files:
14 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/UCD/ucd_compiler.cpp

    r4819 r4841  
    452452    addTargets(names);
    453453    generateRange(defaultIfHierachy, entry);
    454     updateNames(names);
     454    updateNames(names, entry);
    455455}
    456456
     
    471471    addTargets(names);
    472472    generateRange(noIfHierachy, entry);
    473     updateNames(names);
     473    updateNames(names, entry);
    474474}
    475475
     
    488488inline void UCDCompiler::addTargets(const NameMap & names) {
    489489    for (const auto t : names) {
    490         if (isa<CC>(t.first->getDefinition())) {
     490        if (LLVM_LIKELY(isa<CC>(t.first->getDefinition()))) {
    491491            mTargetMap.emplace(cast<CC>(t.first->getDefinition()), t.second ? t.second : PabloBlock::createZeroes());
    492492        } else {
     
    500500 * @brief updateNames
    501501 ** ------------------------------------------------------------------------------------------------------------- */
    502 inline void UCDCompiler::updateNames(NameMap & names) {
     502inline void UCDCompiler::updateNames(NameMap & names, PabloBuilder & entry) {
    503503    for (auto & t : names) {
    504504        auto f = mTargetMap.find(cast<CC>(t.first->getDefinition()));
    505         t.second = f->second;
     505        if (f != mTargetMap.end()) {
     506            std::string name = t.first->getName();
     507            if (Statement * result = dyn_cast<Statement>(f->second)) {
     508                result->setName(entry.getName(name, false));
     509                t.second = result;
     510            } else {
     511                t.second = entry.createAssign(std::move(name), f->second);
     512            }
     513        }
    506514    }
    507515    mTargetMap.clear();
  • icGREP/icgrep-devel/icgrep/UCD/ucd_compiler.hpp

    r4814 r4841  
    9494    void addTargets(const NameMap & names);
    9595
    96     void updateNames(NameMap & names);
     96    void updateNames(NameMap & names, PabloBuilder & entry);
    9797
    9898private:
  • icGREP/icgrep-devel/icgrep/pablo/optimizers/pablo_simplifier.cpp

    r4835 r4841  
    2222#endif
    2323
     24/** ------------------------------------------------------------------------------------------------------------- *
     25 * @brief optimize
     26 ** ------------------------------------------------------------------------------------------------------------- */
    2427bool Simplifier::optimize(PabloFunction & function) {
    2528    eliminateRedundantCode(function.getEntryBlock());
     
    3235}
    3336
     37/** ------------------------------------------------------------------------------------------------------------- *
     38 * @brief canTriviallyFold
     39 ** ------------------------------------------------------------------------------------------------------------- */
    3440inline static PabloAST * canTriviallyFold(Statement * stmt, PabloBlock & block) {
    3541    for (unsigned i = 0; i != stmt->getNumOperands(); ++i) {
     
    8187}
    8288
     89/** ------------------------------------------------------------------------------------------------------------- *
     90 * @brief isSuperfluous
     91 ** ------------------------------------------------------------------------------------------------------------- */
    8392inline bool Simplifier::isSuperfluous(const Assign * const assign) {
    8493    for (const PabloAST * inst : assign->users()) {
     
    105114}
    106115
     116/** ------------------------------------------------------------------------------------------------------------- *
     117 * @brief demoteDefinedVar
     118 ** ------------------------------------------------------------------------------------------------------------- */
    107119inline bool demoteDefinedVar(const If * ifNode, const Assign * def) {
    108120    // If this value isn't used outside of this scope then there is no reason to allow it to escape it.
     
    121133}
    122134
     135/** ------------------------------------------------------------------------------------------------------------- *
     136 * @brief replaceReachableUsersOfWith
     137 ** ------------------------------------------------------------------------------------------------------------- */
    123138inline void replaceReachableUsersOfWith(Statement * stmt, PabloAST * expr) {
    124139    const PabloBlock * const root = stmt->getParent();
     
    146161}
    147162
     163/** ------------------------------------------------------------------------------------------------------------- *
     164 * @brief discardNestedIfBlock
     165 *
     166 * If this inner block is composed of only Boolean logic and Assign statements and there are fewer than 3
     167 * statements, just add the statements in the inner block to the current block.
     168 ** ------------------------------------------------------------------------------------------------------------- */
     169inline bool discardNestedIfBlock(const PabloBlock & pb) {
     170    unsigned computations = 0;
     171    for (const Statement * stmt : pb) {
     172        switch (stmt->getClassTypeId()) {
     173            case PabloAST::ClassTypeId::And:
     174            case PabloAST::ClassTypeId::Or:
     175            case PabloAST::ClassTypeId::Xor:
     176                if (++computations > 3) {
     177                    return false;
     178                }
     179            case PabloAST::ClassTypeId::Not:
     180            case PabloAST::ClassTypeId::Assign:
     181                break;
     182            default:
     183                return false;
     184        }
     185    }
     186    return true;
     187}
     188
     189/** ------------------------------------------------------------------------------------------------------------- *
     190 * @brief removeIdenticalEscapedValues
     191 ** ------------------------------------------------------------------------------------------------------------- */
    148192template <class ValueList>
    149193inline void removeIdenticalEscapedValues(ValueList & list) {
     
    161205}
    162206
     207/** ------------------------------------------------------------------------------------------------------------- *
     208 * @brief eliminateRedundantCode
     209 ** ------------------------------------------------------------------------------------------------------------- */
    163210void Simplifier::eliminateRedundantCode(PabloBlock & block, ExpressionTable * predecessor) {
    164211    ExpressionTable encountered(predecessor);
     
    211258                continue;
    212259            }
     260
    213261            // Otherwise check if we any Assign reports the same value as another. If so, replace all uses of the
    214262            // second with the first. This will simplify future analysis.
    215263            removeIdenticalEscapedValues(ifNode->getDefined());
     264
     265            // Finally after we've eliminated everything we can from the If body, check whether testing the If
     266            // condition will meet or exceed the cost of executing the body.
     267            if (LLVM_UNLIKELY(discardNestedIfBlock(ifNode->getBody()))) {
     268                Statement * nested = ifNode->getBody().front();
     269                while (nested) {
     270                    Statement * next = nested->removeFromParent();
     271                    nested->insertAfter(stmt);
     272                    stmt = nested;
     273                    nested = next;
     274                }
     275                ifNode->getDefined().clear();
     276                stmt = ifNode->eraseFromParent(true);
     277                continue;
     278            }
    216279
    217280        } else if (While * whileNode = dyn_cast<While>(stmt)) {
     
    244307}
    245308
    246 
     309/** ------------------------------------------------------------------------------------------------------------- *
     310 * @brief deadCodeElimination
     311 ** ------------------------------------------------------------------------------------------------------------- */
    247312void Simplifier::deadCodeElimination(PabloBlock & block) {
    248313    Statement * stmt = block.front();
     
    260325}
    261326
     327/** ------------------------------------------------------------------------------------------------------------- *
     328 * @brief eliminateRedundantEquations
     329 ** ------------------------------------------------------------------------------------------------------------- */
    262330void Simplifier::eliminateRedundantEquations(PabloBlock & block) {
    263331    Statement * stmt = block.front();
  • icGREP/icgrep-devel/icgrep/pablo/pabloAST.h

    r4775 r4841  
    213213        return mName;
    214214    }
     215    inline void setName(const String * const name) {
     216        mName = name;
     217    }
    215218    inline Statement * getNextNode() const {
    216219        return mNext;
     
    239242            }
    240243        }
    241     }
    242     inline void setName(const String * const name) {
    243         mName = name;
    244     }   
     244    } 
    245245#ifndef NDEBUG
    246246    bool noRecursiveOperand(const PabloAST * const operand);
  • icGREP/icgrep-devel/icgrep/pablo/ps_if.h

    r4764 r4841  
    4444        return mDefined;
    4545    }
    46     void addDefined(Assign * def);
    47 protected:
    4846    inline DefinedVars & getDefined() {
    4947        return mDefined;
    5048    }
     49    void addDefined(Assign * def);
     50protected:
    5151    If(PabloAST * expr, const std::initializer_list<Assign *> definedVars, PabloBlock & body);
    52 
    5352    If(PabloAST * expr, const std::vector<Assign *> & definedVars, PabloBlock & body);
    5453private:
  • icGREP/icgrep-devel/icgrep/re/printer_re.cpp

    r4829 r4841  
    8686        retVal += ") ";
    8787    }
    88     else if (const GraphemeBoundary * g = dyn_cast<GraphemeBoundary>(re))
    89     {
     88    else if (const GraphemeBoundary * g = dyn_cast<GraphemeBoundary>(re)) {
    9089        retVal = "Grapheme";
    9190        switch (g->getType()) {
     
    10099        }
    101100        retVal += "Boundary(";
    102         retVal += PrintRE(g->getExpression());
    103         retVal += ") ";
     101        if (g->getExpression()) {
     102            retVal += PrintRE(g->getExpression());
     103        }
     104        retVal += ")";
    104105    }
    105106    else if (isa<const End>(re))
  • icGREP/icgrep-devel/icgrep/re/re_analysis.cpp

    r4835 r4841  
    6363        // Eventually names might be set up for not unit length items.
    6464        return (n->getType() == Name::Type::Unicode || n->getType() == Name::Type::UnicodeProperty || n->getType() == Name::Type::Byte);
    65     } else if (const GraphemeBoundary * gb = dyn_cast<GraphemeBoundary>(re)) {
    66         return isUnicodeUnitLength(gb->getExpression());
    6765    }
    6866    return false; // otherwise
     
    127125                return std::make_pair(0, std::numeric_limits<int>::max());
    128126        }
    129     } else if (const GraphemeBoundary * gb = dyn_cast<GraphemeBoundary>(re)) {
    130         return std::make_pair(getUnicodeUnitLengthRange(gb->getExpression()).first, std::numeric_limits<int>::max());
     127    } else if (const GraphemeBoundary * gp = dyn_cast<GraphemeBoundary>(re)) {
     128        if (gp->getExpression()) {
     129            return getUnicodeUnitLengthRange(gp->getExpression());
     130        }
     131        return std::make_pair(0, 0);
    131132    }
    132133    return std::make_pair(1, 1);
  • icGREP/icgrep-devel/icgrep/re/re_compiler.cpp

    r4835 r4841  
    174174                            const UCD::ExternalProperty & ep = UCD::resolveExternalProperty(functionName);
    175175                            Call * call = mPB.createCall(Prototype::Create(functionName, std::get<1>(ep), std::get<2>(ep), std::get<0>(ep)), mCCCompiler.getBasisBits());
    176                             name->setCompiled(mPB.createAnd(call, mAny));
     176                            name->setCompiled(call);
    177177                        } else {
    178178                        #endif
     
    241241            }
    242242        } else if (GraphemeBoundary * gb = dyn_cast<GraphemeBoundary>(re)) {
    243             if (LLVM_LIKELY(gb->getGraphemeExtenderRule() == nullptr)) {
     243            if (LLVM_LIKELY(gb->getBoundaryRule() == nullptr)) {
    244244                switch (gb->getType()) {
    245245                    case GraphemeBoundary::Type::ClusterBoundary:
    246246                        if (graphemeClusterRule == nullptr) {
    247                             graphemeClusterRule = cast<Name>(resolve(generateGraphemeClusterExtenderRule()));
     247                            graphemeClusterRule = cast<Name>(resolve(generateGraphemeClusterBoundaryRule()));
    248248                        }
    249249                        gb->setBoundaryRule(graphemeClusterRule);
     
    253253                }
    254254            }
    255             gb->setExpression(resolve(gb->getExpression()));
     255            if (gb->getExpression()) {
     256                resolve(gb->getExpression());
     257            }
    256258        }
    257259        return re;
     
    261263    std::unordered_set<Name *> visited;
    262264
    263     std::function<void(RE*)> gather = [&](RE * re) {
     265    std::function<void(RE*)> gather = [&](RE * re) {       
    264266        if (Name * name = dyn_cast<Name>(re)) {
    265267            if (visited.insert(name).second) {
     
    289291            gather(ix->getRH());
    290292        } else if (GraphemeBoundary * gb = dyn_cast<GraphemeBoundary>(re)) {
    291             gather(gb->getExpression());
    292             gather(gb->getGraphemeExtenderRule());
     293            if (gb->getExpression()) {
     294                gather(gb->getExpression());
     295            }
     296            gather(gb->getBoundaryRule());
    293297        }
    294298    };
     
    302306        for (auto t : nameMap) {
    303307            if (t.second) {
    304                 mCompiledName.insert(std::make_pair(t.first, makeMarker(MarkerPosition::FinalMatchByte, mPB.createAnd(t.second, mAny))));
     308                mCompiledName.insert(std::make_pair(t.first, makeMarker(MarkerPosition::FinalMatchByte, t.second)));
    305309            }
    306310        }
     
    309313    // Now precompile any grapheme segmentation rules
    310314    if (graphemeClusterRule) {
    311         mCompiledName.insert(std::make_pair(graphemeClusterRule, compileName(graphemeClusterRule, mPB)));
     315        auto gcb = compileName(graphemeClusterRule, mPB);
     316        mCompiledName.insert(std::make_pair(graphemeClusterRule, gcb));
    312317    }
    313318    return re;
     
    318323}
    319324
    320 Name * RE_Compiler::generateGraphemeClusterExtenderRule() {
     325Name * RE_Compiler::generateGraphemeClusterBoundaryRule() {
    321326    // 3.1.1 Grapheme Cluster Boundary Rules
    322327    #define Behind(x) makeLookBehindAssertion(x)
     
    324329
    325330    RE * GCB_Control = makeName("gcb", "cn", Name::Type::UnicodeProperty);
    326 
     331    RE * GCB_CR = makeName("gcb", "cr", Name::Type::UnicodeProperty);
     332    RE * GCB_LF = makeName("gcb", "lf", Name::Type::UnicodeProperty);
     333
     334    // Break at the start and end of text.
    327335    RE * GCB_1 = makeStart();
    328336    RE * GCB_2 = makeEnd();
     337    // Do not break between a CR and LF.
     338    RE * GCB_3 = makeSeq({Behind(GCB_CR), Ahead(GCB_LF)});
     339    // Otherwise, break before and after controls.
    329340    RE * GCB_4 = Behind(GCB_Control);
    330341    RE * GCB_5 = Ahead(GCB_Control);
    331 
    332     RE * GCB_1_5 = makeAlt({GCB_1, GCB_2, makeSeq({GCB_4, GCB_5})});
     342    RE * GCB_1_5 = makeAlt({GCB_1, GCB_2, makeDiff(makeSeq({GCB_4, GCB_5}), GCB_3)});
    333343
    334344    RE * GCB_L = makeName("gcb", "l", Name::Type::UnicodeProperty);
     
    338348    RE * GCB_T = makeName("gcb", "t", Name::Type::UnicodeProperty);
    339349    RE * GCB_RI = makeName("gcb", "ri", Name::Type::UnicodeProperty);
    340     // Legacy rules
     350    // Do not break Hangul syllable sequences.
    341351    RE * GCB_6 = makeSeq({Behind(GCB_L), Ahead(makeAlt({GCB_L, GCB_V, GCB_LV, GCB_LVT}))});
    342352    RE * GCB_7 = makeSeq({Behind(makeAlt({GCB_LV, GCB_V})), Ahead(makeAlt({GCB_V, GCB_T}))});
    343353    RE * GCB_8 = makeSeq({Behind(makeAlt({GCB_LVT, GCB_T})), Ahead(GCB_T)});
     354    // Do not break between regional indicator symbols.
    344355    RE * GCB_8a = makeSeq({Behind(GCB_RI), Ahead(GCB_RI)});
     356    // Do not break before extending characters.
    345357    RE * GCB_9 = Ahead(makeName("gcb", "ex", Name::Type::UnicodeProperty));
    346     // Extended rules
     358    // Do not break before SpacingMarks, or after Prepend characters.
    347359    RE * GCB_9a = Ahead(makeName("gcb", "sm", Name::Type::UnicodeProperty));
    348360    RE * GCB_9b = Behind(makeName("gcb", "pp", Name::Type::UnicodeProperty));
    349 
    350361    RE * GCB_6_9b = makeAlt({GCB_6, GCB_7, GCB_8, GCB_8a, GCB_9, GCB_9a, GCB_9b});
     362    // Otherwise, break everywhere.
     363    RE * GCB_10 = makeSeq({Behind(makeAny()), Ahead(makeAny())});
     364
    351365    Name * gcb = makeName("gcb", Name::Type::UnicodeProperty);
    352     gcb->setDefinition(makeDiff(GCB_6_9b,  GCB_1_5));
    353 
     366    gcb->setDefinition(makeAlt({GCB_1_5, makeDiff(GCB_10, GCB_6_9b)}));
    354367    return gcb;
    355368}
     
    550563
    551564inline PabloAST * RE_Compiler::consecutive_matches(PabloAST * repeated, int length, int repeat_count, PabloBuilder & pb) {
    552         int i = length;
    553         int total = repeat_count * length;
    554         PabloAST * consecutive_i = repeated;
    555         while (i * 2 < total) {
    556             PabloAST * v = consecutive_i;
    557             PabloAST * v2 =  pb.createAdvance(v, i);
    558             i *= 2;
    559             consecutive_i = pb.createAnd(v, v2, "at" + std::to_string(i) + "of" + std::to_string(total));
    560         }       
    561         if (i < total) {
    562             PabloAST * v = consecutive_i;
    563             consecutive_i = pb.createAnd(v, pb.createAdvance(v, total - i), "at" + std::to_string(total));
    564         }
    565         return consecutive_i;
    566 }
    567 
    568 inline PabloAST * RE_Compiler::reachable(PabloAST *repeated, int repeated_lgth, int repeat_count, PabloBuilder & pb) {
    569         int i = repeated_lgth;
    570         int total_lgth = repeat_count * repeated_lgth;
    571         if (repeat_count == 0) {
    572             return repeated;
    573         }
    574         PabloAST * reachable_i = pb.createOr(repeated, pb.createAdvance(repeated, 1), "within1");
    575         while (i * 2 < total_lgth) {
    576             PabloAST * v = reachable_i;
    577             PabloAST * v2 =  pb.createAdvance(v, i);
    578             i *= 2;
    579             reachable_i = pb.createOr(v, v2, "within" + std::to_string(i));
    580         }       
    581         if (i < total_lgth) {
    582             PabloAST * v = reachable_i;
    583             reachable_i = pb.createOr(v, pb.createAdvance(v, total_lgth - i), "within" + std::to_string(total_lgth));
    584         }
    585         return reachable_i;
     565    int i = length;
     566    int total = repeat_count * length;
     567    PabloAST * consecutive_i = repeated;
     568    while (i * 2 < total) {
     569        PabloAST * v = consecutive_i;
     570        PabloAST * v2 =  pb.createAdvance(v, i);
     571        i *= 2;
     572        consecutive_i = pb.createAnd(v, v2, "at" + std::to_string(i) + "of" + std::to_string(total));
     573    }
     574    if (i < total) {
     575        PabloAST * v = consecutive_i;
     576        consecutive_i = pb.createAnd(v, pb.createAdvance(v, total - i), "at" + std::to_string(total));
     577    }
     578    return consecutive_i;
     579}
     580
     581inline PabloAST * RE_Compiler::reachable(PabloAST * repeated, int length, int repeat_count, PabloBuilder & pb) {
     582    int i = length;
     583    int total_lgth = repeat_count * length;
     584    if (repeat_count == 0) {
     585        return repeated;
     586    }
     587    PabloAST * reachable_i = pb.createOr(repeated, pb.createAdvance(repeated, 1), "within1");
     588    while (i * 2 < total_lgth) {
     589        PabloAST * v = reachable_i;
     590        PabloAST * v2 =  pb.createAdvance(v, i);
     591        i *= 2;
     592        reachable_i = pb.createOr(v, v2, "within" + std::to_string(i));
     593    }
     594    if (i < total_lgth) {
     595        PabloAST * v = reachable_i;
     596        reachable_i = pb.createOr(v, pb.createAdvance(v, total_lgth - i), "within" + std::to_string(total_lgth));
     597    }
     598    return reachable_i;
    586599}
    587600
     
    633646        }
    634647        return makeMarker(MarkerPosition::InitialPostPositionByte, mstar);
    635     }
    636     else if (isUnicodeUnitLength(repeated) && !DisableMatchStar && !DisableUnicodeMatchStar) {
     648    } else if (isUnicodeUnitLength(repeated) && !DisableMatchStar && !DisableUnicodeMatchStar) {
    637649        PabloAST * cc = markerVar(compile(repeated, pb));
    638650        PabloAST * mstar = nullptr;
    639651        PabloAST * nonFinal = mNonFinal;
    640         if (mGraphemeExtenderRule) {
    641             nonFinal = pb.createOr(nonFinal, mGraphemeExtenderRule);
     652        if (mGraphemeBoundaryRule) {
     653            nonFinal = pb.createOr(nonFinal, pb.createNot(mGraphemeBoundaryRule, "gext"));
    642654        }
    643655        cc = pb.createOr(cc, nonFinal);
     
    648660        }
    649661        PabloAST * final = mFinal;
    650         if (mGraphemeExtenderRule) {
    651             final = pb.createOr(final, pb.createNot(mGraphemeExtenderRule));
     662        if (mGraphemeBoundaryRule) {
     663            final = mGraphemeBoundaryRule;
    652664        }
    653665        return makeMarker(MarkerPosition::FinalPostPositionByte, pb.createAnd(mstar, final, "unbounded"));
     
    705717    if (UNICODE_LINE_BREAK) {
    706718        PabloAST * nextPos = markerVar(AdvanceMarker(marker, MarkerPosition::FinalPostPositionByte, pb));
    707         return makeMarker(MarkerPosition::FinalPostPositionByte, pb.createAnd(nextPos, mUnicodeLineBreak, "end"));
     719        return makeMarker(MarkerPosition::FinalPostPositionByte, pb.createAnd(nextPos, mUnicodeLineBreak, "eol"));
    708720    } else {
    709721        PabloAST * nextPos = markerVar(AdvanceMarker(marker, MarkerPosition::InitialPostPositionByte, pb));  // For LF match
     
    712724}
    713725
    714 inline MarkerType RE_Compiler::compileGraphemeBoundary(GraphemeBoundary * gb, const MarkerType marker, pablo::PabloBuilder & pb) {
    715     const auto inGraphemeBoundaryRule = mGraphemeExtenderRule;
    716     auto f = mCompiledName.find(gb->getGraphemeExtenderRule());
    717     if (LLVM_UNLIKELY(f == mCompiledName.end())) {
    718         throw std::runtime_error("Internal error: failed to locate grapheme boundary rule!");
    719     }
    720     mGraphemeExtenderRule = markerVar(f->second);
    721     assert (mGraphemeExtenderRule);
    722     MarkerType result = process(gb->getExpression(), marker, pb);
    723     mGraphemeExtenderRule = inGraphemeBoundaryRule;
    724     return result;
    725 }
    726 
    727 inline MarkerType RE_Compiler::AdvanceMarker(const MarkerType m, const MarkerPosition newpos, PabloBuilder & pb) {
    728     if (m.pos == newpos) return m;
    729     PabloAST * a = m.stream;
    730     if (m.pos == MarkerPosition::FinalMatchByte) {
    731         // Must advance the previous marker to the InitialPostPositionByte
    732         a = pb.createAdvance(a, 1, "ipp");
    733     }
    734     // Now at InitialPostPositionByte; is a further advance needed?
    735     if (newpos == MarkerPosition::FinalPostPositionByte) {
    736         // Must advance through nonfinal bytes
    737         PabloAST * nonFinal = mNonFinal;
    738         if (mGraphemeExtenderRule) {
    739             nonFinal = pb.createOr(nonFinal, mGraphemeExtenderRule, "gext");
    740         }
    741         a = pb.createScanThru(pb.createAnd(mInitial, a), nonFinal, "fpp");
    742     }
    743     return {newpos, a};
     726inline MarkerType RE_Compiler::compileGraphemeBoundary(GraphemeBoundary * gb, MarkerType marker, pablo::PabloBuilder & pb) {
     727    auto f = mCompiledName.find(gb->getBoundaryRule());
     728    assert ("Internal error: failed to locate grapheme boundary rule!" && (f != mCompiledName.end()));
     729    if (gb->getExpression()) {
     730        const auto graphemeBoundaryRule = mGraphemeBoundaryRule;
     731        mGraphemeBoundaryRule = markerVar(f->second);
     732        marker = process(gb->getExpression(), marker, pb);
     733        marker = AdvanceMarker(marker, MarkerPosition::FinalPostPositionByte, pb);
     734        mGraphemeBoundaryRule = graphemeBoundaryRule;
     735    } else {
     736        marker = AdvanceMarker(marker, MarkerPosition::FinalPostPositionByte, pb);
     737        PabloAST * rule = markerVar(f->second);
     738        if (gb->getSense() == GraphemeBoundary::Sense::Negative) {
     739            rule = pb.createNot(rule);
     740        }
     741        marker = makeMarker(MarkerPosition::FinalPostPositionByte, pb.createAnd(markerVar(marker), rule, "gb"));
     742    }
     743    return marker;
     744}
     745
     746inline MarkerType RE_Compiler::AdvanceMarker(MarkerType marker, const MarkerPosition newpos, PabloBuilder & pb) {
     747    if (marker.pos != newpos) {
     748        if (marker.pos == MarkerPosition::FinalMatchByte) {
     749            marker.stream = pb.createAdvance(marker.stream, 1, "ipp");
     750            marker.pos = MarkerPosition::InitialPostPositionByte;
     751        }
     752        if (newpos == MarkerPosition::FinalPostPositionByte) {
     753            PabloAST * nonFinal = mNonFinal;
     754            if (mGraphemeBoundaryRule) {
     755                nonFinal = pb.createOr(nonFinal, pb.createNot(mGraphemeBoundaryRule, "gext"));
     756            }
     757            marker.stream = pb.createScanThru(pb.createAnd(mInitial, marker.stream), nonFinal, "fpp");
     758            marker.pos = MarkerPosition::FinalPostPositionByte;
     759        }
     760    }
     761    return marker;
    744762}
    745763
     
    758776, mUnicodeLineBreak(nullptr)
    759777, mAny(nullptr)
    760 , mGraphemeExtenderRule(nullptr)
     778, mGraphemeBoundaryRule(nullptr)
    761779, mInitial(nullptr)
    762780, mNonFinal(nullptr)
  • icGREP/icgrep-devel/icgrep/re/re_compiler.h

    r4833 r4841  
    6363
    6464    MarkerType compile(RE * re, pablo::PabloBuilder & cg);
    65     MarkerType AdvanceMarker(const MarkerType m, const MarkerPosition newpos, pablo::PabloBuilder & pb);
    66    
    67     void AlignMarkers(MarkerType & m1, MarkerType & m2, pablo::PabloBuilder & pb);
    68    
     65
    6966    MarkerType process(RE * re, MarkerType marker, pablo::PabloBuilder & pb);
    7067    MarkerType compileName(Name * name, MarkerType marker, pablo::PabloBuilder & pb);
     
    7774    MarkerType compileIntersect(Intersect * x, MarkerType marker, pablo::PabloBuilder & cg);
    7875    pablo::PabloAST * consecutive_matches(pablo::PabloAST * repeated,  int length, int repeat_count, pablo::PabloBuilder & pb);
    79     pablo::PabloAST * reachable(pablo::PabloAST * repeated,  int repeated_lgth, int repeat_count, pablo::PabloBuilder & pb);
     76    pablo::PabloAST * reachable(pablo::PabloAST * repeated,  int length, int repeat_count, pablo::PabloBuilder & pb);
    8077    static bool isFixedLength(RE * regexp);
    8178    MarkerType processLowerBound(RE * repeated,  int lb, MarkerType marker, pablo::PabloBuilder & pb);
     
    8481    RE * resolveUnicodeProperties(RE * re);
    8582
    86     Name * generateGraphemeClusterExtenderRule();
     83    Name * generateGraphemeClusterBoundaryRule();
    8784    MarkerType compileName(Name * name, pablo::PabloBuilder & pb);
    8885    MarkerType compileAny(const MarkerType m, pablo::PabloBuilder & pb);
    8986    MarkerType compileStart(const MarkerType marker, pablo::PabloBuilder & pb);
    9087    MarkerType compileEnd(const MarkerType marker, pablo::PabloBuilder & pb);
    91     MarkerType compileGraphemeBoundary(GraphemeBoundary *gb, const MarkerType marker, pablo::PabloBuilder & pb);
     88    MarkerType compileGraphemeBoundary(GraphemeBoundary *gb, MarkerType marker, pablo::PabloBuilder & pb);
     89
     90    MarkerType AdvanceMarker(MarkerType marker, const MarkerPosition newpos, pablo::PabloBuilder & pb);
     91    void AlignMarkers(MarkerType & m1, MarkerType & m2, pablo::PabloBuilder & pb);
    9292
    9393private:
     
    9898    pablo::PabloAST *                               mUnicodeLineBreak;
    9999    pablo::PabloAST *                               mAny;
    100     pablo::PabloAST *                               mGraphemeExtenderRule;
     100    pablo::PabloAST *                               mGraphemeBoundaryRule;
    101101    pablo::PabloAST *                               mInitial;
    102102    pablo::Assign *                                 mNonFinal;   
     
    106106    std::vector<pablo::Next *>                      mLoopVariants; // <- rethink name
    107107    pablo::PabloBuilder                             mPB;
    108     std::unordered_map<Name *, MarkerType>          mCompiledName;   
     108    std::unordered_map<Name *, MarkerType>          mCompiledName;
    109109    pablo::PabloFunction &                          mFunction;
    110110};
  • icGREP/icgrep-devel/icgrep/re/re_grapheme_boundary.hpp

    r4833 r4841  
    1616
    1717    enum class Type {
    18         ClusterBoundary     // g
    19         , WordBoundary      // w
    20         , LineBreakBoundary // l
    21         , SentenceBoundary // s
     18        ClusterBoundary = 0     // g
     19        , WordBoundary = 1      // w
     20        , LineBreakBoundary = 2 // l
     21        , SentenceBoundary = 3 // s
    2222    };
    2323
     24    enum class Sense {Positive, Negative};
     25
     26    inline Type getType() const { return mType; }
     27    GraphemeBoundary::Sense getSense() const {return mSense;}
    2428    inline RE * getExpression() const {return mExpression;}
    25     inline void setExpression(RE * const r) {mExpression = r; }
    26     inline Name * getGraphemeExtenderRule() const {return mBoundaryRule;}
    27     inline void setBoundaryRule(Name * const r) {mBoundaryRule = r; }
    28     inline Type getType() const {return mType;}
     29    inline void setExpression(RE * const r) { mExpression = r; }
     30    inline Name * getBoundaryRule() const {return mBoundaryRule;}
     31    inline void setBoundaryRule(Name * const r) { mBoundaryRule = r; }
    2932
    3033protected:
    31     friend GraphemeBoundary * makeGraphemeBoundary(RE * const expression, const Type type);
    32     GraphemeBoundary(RE * const expression, const Type type) : RE(ClassTypeId::GraphemeBoundary), mExpression(expression), mBoundaryRule(nullptr), mType(type) {}
     34    friend GraphemeBoundary * makeGraphemeBoundary(const Type type, const Sense sense, RE * expression);
     35    GraphemeBoundary(const Type type, const Sense sense, RE * expression) : RE(ClassTypeId::GraphemeBoundary), mType(type), mSense(sense), mExpression(expression), mBoundaryRule(nullptr) {}
    3336    virtual ~GraphemeBoundary() {}
    3437
    3538private:
    36     RE *        mExpression;
    37     Name *      mBoundaryRule;
    38     const Type  mType;
     39    const Type      mType;
     40    const Sense     mSense;
     41    RE *            mExpression;
     42    Name *          mBoundaryRule;
    3943};
    4044
    41 inline GraphemeBoundary * makeGraphemeBoundary(RE * const expression, const GraphemeBoundary::Type type) {
    42     return new GraphemeBoundary(expression, type);
     45inline GraphemeBoundary * makeGraphemeBoundary(const GraphemeBoundary::Type type, const GraphemeBoundary::Sense sense, RE * expression) {
     46    return new GraphemeBoundary(type, sense, expression);
     47}
     48
     49inline GraphemeBoundary * makeGraphemeClusterBoundary(const GraphemeBoundary::Sense sense = GraphemeBoundary::Sense::Positive, RE * expression = nullptr) {
     50    return makeGraphemeBoundary(GraphemeBoundary::Type::ClusterBoundary, sense, expression);
    4351}
    4452
  • icGREP/icgrep-devel/icgrep/re/re_nullable.cpp

    r4829 r4841  
    4949            name->setDefinition(removeNullablePrefix(name->getDefinition()));
    5050        }
    51     } else if (GraphemeBoundary * g = dyn_cast<GraphemeBoundary>(re)) {
    52         g->setExpression(removeNullablePrefix(g->getExpression()));
    5351    }
    5452    return re;
     
    8684            name->setDefinition(removeNullableSuffix(name->getDefinition()));
    8785        }
    88     } else if (GraphemeBoundary * g = dyn_cast<GraphemeBoundary>(re)) {
    89         g->setExpression(removeNullableSuffix(g->getExpression()));
    9086    }
    9187    return re;
  • icGREP/icgrep-devel/icgrep/re/re_parser.cpp

    r4839 r4841  
    4747        throw ParseFailure("An unexpected parsing error occurred!");
    4848    }
    49     if (parser.fModeFlagSet & ModeFlagType::GRAPHEME_CLUSTER_MODE) {
    50         re = makeGraphemeBoundary(re, GraphemeBoundary::Type::ClusterBoundary);
    51     }
    5249    return re;
    5350}
     
    7269
    7370RE * RE_Parser::parse_RE() {
    74     RE * r = parse_alt();
    75     return r;
     71    return parse_alt();
    7672}
    7773
     
    8076    for (;;) {
    8177        alt.push_back(parse_seq());
    82         if (mCursor.noMore() || *mCursor != '|') {
     78        if (*mCursor != '|') {
    8379            break;
    8480        }
    8581        ++mCursor; // advance past the alternation character '|'
    8682    }
    87     if (alt.empty())
    88     {
     83    if (alt.empty()) {
    8984        throw NoRegularExpressionFound();
    9085    }
     
    106101
    107102RE * RE_Parser::parse_next_item() {
    108     RE * re = nullptr;
    109103    if (mCursor.more()) {
    110104        switch (*mCursor) {
     
    119113                return makeEnd();
    120114            case '|': case ')':
    121                 return nullptr;  // This is ugly.
     115                break;
    122116            case '*': case '+': case '?': case '{':
    123117                throw NothingToRepeat();
     
    126120                    return createCC(parse_utf8_codepoint());
    127121                }
    128                 else throw ParseFailure("Use  \\] for literal ].");
     122                throw ParseFailure("Use  \\] for literal ].");
    129123            case '}':
    130124                if (fNested) {
    131                     return nullptr;  //  a recursive invocation for a regexp in \N{...}
    132                 }
    133                 else if (LEGACY_UNESCAPED_RBRAK_RBRACE_ALLOWED) {
     125                    break;  //  a recursive invocation for a regexp in \N{...}
     126                } else if (LEGACY_UNESCAPED_RBRAK_RBRACE_ALLOWED) {
    134127                    return createCC(parse_utf8_codepoint());
    135128                }
    136                 else throw ParseFailure("Use \\} for literal }.");
     129                throw ParseFailure("Use \\} for literal }.");
    137130            case '[':
    138131                mCursor++;
     
    148141        }
    149142    }
    150     return re;
    151 }
    152 
    153 
    154 // Parse some kind of parenthesized group.  Input precondition: _cursor
     143    return nullptr;
     144}
     145
     146
     147// Parse some kind of parenthesized group.  Input precondition: mCursor
    155148// after the (
    156149RE * RE_Parser::parse_group() {
    157     RE * subexpr;
    158     RE * group_expr;
    159     ModeFlagSet savedModeFlags = fModeFlagSet;
     150    const ModeFlagSet modeFlagSet = fModeFlagSet;
     151    RE * group_expr = nullptr;
    160152    if (*mCursor == '?') {
    161153        switch (*++mCursor) {
    162154            case '#':  // comment
    163                 ++mCursor;
    164                 while (mCursor.more() && *mCursor != ')') {
    165                     ++mCursor;
    166                 }
     155                while (*++mCursor != ')');
    167156                ++mCursor;
    168157                return parse_next_item();
     
    173162            case '=':
    174163                ++mCursor;
    175                 subexpr = parse_alt();
    176                 group_expr = makeLookAheadAssertion(subexpr);
     164                group_expr = makeLookAheadAssertion(parse_alt());
    177165                break;
    178166            case '!':
    179167                ++mCursor;
    180                 subexpr = parse_alt();
    181                 group_expr = makeNegativeLookAheadAssertion(subexpr);
     168                group_expr = makeNegativeLookAheadAssertion(parse_alt());
    182169                break;
    183170            case '>':
    184171                ++mCursor;
    185                 subexpr = parse_alt();
    186                 group_expr = makeAtomicGroup(subexpr);
     172                group_expr = makeAtomicGroup(parse_alt());
    187173                break;
    188174            case '|':
    189175                ++mCursor;
    190                 subexpr = parse_alt();
    191                 group_expr = makeBranchResetGroup(subexpr);
     176                group_expr = makeBranchResetGroup(parse_alt());
    192177                break;
    193178            case '<':
     
    195180                if (*mCursor == '=') {
    196181                    ++mCursor;
    197                     subexpr = parse_alt();
    198                     group_expr = makeLookBehindAssertion(subexpr);
     182                    group_expr = makeLookBehindAssertion(parse_alt());
    199183                }
    200184                else if (*mCursor == '!') {
    201185                    ++mCursor;
    202                     subexpr = parse_alt();
    203                     group_expr = makeNegativeLookBehindAssertion(subexpr);
     186                    group_expr = makeNegativeLookBehindAssertion(parse_alt());
    204187                } else {
    205188                    throw ParseFailure("Illegal lookbehind assertion syntax.");
    206189                }
    207190                break;
    208             case '-': case 'd' : case 'i': case 'm': case 's': case 'x': case 'g': {
    209                 bool negateMode = false;
    210                 ModeFlagType modeBit;
     191            case '-': case 'd' : case 'i': case 'm': case 's': case 'x': case 'g':
    211192                while (*mCursor != ')' && *mCursor != ':') {
     193                    bool negateMode = false;
     194                    ModeFlagType modeBit;
    212195                    if (*mCursor == '-') {
    213196                        negateMode = true;
    214197                        ++mCursor;
    215198                    }
    216                     switch (*mCursor++) {
     199                    switch (*mCursor) {
    217200                        case 'i': modeBit = CASE_INSENSITIVE_MODE_FLAG; break;
    218201                        case 'g': modeBit = GRAPHEME_CLUSTER_MODE; break;
     
    223206                        default: throw ParseFailure("Unsupported mode flag.");
    224207                    }
     208                    ++mCursor;
    225209                    if (negateMode) {
    226210                        fModeFlagSet &= ~modeBit;
     
    233217                    ++mCursor;
    234218                    group_expr = parse_alt();
     219                    fModeFlagSet = modeFlagSet;
     220                    break;
    235221                } else {  // if *_cursor == ')'
    236222                    ++mCursor;
    237                     // return immediately without restoring mode flags
    238223                    return parse_next_item();
    239224                }
    240                 break;
    241             }
    242225            default:
    243226                throw ParseFailure("Illegal (? syntax.");
     
    246229        group_expr = parse_alt();
    247230    }
    248     // Restore mode flags.
    249     fModeFlagSet = savedModeFlags;
    250231    if (*mCursor != ')') {
    251232        throw ParseFailure("Closing parenthesis required.");
     
    255236}
    256237
    257 RE * RE_Parser::extend_item(RE * re) {
    258     if (mCursor.noMore()) {
    259         return re;
    260     } else {
     238inline RE * RE_Parser::extend_item(RE * re) {
     239    if (LLVM_LIKELY(mCursor.more())) {
    261240        int lb = 0, ub = 0;
     241        bool hasRep = true;
    262242        switch (*mCursor) {
    263243            case '*':
     
    277257                break;
    278258            default:
    279                 return re;
    280         }
    281         if (lb > MAX_REPETITION_LOWER_BOUND || ub > MAX_REPETITION_UPPER_BOUND) {
    282             throw ParseFailure("Bounded repetition exceeds icgrep implementation limit");
    283         }
    284         if ((ub != Rep::UNBOUNDED_REP) && (lb > ub)) {
    285             throw ParseFailure("Lower bound cannot exceed upper bound in bounded repetition");
    286         }
    287         ++mCursor;
    288         if (*mCursor == '?') { // Non-greedy qualifier
    289             // Greedy vs. non-greedy makes no difference for icgrep.
    290             ++mCursor;
    291         } else if (*mCursor == '+') {
    292             ++mCursor;
    293             throw ParseFailure("Possessive repetition is not supported in icgrep 1.0");
    294         }
    295         return makeRep(re, lb, ub);
    296     }
     259                hasRep = false;
     260        }
     261        if (hasRep) {
     262            if (lb > MAX_REPETITION_LOWER_BOUND || ub > MAX_REPETITION_UPPER_BOUND) {
     263                throw ParseFailure("Bounded repetition exceeds icgrep implementation limit");
     264            }
     265            if ((ub != Rep::UNBOUNDED_REP) && (lb > ub)) {
     266                throw ParseFailure("Lower bound cannot exceed upper bound in bounded repetition");
     267            }
     268            ++mCursor;
     269            if (*mCursor == '?') { // Non-greedy qualifier
     270                // Greedy vs. non-greedy makes no difference for icgrep.
     271                ++mCursor;
     272            } else if (*mCursor == '+') {
     273                ++mCursor;
     274                throw ParseFailure("Possessive repetition is not supported in icgrep 1.0");
     275            }
     276            re = makeRep(re, lb, ub);
     277        }
     278    }
     279    if ((fModeFlagSet & ModeFlagType::GRAPHEME_CLUSTER_MODE) != 0) {
     280        re = makeGraphemeClusterBoundary(GraphemeBoundary::Sense::Positive, re);
     281    }
     282    return re;
    297283}
    298284
     
    345331    }
    346332}
    347    
    348 inline RE * makeGraphemeClusterBoundary() {
    349     throw ParseFailure("makeGraphemeClusterBoundary() not yet supported.");
    350 }
    351    
    352333   
    353334RE * RE_Parser::parseEscapedSet() {
     
    361342            } else {
    362343                switch (*++mCursor) {
    363                     case 'g': re = makeGraphemeClusterBoundary();
     344                    case 'g':
     345                        re = makeGraphemeClusterBoundary(complemented ? GraphemeBoundary::Sense::Negative : GraphemeBoundary::Sense::Positive);
     346                        break;
    364347                    case 'w': throw ParseFailure("\\b{w} not yet supported.");
    365348                    case 'l': throw ParseFailure("\\b{l} not yet supported.");
     
    371354                }
    372355                ++mCursor;
    373                 return complemented ? makeComplement(re) : re;
     356                return re;
    374357            }
    375358        case 'd':
     
    421404            // to get to the next extended grapheme cluster boundary.
    422405            ++mCursor;
    423             return makeGraphemeBoundary(makeAny(), GraphemeBoundary::Type::ClusterBoundary);
     406            // return makeSeq({makeRep(makeAny(), 1, Rep::UNBOUNDED_REP), makeGraphemeClusterBoundary()});
     407            return makeGraphemeClusterBoundary(GraphemeBoundary::Sense::Positive, makeAny());
    424408        case 'N':
    425409            if (*++mCursor != '{') {
  • icGREP/icgrep-devel/icgrep/re/re_simplifier.cpp

    r4405 r4841  
    11#include "re_simplifier.h"
    2 #include "re_cc.h"
    3 #include "re_name.h"
    4 #include "re_start.h"
    5 #include "re_end.h"
    6 #include "re_seq.h"
    7 #include "re_alt.h"
    8 #include "re_rep.h"
    9 #include "re_diff.h"
    10 #include "re_intersect.h"
    11 #include "re_assertion.h"
     2#include <re/re_name.h>
     3#include <re/re_any.h>
     4#include <re/re_start.h>
     5#include <re/re_end.h>
     6#include <re/re_alt.h>
     7#include <re/re_cc.h>
     8#include <re/re_seq.h>
     9#include <re/re_rep.h>
     10#include <re/re_diff.h>
     11#include <re/re_intersect.h>
     12#include <re/re_assertion.h>
     13#include <re/re_grapheme_boundary.hpp>
    1214#include <algorithm>
    1315#include <memory>
     
    2426        }
    2527        re = makeAlt(list.begin(), list.end());
    26     }
    27     else if (Seq * seq = dyn_cast<Seq>(re)) {
     28    } else if (Seq * seq = dyn_cast<Seq>(re)) {
    2829        std::vector<RE*> list;
    2930        list.reserve(seq->size());
     
    3233        }
    3334        re = makeSeq(list.begin(), list.end());
    34     }
    35     else if (Assertion * a = dyn_cast<Assertion>(re)) {
     35    } else if (Assertion * a = dyn_cast<Assertion>(re)) {
    3636        re = makeAssertion(simplify(a->getAsserted()), a->getKind(), a->getSense());
    37     }
    38     else if (Rep * rep = dyn_cast<Rep>(re)) {
     37    } else if (Rep * rep = dyn_cast<Rep>(re)) {
    3938        re = makeRep(simplify(rep->getRE()), rep->getLB(), rep->getUB());
    40     }
    41     else if (Diff * diff = dyn_cast<Diff>(re)) {
     39    } else if (Diff * diff = dyn_cast<Diff>(re)) {
    4240        re = makeDiff(simplify(diff->getLH()), diff->getRH());
    43     }
    44     else if (Intersect * e = dyn_cast<Intersect>(re)) {
     41    } else if (Intersect * e = dyn_cast<Intersect>(re)) {
    4542        re = makeIntersect(simplify(e->getLH()), e->getRH());
    4643    }
  • icGREP/icgrep-devel/icgrep/re/re_simplifier.h

    r4203 r4841  
    22#define RE_SIMPLIFIER_H
    33
    4 //Regular Expressions
    5 #include "re_seq.h"
    6 #include <list>
    7 
    84namespace re {
    95
    10 class Alt;
     6class RE;
    117
    128class RE_Simplifier {
Note: See TracChangeset for help on using the changeset viewer.