Ignore:
Timestamp:
Oct 17, 2015, 4:25:05 PM (3 years ago)
Author:
nmedfort
Message:

Update for grapheme cluster mode and boundaries.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/re/re_compiler.cpp

    r4835 r4841  
    174174                            const UCD::ExternalProperty & ep = UCD::resolveExternalProperty(functionName);
    175175                            Call * call = mPB.createCall(Prototype::Create(functionName, std::get<1>(ep), std::get<2>(ep), std::get<0>(ep)), mCCCompiler.getBasisBits());
    176                             name->setCompiled(mPB.createAnd(call, mAny));
     176                            name->setCompiled(call);
    177177                        } else {
    178178                        #endif
     
    241241            }
    242242        } else if (GraphemeBoundary * gb = dyn_cast<GraphemeBoundary>(re)) {
    243             if (LLVM_LIKELY(gb->getGraphemeExtenderRule() == nullptr)) {
     243            if (LLVM_LIKELY(gb->getBoundaryRule() == nullptr)) {
    244244                switch (gb->getType()) {
    245245                    case GraphemeBoundary::Type::ClusterBoundary:
    246246                        if (graphemeClusterRule == nullptr) {
    247                             graphemeClusterRule = cast<Name>(resolve(generateGraphemeClusterExtenderRule()));
     247                            graphemeClusterRule = cast<Name>(resolve(generateGraphemeClusterBoundaryRule()));
    248248                        }
    249249                        gb->setBoundaryRule(graphemeClusterRule);
     
    253253                }
    254254            }
    255             gb->setExpression(resolve(gb->getExpression()));
     255            if (gb->getExpression()) {
     256                resolve(gb->getExpression());
     257            }
    256258        }
    257259        return re;
     
    261263    std::unordered_set<Name *> visited;
    262264
    263     std::function<void(RE*)> gather = [&](RE * re) {
     265    std::function<void(RE*)> gather = [&](RE * re) {       
    264266        if (Name * name = dyn_cast<Name>(re)) {
    265267            if (visited.insert(name).second) {
     
    289291            gather(ix->getRH());
    290292        } else if (GraphemeBoundary * gb = dyn_cast<GraphemeBoundary>(re)) {
    291             gather(gb->getExpression());
    292             gather(gb->getGraphemeExtenderRule());
     293            if (gb->getExpression()) {
     294                gather(gb->getExpression());
     295            }
     296            gather(gb->getBoundaryRule());
    293297        }
    294298    };
     
    302306        for (auto t : nameMap) {
    303307            if (t.second) {
    304                 mCompiledName.insert(std::make_pair(t.first, makeMarker(MarkerPosition::FinalMatchByte, mPB.createAnd(t.second, mAny))));
     308                mCompiledName.insert(std::make_pair(t.first, makeMarker(MarkerPosition::FinalMatchByte, t.second)));
    305309            }
    306310        }
     
    309313    // Now precompile any grapheme segmentation rules
    310314    if (graphemeClusterRule) {
    311         mCompiledName.insert(std::make_pair(graphemeClusterRule, compileName(graphemeClusterRule, mPB)));
     315        auto gcb = compileName(graphemeClusterRule, mPB);
     316        mCompiledName.insert(std::make_pair(graphemeClusterRule, gcb));
    312317    }
    313318    return re;
     
    318323}
    319324
    320 Name * RE_Compiler::generateGraphemeClusterExtenderRule() {
     325Name * RE_Compiler::generateGraphemeClusterBoundaryRule() {
    321326    // 3.1.1 Grapheme Cluster Boundary Rules
    322327    #define Behind(x) makeLookBehindAssertion(x)
     
    324329
    325330    RE * GCB_Control = makeName("gcb", "cn", Name::Type::UnicodeProperty);
    326 
     331    RE * GCB_CR = makeName("gcb", "cr", Name::Type::UnicodeProperty);
     332    RE * GCB_LF = makeName("gcb", "lf", Name::Type::UnicodeProperty);
     333
     334    // Break at the start and end of text.
    327335    RE * GCB_1 = makeStart();
    328336    RE * GCB_2 = makeEnd();
     337    // Do not break between a CR and LF.
     338    RE * GCB_3 = makeSeq({Behind(GCB_CR), Ahead(GCB_LF)});
     339    // Otherwise, break before and after controls.
    329340    RE * GCB_4 = Behind(GCB_Control);
    330341    RE * GCB_5 = Ahead(GCB_Control);
    331 
    332     RE * GCB_1_5 = makeAlt({GCB_1, GCB_2, makeSeq({GCB_4, GCB_5})});
     342    RE * GCB_1_5 = makeAlt({GCB_1, GCB_2, makeDiff(makeSeq({GCB_4, GCB_5}), GCB_3)});
    333343
    334344    RE * GCB_L = makeName("gcb", "l", Name::Type::UnicodeProperty);
     
    338348    RE * GCB_T = makeName("gcb", "t", Name::Type::UnicodeProperty);
    339349    RE * GCB_RI = makeName("gcb", "ri", Name::Type::UnicodeProperty);
    340     // Legacy rules
     350    // Do not break Hangul syllable sequences.
    341351    RE * GCB_6 = makeSeq({Behind(GCB_L), Ahead(makeAlt({GCB_L, GCB_V, GCB_LV, GCB_LVT}))});
    342352    RE * GCB_7 = makeSeq({Behind(makeAlt({GCB_LV, GCB_V})), Ahead(makeAlt({GCB_V, GCB_T}))});
    343353    RE * GCB_8 = makeSeq({Behind(makeAlt({GCB_LVT, GCB_T})), Ahead(GCB_T)});
     354    // Do not break between regional indicator symbols.
    344355    RE * GCB_8a = makeSeq({Behind(GCB_RI), Ahead(GCB_RI)});
     356    // Do not break before extending characters.
    345357    RE * GCB_9 = Ahead(makeName("gcb", "ex", Name::Type::UnicodeProperty));
    346     // Extended rules
     358    // Do not break before SpacingMarks, or after Prepend characters.
    347359    RE * GCB_9a = Ahead(makeName("gcb", "sm", Name::Type::UnicodeProperty));
    348360    RE * GCB_9b = Behind(makeName("gcb", "pp", Name::Type::UnicodeProperty));
    349 
    350361    RE * GCB_6_9b = makeAlt({GCB_6, GCB_7, GCB_8, GCB_8a, GCB_9, GCB_9a, GCB_9b});
     362    // Otherwise, break everywhere.
     363    RE * GCB_10 = makeSeq({Behind(makeAny()), Ahead(makeAny())});
     364
    351365    Name * gcb = makeName("gcb", Name::Type::UnicodeProperty);
    352     gcb->setDefinition(makeDiff(GCB_6_9b,  GCB_1_5));
    353 
     366    gcb->setDefinition(makeAlt({GCB_1_5, makeDiff(GCB_10, GCB_6_9b)}));
    354367    return gcb;
    355368}
     
    550563
    551564inline PabloAST * RE_Compiler::consecutive_matches(PabloAST * repeated, int length, int repeat_count, PabloBuilder & pb) {
    552         int i = length;
    553         int total = repeat_count * length;
    554         PabloAST * consecutive_i = repeated;
    555         while (i * 2 < total) {
    556             PabloAST * v = consecutive_i;
    557             PabloAST * v2 =  pb.createAdvance(v, i);
    558             i *= 2;
    559             consecutive_i = pb.createAnd(v, v2, "at" + std::to_string(i) + "of" + std::to_string(total));
    560         }       
    561         if (i < total) {
    562             PabloAST * v = consecutive_i;
    563             consecutive_i = pb.createAnd(v, pb.createAdvance(v, total - i), "at" + std::to_string(total));
    564         }
    565         return consecutive_i;
    566 }
    567 
    568 inline PabloAST * RE_Compiler::reachable(PabloAST *repeated, int repeated_lgth, int repeat_count, PabloBuilder & pb) {
    569         int i = repeated_lgth;
    570         int total_lgth = repeat_count * repeated_lgth;
    571         if (repeat_count == 0) {
    572             return repeated;
    573         }
    574         PabloAST * reachable_i = pb.createOr(repeated, pb.createAdvance(repeated, 1), "within1");
    575         while (i * 2 < total_lgth) {
    576             PabloAST * v = reachable_i;
    577             PabloAST * v2 =  pb.createAdvance(v, i);
    578             i *= 2;
    579             reachable_i = pb.createOr(v, v2, "within" + std::to_string(i));
    580         }       
    581         if (i < total_lgth) {
    582             PabloAST * v = reachable_i;
    583             reachable_i = pb.createOr(v, pb.createAdvance(v, total_lgth - i), "within" + std::to_string(total_lgth));
    584         }
    585         return reachable_i;
     565    int i = length;
     566    int total = repeat_count * length;
     567    PabloAST * consecutive_i = repeated;
     568    while (i * 2 < total) {
     569        PabloAST * v = consecutive_i;
     570        PabloAST * v2 =  pb.createAdvance(v, i);
     571        i *= 2;
     572        consecutive_i = pb.createAnd(v, v2, "at" + std::to_string(i) + "of" + std::to_string(total));
     573    }
     574    if (i < total) {
     575        PabloAST * v = consecutive_i;
     576        consecutive_i = pb.createAnd(v, pb.createAdvance(v, total - i), "at" + std::to_string(total));
     577    }
     578    return consecutive_i;
     579}
     580
     581inline PabloAST * RE_Compiler::reachable(PabloAST * repeated, int length, int repeat_count, PabloBuilder & pb) {
     582    int i = length;
     583    int total_lgth = repeat_count * length;
     584    if (repeat_count == 0) {
     585        return repeated;
     586    }
     587    PabloAST * reachable_i = pb.createOr(repeated, pb.createAdvance(repeated, 1), "within1");
     588    while (i * 2 < total_lgth) {
     589        PabloAST * v = reachable_i;
     590        PabloAST * v2 =  pb.createAdvance(v, i);
     591        i *= 2;
     592        reachable_i = pb.createOr(v, v2, "within" + std::to_string(i));
     593    }
     594    if (i < total_lgth) {
     595        PabloAST * v = reachable_i;
     596        reachable_i = pb.createOr(v, pb.createAdvance(v, total_lgth - i), "within" + std::to_string(total_lgth));
     597    }
     598    return reachable_i;
    586599}
    587600
     
    633646        }
    634647        return makeMarker(MarkerPosition::InitialPostPositionByte, mstar);
    635     }
    636     else if (isUnicodeUnitLength(repeated) && !DisableMatchStar && !DisableUnicodeMatchStar) {
     648    } else if (isUnicodeUnitLength(repeated) && !DisableMatchStar && !DisableUnicodeMatchStar) {
    637649        PabloAST * cc = markerVar(compile(repeated, pb));
    638650        PabloAST * mstar = nullptr;
    639651        PabloAST * nonFinal = mNonFinal;
    640         if (mGraphemeExtenderRule) {
    641             nonFinal = pb.createOr(nonFinal, mGraphemeExtenderRule);
     652        if (mGraphemeBoundaryRule) {
     653            nonFinal = pb.createOr(nonFinal, pb.createNot(mGraphemeBoundaryRule, "gext"));
    642654        }
    643655        cc = pb.createOr(cc, nonFinal);
     
    648660        }
    649661        PabloAST * final = mFinal;
    650         if (mGraphemeExtenderRule) {
    651             final = pb.createOr(final, pb.createNot(mGraphemeExtenderRule));
     662        if (mGraphemeBoundaryRule) {
     663            final = mGraphemeBoundaryRule;
    652664        }
    653665        return makeMarker(MarkerPosition::FinalPostPositionByte, pb.createAnd(mstar, final, "unbounded"));
     
    705717    if (UNICODE_LINE_BREAK) {
    706718        PabloAST * nextPos = markerVar(AdvanceMarker(marker, MarkerPosition::FinalPostPositionByte, pb));
    707         return makeMarker(MarkerPosition::FinalPostPositionByte, pb.createAnd(nextPos, mUnicodeLineBreak, "end"));
     719        return makeMarker(MarkerPosition::FinalPostPositionByte, pb.createAnd(nextPos, mUnicodeLineBreak, "eol"));
    708720    } else {
    709721        PabloAST * nextPos = markerVar(AdvanceMarker(marker, MarkerPosition::InitialPostPositionByte, pb));  // For LF match
     
    712724}
    713725
    714 inline MarkerType RE_Compiler::compileGraphemeBoundary(GraphemeBoundary * gb, const MarkerType marker, pablo::PabloBuilder & pb) {
    715     const auto inGraphemeBoundaryRule = mGraphemeExtenderRule;
    716     auto f = mCompiledName.find(gb->getGraphemeExtenderRule());
    717     if (LLVM_UNLIKELY(f == mCompiledName.end())) {
    718         throw std::runtime_error("Internal error: failed to locate grapheme boundary rule!");
    719     }
    720     mGraphemeExtenderRule = markerVar(f->second);
    721     assert (mGraphemeExtenderRule);
    722     MarkerType result = process(gb->getExpression(), marker, pb);
    723     mGraphemeExtenderRule = inGraphemeBoundaryRule;
    724     return result;
    725 }
    726 
    727 inline MarkerType RE_Compiler::AdvanceMarker(const MarkerType m, const MarkerPosition newpos, PabloBuilder & pb) {
    728     if (m.pos == newpos) return m;
    729     PabloAST * a = m.stream;
    730     if (m.pos == MarkerPosition::FinalMatchByte) {
    731         // Must advance the previous marker to the InitialPostPositionByte
    732         a = pb.createAdvance(a, 1, "ipp");
    733     }
    734     // Now at InitialPostPositionByte; is a further advance needed?
    735     if (newpos == MarkerPosition::FinalPostPositionByte) {
    736         // Must advance through nonfinal bytes
    737         PabloAST * nonFinal = mNonFinal;
    738         if (mGraphemeExtenderRule) {
    739             nonFinal = pb.createOr(nonFinal, mGraphemeExtenderRule, "gext");
    740         }
    741         a = pb.createScanThru(pb.createAnd(mInitial, a), nonFinal, "fpp");
    742     }
    743     return {newpos, a};
     726inline MarkerType RE_Compiler::compileGraphemeBoundary(GraphemeBoundary * gb, MarkerType marker, pablo::PabloBuilder & pb) {
     727    auto f = mCompiledName.find(gb->getBoundaryRule());
     728    assert ("Internal error: failed to locate grapheme boundary rule!" && (f != mCompiledName.end()));
     729    if (gb->getExpression()) {
     730        const auto graphemeBoundaryRule = mGraphemeBoundaryRule;
     731        mGraphemeBoundaryRule = markerVar(f->second);
     732        marker = process(gb->getExpression(), marker, pb);
     733        marker = AdvanceMarker(marker, MarkerPosition::FinalPostPositionByte, pb);
     734        mGraphemeBoundaryRule = graphemeBoundaryRule;
     735    } else {
     736        marker = AdvanceMarker(marker, MarkerPosition::FinalPostPositionByte, pb);
     737        PabloAST * rule = markerVar(f->second);
     738        if (gb->getSense() == GraphemeBoundary::Sense::Negative) {
     739            rule = pb.createNot(rule);
     740        }
     741        marker = makeMarker(MarkerPosition::FinalPostPositionByte, pb.createAnd(markerVar(marker), rule, "gb"));
     742    }
     743    return marker;
     744}
     745
     746inline MarkerType RE_Compiler::AdvanceMarker(MarkerType marker, const MarkerPosition newpos, PabloBuilder & pb) {
     747    if (marker.pos != newpos) {
     748        if (marker.pos == MarkerPosition::FinalMatchByte) {
     749            marker.stream = pb.createAdvance(marker.stream, 1, "ipp");
     750            marker.pos = MarkerPosition::InitialPostPositionByte;
     751        }
     752        if (newpos == MarkerPosition::FinalPostPositionByte) {
     753            PabloAST * nonFinal = mNonFinal;
     754            if (mGraphemeBoundaryRule) {
     755                nonFinal = pb.createOr(nonFinal, pb.createNot(mGraphemeBoundaryRule, "gext"));
     756            }
     757            marker.stream = pb.createScanThru(pb.createAnd(mInitial, marker.stream), nonFinal, "fpp");
     758            marker.pos = MarkerPosition::FinalPostPositionByte;
     759        }
     760    }
     761    return marker;
    744762}
    745763
     
    758776, mUnicodeLineBreak(nullptr)
    759777, mAny(nullptr)
    760 , mGraphemeExtenderRule(nullptr)
     778, mGraphemeBoundaryRule(nullptr)
    761779, mInitial(nullptr)
    762780, mNonFinal(nullptr)
Note: See TracChangeset for help on using the changeset viewer.