Ignore:
Timestamp:
Nov 2, 2018, 7:18:31 PM (9 months ago)
Author:
nmedfort
Message:

Initial version of PipelineKernel + revised StreamSet model.

Location:
icGREP/icgrep-devel/icgrep/re
Files:
9 edited
1 copied

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/re/Unicode/decomposition.cpp

    r6181 r6184  
    55 */
    66
     7#include "decomposition.h"
    78#include <string>
    89#include <vector>
    910#include <locale>
    1011#include <codecvt>
    11 #include <re/Unicode/decomposition.h>
    1212#include <re/re_cc.h>
    1313#include <re/re_seq.h>
     
    1818#include <re/re_intersect.h>
    1919#include <re/re_assertion.h>
     20#include <re/re_toolchain.h>
    2021#include <UCD/unicode_set.h>
    2122#include <UCD/PropertyAliases.h>
     
    2425#include <UCD/PropertyValueAliases.h>
    2526#include <llvm/Support/Casting.h>
     27
    2628
    2729using namespace llvm;
     
    5961    return rslt;
    6062}
     63
     64class NFD_Transformer final : public re::RE_Transformer {
     65public:
     66    /* Transforme an RE so that all string pieces and character classes
     67     are converted to NFD form (or NFKD form if the Compatible option
     68     is used.  The options may also including case folding.  Example:
     69     NFD_Transformer(CaseFold | NFKD).transformRE(r);
     70    */
     71    NFD_Transformer(DecompositionOptions opt);
     72    /* Helpers to convert and append an individual codepoint or a u32string
     73       to an existing NFD_string.   The process performs any necessary
     74       reordering of marks of the existing string and the appended data
     75       to ensure that the result is overall in NFD form.
     76       These may be used independently of RE transformation, for example:
     77       NFD_Transformer(CaseFold).NFD_append1(s, cp);
     78    */
     79    void NFD_append1(std::u32string & NFD_string, codepoint_t cp);
     80    void NFD_append(std::u32string & NFD_string, std::u32string & to_convert);
     81protected:
     82    re::RE * transformCC(re::CC * cc) override;
     83    re::RE * transformSeq(re::Seq * seq) override;
     84    re::RE * transformGroup(re::Group * g) override;
     85    bool reordering_needed(std::u32string & prefix, codepoint_t suffix_cp);
     86private:
     87    DecompositionOptions mOptions;
     88    EnumeratedPropertyObject * decompTypeObj;
     89    StringPropertyObject * decompMappingObj;
     90    EnumeratedPropertyObject * cccObj;
     91    StringOverridePropertyObject * caseFoldObj;
     92    const UnicodeSet & canonicalMapped;
     93    const UnicodeSet & cc0Set;
     94    const UnicodeSet selfNFKD;
     95    const UnicodeSet selfCaseFold;
     96    const UnicodeSet HangulPrecomposed;
     97    std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> conv;
     98};
    6199   
    62100NFD_Transformer::NFD_Transformer(DecompositionOptions opt) :
    63     RE_Transformer("toNFD"),
    64     mOptions(opt),
    65     decompTypeObj(cast<EnumeratedPropertyObject>(property_object_table[dt])),
    66     decompMappingObj(cast<StringPropertyObject>(property_object_table[dm])),
    67     cccObj(cast<EnumeratedPropertyObject>(property_object_table[ccc])),
    68     caseFoldObj(cast<StringOverridePropertyObject>(property_object_table[cf])),
    69     canonicalMapped(decompTypeObj->GetCodepointSet(DT_ns::Can)),
    70     cc0Set(cccObj->GetCodepointSet(CCC_ns::NR)),
    71     selfNFKD(decompMappingObj->GetReflexiveSet()),
    72     selfCaseFold(caseFoldObj->GetReflexiveSet())
    73 {}
    74 
    75 static UnicodeSet HangulPrecomposed = UnicodeSet(Hangul_SBase, Hangul_SBase + Hangul_SCount - 1);
     101RE_Transformer("toNFD"),
     102mOptions(opt),
     103decompTypeObj(cast<EnumeratedPropertyObject>(property_object_table[dt])),
     104decompMappingObj(cast<StringPropertyObject>(property_object_table[dm])),
     105cccObj(cast<EnumeratedPropertyObject>(property_object_table[ccc])),
     106caseFoldObj(cast<StringOverridePropertyObject>(property_object_table[cf])),
     107canonicalMapped(decompTypeObj->GetCodepointSet(DT_ns::Can)),
     108cc0Set(cccObj->GetCodepointSet(CCC_ns::NR)),
     109selfNFKD(std::move(decompMappingObj->GetReflexiveSet())),
     110selfCaseFold(std::move(caseFoldObj->GetReflexiveSet())),
     111HangulPrecomposed(Hangul_SBase, Hangul_SBase + Hangul_SCount - 1) {
     112
     113}
    76114
    77115bool hasOption(enum DecompositionOptions optionSet, enum DecompositionOptions testOption) {
     
    112150        NFD_string.pop_back();
    113151        NFD_append(NFD_string, reordered);
    114     } else if (hasOption(mOptions, UCD::CaseFold) && !selfCaseFold.contains(cp)) {
     152    } else if (hasOption(mOptions, CaseFold) && !selfCaseFold.contains(cp)) {
    115153        std::u32string dms = conv.from_bytes(caseFoldObj->GetStringValue(cp));
    116154        NFD_append(NFD_string, dms);
    117     } else if (hasOption(mOptions, UCD::NFKD) && (!selfNFKD.contains(cp))) {
     155    } else if (hasOption(mOptions, NFKD) && (!selfNFKD.contains(cp))) {
    118156        std::u32string dms = conv.from_bytes(decompMappingObj->GetStringValue(cp));
    119157        NFD_append(NFD_string, dms);
     
    133171    re::Group::Sense sense = g->getSense();
    134172    auto r = g->getRE();
    135     UCD::DecompositionOptions saveOptions = mOptions;
     173    DecompositionOptions saveOptions = mOptions;
    136174    if (mode == re::Group::Mode::CaseInsensitiveMode) {
    137175        if (sense == re::Group::Sense::On) {
    138             mOptions = static_cast<UCD::DecompositionOptions>(mOptions | UCD::CaseFold);
     176            mOptions = static_cast<DecompositionOptions>(mOptions | CaseFold);
    139177        } else {
    140             mOptions = static_cast<UCD::DecompositionOptions>(mOptions & ~UCD::CaseFold);
     178            mOptions = static_cast<DecompositionOptions>(mOptions & ~CaseFold);
    141179        }
    142180    } else if (mode == re::Group::Mode::CompatibilityMode) {
    143181        if (sense == re::Group::Sense::On) {
    144             mOptions = static_cast<UCD::DecompositionOptions>(mOptions | UCD::NFKD);
     182            mOptions = static_cast<DecompositionOptions>(mOptions | NFKD);
    145183        } else {
    146             mOptions = static_cast<UCD::DecompositionOptions>(mOptions & ~UCD::NFKD);
     184            mOptions = static_cast<DecompositionOptions>(mOptions & ~NFKD);
    147185        }
    148186    }
     
    157195    if (cc->getAlphabet() != &cc::Unicode) return cc;
    158196    UnicodeSet mappingRequired = *cc & (canonicalMapped + HangulPrecomposed);
    159     if (hasOption(mOptions, UCD::CaseFold)) {
     197    if (hasOption(mOptions, CaseFold)) {
    160198        mappingRequired = mappingRequired + (*cc - selfCaseFold);
    161199    }
    162     if (hasOption(mOptions, UCD::NFKD)) {
     200    if (hasOption(mOptions, NFKD)) {
    163201        mappingRequired = mappingRequired + (*cc - selfNFKD);
    164202    }
     
    207245    return makeSeq(list.begin(), list.end());
    208246}
     247
     248RE * transform(RE * re, const DecompositionOptions opt) {
     249    return NFD_Transformer(opt).transformRE(re);
     250}
     251
    209252} // end namespace UCD
  • icGREP/icgrep-devel/icgrep/re/Unicode/decomposition.h

    r6174 r6184  
    88#define DECOMPOSITION_H
    99
    10 #include <string>
    11 #include <locale>
    12 #include <codecvt>
    13 #include <re/re_toolchain.h>
    14 #include <UCD/unicode_set.h>
     10enum DecompositionOptions : int { NFD = 0, CaseFold = 1, NFKD = 2 };
    1511
    16 namespace re { class RE; class CC; class Seq;}
    17 namespace UCD { class EnumeratedPropertyObject; class StringPropertyObject; class StringOverridePropertyObject;}
     12namespace re { class RE; }
    1813
    1914namespace UCD {
    20     enum DecompositionOptions : int {NFD = 0, CaseFold = 1, NFKD = 2};
    2115
    22     class NFD_Transformer : public re::RE_Transformer {
    23     public:
    24         /* Transforme an RE so that all string pieces and character classes
    25          are converted to NFD form (or NFKD form if the UCD::Compatible option
    26          is used.  The options may also including case folding.  Example:
    27          UCD::NFD_Transformer(UCD::CaseFold | UCD::NFKD).transformRE(r);
    28         */
    29         NFD_Transformer(DecompositionOptions opt = NFD);
    30         /* Helpers to convert and append an individual codepoint or a u32string
    31            to an existing NFD_string.   The process performs any necessary
    32            reordering of marks of the existing string and the appended data
    33            to ensure that the result is overall in NFD form.
    34            These may be used independently of RE transformation, for example:
    35            UCD::NFD_Transformer(UCD::CaseFold).NFD_append1(s, cp);
    36         */
    37         void NFD_append1(std::u32string & NFD_string, codepoint_t cp);
    38         void NFD_append(std::u32string & NFD_string, std::u32string & to_convert);
    39     protected:
    40         re::RE * transformCC(re::CC * cc) override;
    41         re::RE * transformSeq(re::Seq * seq) override;
    42         re::RE * transformGroup(re::Group * g) override;
    43         bool reordering_needed(std::u32string & prefix, codepoint_t suffix_cp);
    44     private:
    45         DecompositionOptions mOptions;
    46         EnumeratedPropertyObject * decompTypeObj;
    47         StringPropertyObject * decompMappingObj;
    48         EnumeratedPropertyObject * cccObj;
    49         StringOverridePropertyObject * caseFoldObj;
    50         const UnicodeSet & canonicalMapped;
    51         const UnicodeSet & cc0Set;
    52         const UnicodeSet selfNFKD;
    53         const UnicodeSet selfCaseFold;
    54         std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> conv;
    55     };
     16    re::RE * transform(re::RE * re, const DecompositionOptions opt = DecompositionOptions::NFD);
     17
    5618}
     19
     20
    5721#endif
  • icGREP/icgrep-devel/icgrep/re/collect_ccs.cpp

    r6173 r6184  
    5050
    5151
    52 std::vector<CC *> collectCCs(RE * const re, const cc::Alphabet * a, std::set<Name *> external) {
     52std::vector<CC *> collectCCs(RE * const re, const cc::Alphabet & a, std::set<Name *> external) {
    5353    std::vector<CC *> ccs;
    54     SetCollector collector(a, external, ccs);
     54    SetCollector collector(&a, external, ccs);
    5555    collector.inspectRE(re);
    5656    return ccs;
  • icGREP/icgrep-devel/icgrep/re/collect_ccs.h

    r6173 r6184  
    1212class Name;
    1313
    14 std::vector<CC *> collectCCs(RE * const re, const cc::Alphabet * a, std::set<Name *> external = {});
     14std::vector<CC *> collectCCs(RE * const re, const cc::Alphabet & a, std::set<Name *> external = {});
    1515
    1616}
  • icGREP/icgrep-devel/icgrep/re/re_compiler.h

    r6133 r6184  
    6969
    7070    void addAlphabet(cc::Alphabet * a, std::vector<pablo::PabloAST* > basis_set);
     71
     72    void addAlphabet(const std::shared_ptr<cc::Alphabet> & a, std::vector<pablo::PabloAST* > basis_set) {
     73        addAlphabet(a.get(), basis_set);
     74    }
    7175   
    7276    void addPrecompiled(std::string precompiledName, pablo::PabloAST * precompiledStream);
  • icGREP/icgrep-devel/icgrep/re/re_memoizer.cpp

    r6172 r6184  
    130130        case Type::Seq:
    131131            return lessThan(cast<Vector>(lh), cast<Vector>(rh));
    132         case Type::Any: case Type::End: case Type::Start:
     132        case Type::End: case Type::Start:
    133133            return false;
    134134        case Type::Assertion:
  • icGREP/icgrep-devel/icgrep/re/re_multiplex.cpp

    r6173 r6184  
    1010#include <re/re_group.h>
    1111#include <re/re_analysis.h>
     12#include <re/re_utility.h>
    1213#include <re/printer_re.h>
    1314#include <re/re_toolchain.h>
  • icGREP/icgrep-devel/icgrep/re/re_multiplex.h

    r6167 r6184  
    22#define RE_MULTIPLEX_H
    33
    4 #include <UCD/ucd_compiler.hpp>
    5 #include <cc/multiplex_CCs.h>
    6 #include <re/re_utility.h>
     4#include <memory>
     5
     6namespace cc { class MultiplexedAlphabet; }
    77
    88namespace re {
     9    class RE;
     10    RE * transformCCs(cc::MultiplexedAlphabet * mpx, RE * r);   
    911
    10     class RE;
    11     class Name;
    12     class CC;
    13 
    14     RE * transformCCs(cc::MultiplexedAlphabet * mpx, RE * r);
    15 
    16    
     12    inline RE * transformCCs(const std::shared_ptr<cc::MultiplexedAlphabet> & mpx, RE * r) {
     13        return transformCCs(mpx.get(), r);
     14    }
    1715}
    1816#endif
  • icGREP/icgrep-devel/icgrep/re/re_simplifier.cpp

    r6174 r6184  
    88#include <re/re_intersect.h>
    99#include <re/re_assertion.h>
     10#include <re/re_toolchain.h>
    1011#include <re/re_toolchain.h>
    1112#include <boost/container/flat_set.hpp>
  • icGREP/icgrep-devel/icgrep/re/re_toolchain.cpp

    r6181 r6184  
    9292    validateNamesDefined(r);
    9393    if (UnicodeLevel2 && validateAlphabet(&cc::Unicode, r)) {
    94         r = UCD::NFD_Transformer().transformRE(r);
     94        r = UCD::transform(r);
    9595        r = UCD::addClusterMatches(r);
    9696        r = UCD::addEquivalentCodepoints(r);
     
    272272    RE * initialRE = re;
    273273    RE * finalRE = transform(re);
    274     if ((mTransformationName != "") && (PrintOptions.isSet(ShowAllREs) || (PrintOptions.isSet(ShowREs) && (initialRE != finalRE))))  {
     274    if ((!mTransformationName.empty()) && (PrintOptions.isSet(ShowAllREs) || (PrintOptions.isSet(ShowREs) && (initialRE != finalRE))))  {
    275275        errs() << mTransformationName << ":\n" << Printer_RE::PrintRE(finalRE) << '\n';
    276276    }
     
    278278}
    279279
    280 RE * RE_Transformer::transform(RE * const from) {
     280RE * RE_Transformer::transform(RE * const from) { assert (from);
    281281    using T = RE::ClassTypeId;
    282282    RE * to = from;
Note: See TracChangeset for help on using the changeset viewer.