Ignore:
Timestamp:
Dec 31, 2017, 7:22:14 PM (17 months ago)
Author:
cameron
Message:

Supporting multiple alphabets in RE compilation - initial check-in

Location:
icGREP/icgrep-devel/icgrep/re
Files:
5 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/re/printer_re.cpp

    r5786 r5816  
    2222#include <re/re_assertion.h>
    2323#include <re/re_group.h>
     24#include <cc/alphabet.h>
    2425
    2526using namespace re;
     
    5455            retVal += "]";
    5556        }
     57        retVal += "/" + re_cc->getAlphabet()->getName();
    5658    } else if (const Name* re_name = dyn_cast<const Name>(re)) {
    5759        retVal = "Name \"";
     
    6264        retVal += re_name->getName();
    6365        retVal += "\" ";
    64         if (re_name->getType() == Name::Type::Capture) {
     66        //if (re_name->getType() == Name::Type::Capture) {
    6567            retVal += "=(" + PrintRE(re_name->getDefinition()) + ")";
    66         }
     68        //}
    6769    } else if (const Range* rg = dyn_cast<const Range>(re)) {
    6870        retVal = "Range (";
  • icGREP/icgrep-devel/icgrep/re/re_compiler.cpp

    r5813 r5816  
    2929#include <re/to_utf8.h>
    3030#include <re/re_toolchain.h>        // for AlgorithmOptionIsSet, RE_Algorith...
    31 #include "cc/cc_compiler.h"         // for CC_Compiler
     31#include <cc/alphabet.h>
     32#include <cc/cc_compiler.h>
    3233#include "pablo/builder.hpp"        // for PabloBuilder
    3334#include <IR_Gen/idisa_target.h>    // for AVX2_available
     35#include <llvm/ADT/STLExtras.h> // for make_unique
     36#include <llvm/Support/raw_ostream.h>
    3437#include <llvm/Support/ErrorHandling.h>
    3538
    3639namespace pablo { class PabloAST; }
     40namespace pablo { class Var; }
    3741namespace pablo { class PabloKernel; }
    3842namespace re { class Alt; }
     
    4549
    4650namespace re {
     51
     52   
     53void RE_Compiler::addAlphabet(cc::Alphabet * a, pablo::Var * basis_set) {
     54    mAlphabets.push_back(a);
     55    mAlphabetCompilers.push_back(make_unique<cc::CC_Compiler>(mKernel, basis_set));
     56}
    4757
    4858using MarkerType = RE_Compiler::MarkerType;
     
    97107MarkerType RE_Compiler::compileCC(CC * cc, MarkerType marker, PabloBuilder & pb) {
    98108    PabloAST * nextPos = markerVar(marker);
    99     // If Unicode CCs weren't pulled out earlier, we generate the equivalent
    100     // byte sequence as an RE.
    101     if (cc->getAlphabet() == &cc::Unicode) {
    102          MarkerType m = compile(toUTF8(cc), pb);
    103          nextPos = markerVar(AdvanceMarker(marker, FinalPostPositionUnit, pb));
    104          return makeMarker(FinalMatchUnit, pb.createAnd(markerVar(m), nextPos));
    105     }
    106     if (isByteLength(cc)) {
     109    const cc::Alphabet * a = cc->getAlphabet();
     110    if (a == &cc::Byte) {
    107111        if (marker.pos == FinalMatchUnit) {
    108112            nextPos = pb.createAdvance(nextPos, 1);
    109113        }
     114        return makeMarker(FinalMatchUnit, pb.createAnd(nextPos, mCCCompiler.compileCC(cc, pb)));
     115    } else if (a == &cc::Unicode) {
     116        MarkerType m = compile(toUTF8(cc), pb);
     117        nextPos = markerVar(AdvanceMarker(marker, FinalPostPositionUnit, pb));
     118        return makeMarker(FinalMatchUnit, pb.createAnd(markerVar(m), nextPos));
    110119    } else {
    111         nextPos = markerVar(AdvanceMarker(marker, FinalPostPositionUnit, pb));
    112     }
    113     return makeMarker(FinalMatchUnit, pb.createAnd(nextPos, mCCCompiler.compileCC(cc, pb)));
     120        if (isByteLength(cc)) {
     121            if (marker.pos == FinalMatchUnit) {
     122                nextPos = pb.createAdvance(nextPos, 1);
     123            }
     124        } else {
     125            nextPos = markerVar(AdvanceMarker(marker, FinalPostPositionUnit, pb));
     126        }
     127        unsigned i = 0;
     128        while (i < mAlphabets.size() && (a != mAlphabets[i])) i++;
     129        if (i == mAlphabets.size()) llvm::report_fatal_error("Alphabet " + a->getName() + " has no CC compiler");
     130        return makeMarker(FinalMatchUnit, pb.createAnd(nextPos, mAlphabetCompilers[i]->compileCC(cc, pb)));
     131    }
    114132}
    115133
  • icGREP/icgrep-devel/icgrep/re/re_compiler.h

    r5812 r5816  
    1212#include <pablo/builder.hpp>
    1313#include <vector>       // for vector<>::iterator
    14 namespace cc { class CC_Compiler; }
     14namespace cc { class CC_Compiler; class Alphabet;}
    1515namespace pablo { class PabloAST; }
    1616namespace pablo { class PabloBuilder; }
    1717namespace pablo { class PabloKernel; }
     18namespace pablo { class Var; }
    1819namespace re { class Alt; }
    1920namespace re { class Assertion; }
     
    5152
    5253    RE_Compiler(pablo::PabloKernel * kernel, cc::CC_Compiler & ccCompiler);
     54   
     55    //
     56    // The CCs (character classes) within a regular expression are generally
     57    // expressed using a single alphabet.   But multiple alphabets may be
     58    // used under some circumstances.   For example, regular expressions for
     59    // Unicode may use both the Unicode alphabet for full Unicode characters
     60    // as well as the Byte alphabet for the individual code units of UTF-8.
     61    // In other cases, a multiplexed alphabet may be used for a certain
     62    // subexpression, for example, if the subexpression involves a local
     63    // language or a capture-backreference combination.
     64    //
     65    // Alphabets are added as needed using the addAlphabet method, giving both
     66    // the alphabet value and the set of parallel bit streams that comprise
     67    // a basis for the coded alphabet values.
     68   
     69    void addAlphabet(cc::Alphabet * a, pablo::Var * basis_set);
     70
    5371    pablo::PabloAST * compile(RE * re, pablo::PabloAST * const initialCursors = nullptr);
    5472
     
    113131
    114132    pablo::PabloKernel * const                      mKernel;
     133    std::vector<cc::Alphabet *>                     mAlphabets;
     134    std::vector<std::unique_ptr<cc::CC_Compiler>>   mAlphabetCompilers;
     135
    115136    bool                                            mCountOnly;
    116137    cc::CC_Compiler &                               mCCCompiler;
  • icGREP/icgrep-devel/icgrep/re/re_parser.cpp

    r5814 r5816  
    722722        else return createCC(cp);
    723723    }
    724     else return makeCC(parse_escaped_codepoint());
     724    else return createCC(parse_escaped_codepoint());
    725725}
    726726
  • icGREP/icgrep-devel/icgrep/re/re_toolchain.cpp

    r5806 r5816  
    159159    Var * const basis = kernel->getInputStreamVar("basis");
    160160    cc::CC_Compiler cc_compiler(kernel, basis);
    161     // compile Unicode names
    162161    RE_Compiler re_compiler(kernel, cc_compiler);
    163162    return re_compiler.compile(re_ast);
Note: See TracChangeset for help on using the changeset viewer.