Changeset 5797


Ignore:
Timestamp:
Dec 21, 2017, 9:01:44 PM (6 months ago)
Author:
cameron
Message:

makeByte uses the Byte alphabet

Location:
icGREP/icgrep-devel/icgrep
Files:
8 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/cc/alphabet.cpp

    r5796 r5797  
    4242
    4343const CodeUnitAlphabet Byte("Byte", 8);
    44 
     44   
     45const CodeUnitAlphabet UTF16("UTF16", 16);
     46   
    4547}
  • icGREP/icgrep-devel/icgrep/cc/alphabet.h

    r5796 r5797  
    3131public:
    3232    //  Alphabets may be formed by some subset of Unicode characters, together
    33     //  with a mapping to and from Unicode.  The mapping is defined in terms of the
    34     //  number of character codes unicodeCommon such that all character codes in the range
     33    //  with a mapping to and from Unicode.  The mapping is defined in terms of unicodeCommon:
     34    //  the number of character codes (if any) such that all character codes in the range
    3535    //  0..unicodeCommon - 1 map to the same numeric value as the corresponding Unicode
    3636    //  codepoint, together with a vector defining the Unicode codepoints for consecutive
     
    6464//  Some important alphabets are predefined.
    6565
    66 const extern UnicodeMappableAlphabet Unicode;
     66const extern UnicodeMappableAlphabet Unicode; // Unicode("Unicode", UCD::UNICODE_MAX, {})
    6767
    68 const extern UnicodeMappableAlphabet ASCII;
     68const extern UnicodeMappableAlphabet ASCII;  // ASCII("ASCII", 0x7F, {});
    6969
    70 const extern UnicodeMappableAlphabet ISO_Latin1;
     70const extern UnicodeMappableAlphabet ISO_Latin1; // ISO_Latin1("ISO_Latin1", 0xFF, {});
    7171
    72 const extern CodeUnitAlphabet Byte;
    73 
     72const extern CodeUnitAlphabet Byte; // Byte("Byte", 8);
     73   
     74const extern CodeUnitAlphabet UTF16; // UTF16("UTF16", 16);
     75   
    7476}
    7577
  • icGREP/icgrep-devel/icgrep/kernels/grep_kernel.cpp

    r5793 r5797  
    1717#include <pablo/pe_count.h>
    1818#include <pablo/pe_matchstar.h>
    19 #include "cc/cc_compiler.h"         // for CC_Compiler
    20 
     19#include <cc/cc_compiler.h>         // for CC_Compiler
     20#include <cc/alphabet.h>
    2121#include <llvm/Support/raw_ostream.h>
    2222
     
    4242    auto & pb = ccc.getBuilder();
    4343    Zeroes * const ZEROES = pb.createZeroes();
    44     PabloAST * const u8pfx = ccc.compileCC(makeCC(0xC0, 0xFF));
     44    PabloAST * const u8pfx = ccc.compileCC(makeByte(0xC0, 0xFF));
    4545
    4646
     
    5252
    5353    pb.createIf(u8pfx, it);
    54     PabloAST * const u8pfx2 = ccc.compileCC(makeCC(0xC2, 0xDF), it);
    55     PabloAST * const u8pfx3 = ccc.compileCC(makeCC(0xE0, 0xEF), it);
    56     PabloAST * const u8pfx4 = ccc.compileCC(makeCC(0xF0, 0xF4), it);
    57     PabloAST * const u8suffix = ccc.compileCC("u8suffix", makeCC(0x80, 0xBF), it);
     54    PabloAST * const u8pfx2 = ccc.compileCC(makeByte(0xC2, 0xDF), it);
     55    PabloAST * const u8pfx3 = ccc.compileCC(makeByte(0xE0, 0xEF), it);
     56    PabloAST * const u8pfx4 = ccc.compileCC(makeByte(0xF0, 0xF4), it);
     57    PabloAST * const u8suffix = ccc.compileCC("u8suffix", makeByte(0x80, 0xBF), it);
    5858   
    5959    //
     
    7474    PabloAST * const u8scope3X = it3.createOr(u8scope32, u8scope33);
    7575    it3.createAssign(anyscope, it3.createOr(anyscope, u8scope3X));
    76     PabloAST * const E0_invalid = it3.createAnd(it3.createAdvance(ccc.compileCC(makeCC(0xE0), it3), 1), ccc.compileCC(makeCC(0x80, 0x9F), it3));
    77     PabloAST * const ED_invalid = it3.createAnd(it3.createAdvance(ccc.compileCC(makeCC(0xED), it3), 1), ccc.compileCC(makeCC(0xA0, 0xBF), it3));
     76    PabloAST * const E0_invalid = it3.createAnd(it3.createAdvance(ccc.compileCC(makeByte(0xE0), it3), 1), ccc.compileCC(makeByte(0x80, 0x9F), it3));
     77    PabloAST * const ED_invalid = it3.createAnd(it3.createAdvance(ccc.compileCC(makeByte(0xED), it3), 1), ccc.compileCC(makeByte(0xA0, 0xBF), it3));
    7878    PabloAST * const EX_invalid = it3.createOr(E0_invalid, ED_invalid);
    7979    it3.createAssign(EF_invalid, EX_invalid);
     
    9191    PabloAST * const u8scope4X = it4.createOr(u8scope4nonfinal, u8scope44);
    9292    it4.createAssign(anyscope, it4.createOr(anyscope, u8scope4X));
    93     PabloAST * const F0_invalid = it4.createAnd(it4.createAdvance(ccc.compileCC(makeCC(0xF0), it4), 1), ccc.compileCC(makeCC(0x80, 0x8F), it4));
    94     PabloAST * const F4_invalid = it4.createAnd(it4.createAdvance(ccc.compileCC(makeCC(0xF4), it4), 1), ccc.compileCC(makeCC(0x90, 0xBF), it4));
     93    PabloAST * const F0_invalid = it4.createAnd(it4.createAdvance(ccc.compileCC(makeByte(0xF0), it4), 1), ccc.compileCC(makeByte(0x80, 0x8F), it4));
     94    PabloAST * const F4_invalid = it4.createAnd(it4.createAdvance(ccc.compileCC(makeByte(0xF4), it4), 1), ccc.compileCC(makeByte(0x90, 0xBF), it4));
    9595    PabloAST * const FX_invalid = it4.createOr(F0_invalid, F4_invalid);
    9696    it4.createAssign(EF_invalid, it4.createOr(EF_invalid, FX_invalid));
     
    112112    it.createAssign(nonFinal, it.createAnd(nonFinal, u8valid));
    113113   
    114     PabloAST * u8single = pb.createAnd(ccc.compileCC(makeCC(0x00, 0x7F)), pb.createNot(u8invalid));
     114    PabloAST * u8single = pb.createAnd(ccc.compileCC(makeByte(0x00, 0x7F)), pb.createNot(u8invalid));
    115115    PabloAST * const initial = pb.createOr(u8single, valid_pfx, "initial");
    116116    PabloAST * const final = pb.createNot(pb.createOr(nonFinal, u8invalid), "final");
     
    137137    auto & pb = ccc.getBuilder();
    138138   
    139     PabloAST * u16hi_hi_surrogate = ccc.compileCC(makeCC(0xD800, 0xDBFF));    //u16hi_hi_surrogate = [\xD8-\xDB]
    140     PabloAST * u16hi_lo_surrogate = ccc.compileCC(makeCC(0xDC00, 0xDFFF));    //u16hi_lo_surrogate = [\xDC-\xDF]
     139    PabloAST * u16hi_hi_surrogate = ccc.compileCC(makeCC(0xD800, 0xDBFF, &cc::UTF16));    //u16hi_hi_surrogate = [\xD8-\xDB]
     140    PabloAST * u16hi_lo_surrogate = ccc.compileCC(makeCC(0xDC00, 0xDFFF, &cc::UTF16));    //u16hi_lo_surrogate = [\xDC-\xDF]
    141141   
    142142    PabloAST * invalidTemp = pb.createAdvance(u16hi_hi_surrogate, 1, "InvalidTemp");
     
    146146    PabloAST * nonFinal = pb.createAnd(u16hi_hi_surrogate, u16valid, "nonfinal");
    147147
    148     PabloAST * u16single_temp = pb.createOr(ccc.compileCC(makeCC(0x0000, 0xD7FF)), ccc.compileCC(makeCC(0xE000, 0xFFFF)));
     148    PabloAST * u16single_temp = pb.createOr(ccc.compileCC(makeCC(0x0000, 0xD7FF, &cc::UTF16)), ccc.compileCC(makeCC(0xE000, 0xFFFF, &cc::UTF16)));
    149149    PabloAST * u16single = pb.createAnd(u16single_temp, pb.createNot(u16invalid));
    150150
  • icGREP/icgrep-devel/icgrep/kernels/linebreak_kernel.cpp

    r5793 r5797  
    3535    CC_Compiler ccc(this, getInput(0));
    3636    auto & pb = ccc.getBuilder();
    37     PabloAST * LF = ccc.compileCC("LF", makeCC(0x0A), pb);
     37    PabloAST * LF = ccc.compileCC("LF", makeByte(0x0A), pb);
    3838    pb.createAssign(pb.createExtract(getOutput(0), pb.getInteger(0)), LF);
    3939}
     
    5757
    5858    PabloAST * const LF = pb.createExtract(getInput(1), ZERO, "LF");
    59     PabloAST * const CR = ccc.compileCC(makeCC(0x0D));
    60     PabloAST * const LF_VT_FF_CR = ccc.compileCC("LF,VT,FF,CR", makeCC(0x0A, 0x0D), pb);
     59    PabloAST * const CR = ccc.compileCC(makeByte(0x0D));
     60    PabloAST * const LF_VT_FF_CR = ccc.compileCC("LF,VT,FF,CR", makeByte(0x0A, 0x0D), pb);
    6161    Var * const LineBreak = pb.createVar("LineBreak", LF_VT_FF_CR);
    6262
     
    7474
    7575    // Check for Unicode Line Breaks
    76     PabloAST * u8pfx = ccc.compileCC(makeCC(0xC0, 0xFF));
     76    PabloAST * u8pfx = ccc.compileCC(makeByte(0xC0, 0xFF));
    7777    PabloBuilder it = PabloBuilder::Create(pb);
    7878    pb.createIf(u8pfx, it);
    79     PabloAST * u8pfx2 = ccc.compileCC(makeCC(0xC2, 0xDF), it);
    80     PabloAST * u8pfx3 = ccc.compileCC(makeCC(0xE0, 0xEF), it);
     79    PabloAST * u8pfx2 = ccc.compileCC(makeByte(0xC2, 0xDF), it);
     80    PabloAST * u8pfx3 = ccc.compileCC(makeByte(0xE0, 0xEF), it);
    8181
    8282    // Two-byte sequences
    8383    PabloBuilder it2 = PabloBuilder::Create(it);
    8484    it.createIf(u8pfx2, it2);
    85     PabloAST * NEL = it2.createAnd(it2.createAdvance(ccc.compileCC(makeCC(0xC2), it2), 1), ccc.compileCC(makeCC(0x85), it2), "NEL");
     85    PabloAST * NEL = it2.createAnd(it2.createAdvance(ccc.compileCC(makeByte(0xC2), it2), 1), ccc.compileCC(makeByte(0x85), it2), "NEL");
    8686    it2.createAssign(LineBreak, it2.createOr(LineBreak, NEL));
    8787
     
    8989    PabloBuilder it3 = PabloBuilder::Create(it);
    9090    it.createIf(u8pfx3, it3);
    91     PabloAST * E2_80 = it3.createAnd(it3.createAdvance(ccc.compileCC(makeCC(0xE2), it3), 1), ccc.compileCC(makeCC(0x80), it3));
    92     PabloAST * LS_PS = it3.createAnd(it3.createAdvance(E2_80, 1), ccc.compileCC(makeCC(0xA8,0xA9), it3), "LS_PS");
     91    PabloAST * E2_80 = it3.createAnd(it3.createAdvance(ccc.compileCC(makeByte(0xE2), it3), 1), ccc.compileCC(makeByte(0x80), it3));
     92    PabloAST * LS_PS = it3.createAnd(it3.createAdvance(E2_80, 1), ccc.compileCC(makeByte(0xA8,0xA9), it3), "LS_PS");
    9393    it3.createAssign(LineBreak, it3.createOr(LineBreak, LS_PS));
    9494
  • icGREP/icgrep-devel/icgrep/re/re_cc.h

    r5795 r5797  
    5353    friend CC * subtractCC(const CC * a, const CC * b);
    5454    friend CC * intersectCC(const CC * a, const CC * b);
     55    friend CC * makeByte(const codepoint_t codepoint);
     56    friend CC * makeByte(const codepoint_t lo, const codepoint_t hi);
    5557
    5658    CC(const cc::Alphabet * alphabet);
     
    109111}
    110112
    111     inline CC * makeCC(const codepoint_t codepoint, const cc::Alphabet * alphabet = &cc::Unicode) {
     113inline CC * makeCC(const codepoint_t codepoint, const cc::Alphabet * alphabet = &cc::Unicode) {
    112114    return new CC(codepoint, alphabet);
    113115}
     
    143145}
    144146
     147inline CC * makeByte(const codepoint_t codepoint) {
     148    return new CC(codepoint, &cc::Byte);
     149}
     150
     151inline CC * makeByte(const codepoint_t lo, const codepoint_t hi) {
     152    return new CC(lo, hi, &cc::Byte);
     153}
     154   
    145155}
    146156
  • icGREP/icgrep-devel/icgrep/re/re_name.h

    r5781 r5797  
    4646    friend Name * makeZeroWidth(const std::string & name, RE * zerowidth);
    4747    friend Name * makeName(CC * const cc);
    48     friend Name * makeByte(CC * const cc);
    4948    friend Name * makeName(const std::string &, Type);
    5049    friend Name * makeName(const std::string &, const std::string &, Type);
     
    163162}
    164163
    165 inline Name * makeByte(CC * const cc) {
    166     assert(cc->max_codepoint() <= 0xFF);
    167     const std::string name = cc->canonicalName(CC_type::ByteClass);
    168     return new Name(nullptr, 0, name.c_str(), name.length(), Name::Type::Byte, cc);
    169 }
    170    
    171     inline Name * makeCapture(const std::string & name, RE * captured) {
     164inline Name * makeCapture(const std::string & name, RE * captured) {
    172165    return new Name(nullptr, 0, name.c_str(), name.length(), Name::Type::Capture, captured);
    173166}
  • icGREP/icgrep-devel/icgrep/re/re_parser.cpp

    r5792 r5797  
    123123    if ((flags & ModeFlagType::MULTILINE_MODE_FLAG) == 0) return makeZeroWidth("^s");  //single-line mode
    124124    if ((flags & ModeFlagType::UNIX_LINES_MODE_FLAG) != 0) {
    125         return makeNegativeLookBehindAssertion(makeByte(makeCC(makeCC(0, '\n'-1), makeCC('\n'+1, 0xFF))));
     125        return makeNegativeLookBehindAssertion(makeCC(makeByte(0, '\n'-1), makeByte('\n'+1, 0xFF)));
    126126    }
    127127    return makeStart();
     
    130130    if ((flags & ModeFlagType::MULTILINE_MODE_FLAG) == 0) return makeZeroWidth("$s");  //single-line mode
    131131    if ((flags & ModeFlagType::UNIX_LINES_MODE_FLAG) != 0) {
    132         return makeNegativeLookAheadAssertion(makeByte(makeCC(makeCC(0, '\n'-1), makeCC('\n'+1, 0xFF))));
     132        return makeNegativeLookAheadAssertion(makeCC(makeByte(0, '\n'-1), makeByte('\n'+1, 0xFF)));
    133133    }
    134134    return makeEnd();
     
    349349        codepoint_t cp = parse_escaped_codepoint();
    350350        if ((cp >= 0x80) && (cp <= 0xFF)) {
    351             return makeByte(makeCC(cp));
     351            return makeByte(cp);
    352352        }
    353353        else return createCC(cp);
  • icGREP/icgrep-devel/icgrep/u8u16.cpp

    r5757 r5797  
    9292
    9393    // The logic for processing non-ASCII bytes will be embedded within an if-hierarchy.
    94     PabloAST * nonASCII = ccc.compileCC(re::makeCC(0x80, 0xFF));
     94    PabloAST * nonASCII = ccc.compileCC(re::makeByte(0x80, 0xFF));
    9595
    9696    // Builder for the if statement handling all non-ASCII logic
     
    105105
    106106    // Entry condition for 3 or 4 byte sequences: we have a prefix byte in the range 0xE0-0xFF.
    107     PabloAST * pfx34 = ccc.compileCC(re::makeCC(0xE0, 0xFF), nAb);
     107    PabloAST * pfx34 = ccc.compileCC(re::makeByte(0xE0, 0xFF), nAb);
    108108    // Builder for the if statement handling all logic for 3- and 4-byte sequences.
    109109    PabloBuilder p34b = PabloBuilder::Create(nAb);
     
    124124    //
    125125    // Entry condition for 4 byte sequences: we have a prefix byte in the range 0xF0-0xFF.
    126     PabloAST * pfx4 = ccc.compileCC(re::makeCC(0xF0, 0xFF), p34b);
     126    PabloAST * pfx4 = ccc.compileCC(re::makeByte(0xF0, 0xFF), p34b);
    127127    // Builder for the if statement handling all logic for 4-byte sequences only.
    128128    PabloBuilder p4b = PabloBuilder::Create(p34b);
    129129    // Illegal 4-byte sequences
    130     PabloAST * F0 = ccc.compileCC(re::makeCC(0xF0), p4b);
    131     PabloAST * F4 = ccc.compileCC(re::makeCC(0xF4), p4b);
    132     PabloAST * F0_err = p4b.createAnd(p4b.createAdvance(F0, 1), ccc.compileCC(re::makeCC(0x80, 0x8F), p4b));
    133     PabloAST * F4_err = p4b.createAnd(p4b.createAdvance(F4, 1), ccc.compileCC(re::makeCC(0x90, 0xBF), p4b));
    134     PabloAST * F5_FF = ccc.compileCC(re::makeCC(0xF5, 0xFF), p4b);
     130    PabloAST * F0 = ccc.compileCC(re::makeByte(0xF0), p4b);
     131    PabloAST * F4 = ccc.compileCC(re::makeByte(0xF4), p4b);
     132    PabloAST * F0_err = p4b.createAnd(p4b.createAdvance(F0, 1), ccc.compileCC(re::makeByte(0x80, 0x8F), p4b));
     133    PabloAST * F4_err = p4b.createAnd(p4b.createAdvance(F4, 1), ccc.compileCC(re::makeByte(0x90, 0xBF), p4b));
     134    PabloAST * F5_FF = ccc.compileCC(re::makeByte(0xF5, 0xFF), p4b);
    135135
    136136    Var * FX_err = p34b.createVar("FX_err", zeroes);
     
    181181    // Combined logic for 3 and 4 byte sequences
    182182    //
    183     PabloAST * pfx3 = ccc.compileCC(re::makeCC(0xE0, 0xEF), p34b);
     183    PabloAST * pfx3 = ccc.compileCC(re::makeByte(0xE0, 0xEF), p34b);
    184184
    185185    p34b.createAssign(u8scope32, p34b.createAdvance(pfx3, 1));
     
    187187
    188188    // Illegal 3-byte sequences
    189     PabloAST * E0 = ccc.compileCC(re::makeCC(0xE0), p34b);
    190     PabloAST * ED = ccc.compileCC(re::makeCC(0xED), p34b);
    191     PabloAST * E0_err = p34b.createAnd(p34b.createAdvance(E0, 1), ccc.compileCC(re::makeCC(0x80, 0x9F), p34b));
    192     PabloAST * ED_err = p34b.createAnd(p34b.createAdvance(ED, 1), ccc.compileCC(re::makeCC(0xA0, 0xBF), p34b));
     189    PabloAST * E0 = ccc.compileCC(re::makeByte(0xE0), p34b);
     190    PabloAST * ED = ccc.compileCC(re::makeByte(0xED), p34b);
     191    PabloAST * E0_err = p34b.createAnd(p34b.createAdvance(E0, 1), ccc.compileCC(re::makeByte(0x80, 0x9F), p34b));
     192    PabloAST * ED_err = p34b.createAnd(p34b.createAdvance(ED, 1), ccc.compileCC(re::makeByte(0xA0, 0xBF), p34b));
    193193    Var * EX_FX_err = nAb.createVar("EX_FX_err", zeroes);
    194194
     
    217217    Var * u8lastscope = main.createVar("u8lastscope", zeroes);
    218218
    219     PabloAST * pfx2 = ccc.compileCC(re::makeCC(0xC0, 0xDF), nAb);
     219    PabloAST * pfx2 = ccc.compileCC(re::makeByte(0xC0, 0xDF), nAb);
    220220    PabloAST * u8scope22 = nAb.createAdvance(pfx2, 1);
    221221    nAb.createAssign(u8lastscope, nAb.createOr(u8scope22, nAb.createOr(u8scope33, u8scope44)));
    222222    PabloAST * u8anyscope = nAb.createOr(u8lastscope, p34del);
    223223
    224     PabloAST * C0_C1_err = ccc.compileCC(re::makeCC(0xC0, 0xC1), nAb);
    225     PabloAST * scope_suffix_mismatch = nAb.createXor(u8anyscope, ccc.compileCC(re::makeCC(0x80, 0xBF), nAb));
     224    PabloAST * C0_C1_err = ccc.compileCC(re::makeByte(0xC0, 0xC1), nAb);
     225    PabloAST * scope_suffix_mismatch = nAb.createXor(u8anyscope, ccc.compileCC(re::makeByte(0x80, 0xBF), nAb));
    226226    nAb.createAssign(error_mask, nAb.createOr(scope_suffix_mismatch, nAb.createOr(C0_C1_err, EX_FX_err)));
    227     nAb.createAssign(delmask, nAb.createOr(p34del, ccc.compileCC(re::makeCC(0xC0, 0xFF), nAb)));
     227    nAb.createAssign(delmask, nAb.createOr(p34del, ccc.compileCC(re::makeByte(0xC0, 0xFF), nAb)));
    228228
    229229    // The low 3 bits of the high byte of the UTF-16 code unit as well as the high bit of the
     
    241241    //
    242242    //
    243     PabloAST * ASCII = ccc.compileCC(re::makeCC(0x0, 0x7F));
     243    PabloAST * ASCII = ccc.compileCC(re::makeByte(0x0, 0x7F));
    244244    PabloAST * last_byte = main.createOr(ASCII, u8lastscope);
    245245    main.createAssign(u16_lo[1], main.createOr(main.createAnd(ASCII, u8_bits[1]), p234_lo1));
Note: See TracChangeset for help on using the changeset viewer.