Changeset 4509 for icGREP/icgrep-devel


Ignore:
Timestamp:
Feb 16, 2015, 2:30:45 PM (5 years ago)
Author:
cameron
Message:

Full UTF-8 validation of Initial/Final streams

File:
1 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/re/re_compiler.cpp

    r4508 r4509  
    103103    PabloAST * u8pfx3 = ccc.compileCC(makeCC(0xE0, 0xEF), it);
    104104    PabloAST * u8pfx4 = ccc.compileCC(makeCC(0xF0, 0xF4), it);
    105     Assign * valid_pfx = it.createAssign("valid_pfx", it.createOr(it.createOr(u8pfx2, u8pfx3), u8pfx4));
    106     PabloAST * u8scope32 = it.createAdvance(u8pfx3, 1);
    107     PabloAST * u8scope42 = it.createAdvance(u8pfx4, 1, "u8scope42");
    108     PabloAST * u8scope43 = it.createAdvance(u8scope42, 1);
    109     mNonFinal = it.createAssign("nonfinal", it.createOr(it.createOr(u8pfx, u8scope32), it.createOr(u8scope42, u8scope43)));
    110     PabloAST * NEL = it.createAnd(it.createAdvance(ccc.compileCC(makeCC(0xC2), it), 1), ccc.compileCC(makeCC(0x85), it));
    111     PabloAST * E2_80 = it.createAnd(it.createAdvance(ccc.compileCC(makeCC(0xE2), it), 1), ccc.compileCC(makeCC(0x80), it));
    112     PabloAST * LS_PS = it.createAnd(it.createAdvance(E2_80, 1), ccc.compileCC(makeCC(0xA8,0xA9), it));
     105    Assign * u8suffix = it.createAssign("u8suffix", ccc.compileCC(makeCC(0x80, 0xBF)));
     106   
     107    //
     108    // Two-byte sequences
     109    PabloBlock & it2 = PabloBlock::Create(it);
     110    Assign * u8scope22 = it2.createAssign("u8scope22", it2.createAdvance(u8pfx2, 1));
     111    Assign * NEL = it2.createAssign("NEL", it2.createAnd(it2.createAdvance(ccc.compileCC(makeCC(0xC2), it2), 1), ccc.compileCC(makeCC(0x85), it2)));
     112    it.createIf(u8pfx2, std::move(std::vector<Assign *>{u8scope22, NEL}), it2);
     113   
     114    //
     115    // Three-byte sequences
     116    PabloBlock & it3 = PabloBlock::Create(it);
     117    Assign * u8scope32 = it3.createAssign("u8scope32", it3.createAdvance(u8pfx3, 1));
     118    PabloAST * u8scope33 = it3.createAdvance(u8pfx3, 2);
     119    Assign * u8scope3X = it3.createAssign("u8scope3X", it3.createOr(u8scope32, u8scope33));
     120    PabloAST * E2_80 = it3.createAnd(it3.createAdvance(ccc.compileCC(makeCC(0xE2), it3), 1), ccc.compileCC(makeCC(0x80), it3));
     121    Assign * LS_PS = it3.createAssign("LS_PS", it3.createAnd(it3.createAdvance(E2_80, 1), ccc.compileCC(makeCC(0xA8,0xA9), it3)));
     122    PabloAST * E0_invalid = it3.createAnd(it3.createAdvance(ccc.compileCC(makeCC(0xE0), it3), 1), ccc.compileCC(makeCC(0x80, 0x9F), it3));
     123    PabloAST * ED_invalid = it3.createAnd(it3.createAdvance(ccc.compileCC(makeCC(0xED), it3), 1), ccc.compileCC(makeCC(0xA0, 0xBF), it3));
     124    Assign * EX_invalid = it3.createAssign("EX_invalid", it3.createOr(E0_invalid, ED_invalid));
     125    it.createIf(u8pfx3, std::move(std::vector<Assign *>{u8scope32, u8scope3X, LS_PS, EX_invalid}), it3);
     126 
     127    //
     128    // Four-byte sequences
     129    PabloBlock & it4 = PabloBlock::Create(it);
     130    PabloAST * u8scope42 = it4.createAdvance(u8pfx4, 1, "u8scope42");
     131    PabloAST * u8scope43 = it4.createAdvance(u8scope42, 1, "u8scope43");
     132    PabloAST * u8scope44 = it4.createAdvance(u8scope43, 1, "u8scope44");
     133    Assign * u8scope4nonfinal = it4.createAssign("u8scope4nonfinal", it4.createOr(u8scope42, u8scope43));
     134    Assign * u8scope4X = it4.createAssign("u8scope4X", it4.createOr(u8scope4nonfinal, u8scope44));
     135    PabloAST * F0_invalid = it4.createAnd(it4.createAdvance(ccc.compileCC(makeCC(0xF0), it4), 1), ccc.compileCC(makeCC(0x80, 0x8F), it4));
     136    PabloAST * F4_invalid = it4.createAnd(it4.createAdvance(ccc.compileCC(makeCC(0xF4), it4), 1), ccc.compileCC(makeCC(0x90, 0xBF), it4));
     137    Assign * FX_invalid = it4.createAssign("FX_invalid", it4.createOr(F0_invalid, F4_invalid));
     138    it.createIf(u8pfx4, std::move(std::vector<Assign *>{u8scope4nonfinal, u8scope4X, FX_invalid}), it4);
     139
     140    //
     141    // Invalid cases
     142    PabloAST * anyscope = it.createOr(u8scope22, it.createOr(u8scope3X, u8scope4X));
     143    PabloAST * legalpfx = it.createOr(it.createOr(u8pfx2, u8pfx3), u8pfx4);
     144    //  Any scope that does not have a suffix byte, and any suffix byte that is not in
     145    //  a scope is a mismatch, i.e., invalid UTF-8.
     146    PabloAST * mismatch = it.createXor(anyscope, u8suffix);
     147    //
     148    PabloAST * EF_invalid = it.createOr(EX_invalid, FX_invalid);
     149    PabloAST * pfx_invalid = it.createXor(u8pfx, legalpfx);
     150    Assign * u8invalid = it.createAssign("u8invalid", it.createOr(pfx_invalid, it.createOr(mismatch, EF_invalid)));
     151    //
     152    //
     153   
     154    Assign * valid_pfx = it.createAssign("valid_pfx", it.createAnd(u8pfx, it.createNot(u8invalid)));
     155    mNonFinal = it.createAssign("nonfinal", it.createAnd(it.createOr(it.createOr(u8pfx, u8scope32), u8scope4nonfinal), it.createNot(u8invalid)));
     156   
    113157    Assign * NEL_LS_PS = it.createAssign("NEL_LS_PS", it.createOr(NEL, LS_PS));
    114     mPB.createIf(u8pfx, std::move(std::vector<Assign *>{valid_pfx, mNonFinal, NEL_LS_PS}), it);
     158    mPB.createIf(u8pfx, std::move(std::vector<Assign *>{u8invalid, valid_pfx, mNonFinal, NEL_LS_PS}), it);
     159   
    115160    PabloAST * LB_chars = mPB.createOr(LF_VT_FF_CR, NEL_LS_PS);
    116 
    117     PabloAST * u8single = ccc.compileCC(makeCC(0x00, 0x7F));
     161    PabloAST * u8single = mPB.createAnd(ccc.compileCC(makeCC(0x00, 0x7F)), mPB.createNot(u8invalid));
    118162    mInitial = mPB.createOr(u8single, valid_pfx, "initial");
    119163    mUnicodeLineBreak = mPB.createAnd(LB_chars, mPB.createNot(mCRLF));  // count the CR, but not CRLF
Note: See TracChangeset for help on using the changeset viewer.