Changeset 4402


Ignore:
Timestamp:
Jan 6, 2015, 8:40:56 AM (4 years ago)
Author:
cameron
Message:

Parsing support for word boundary assertions \b, \B

Location:
icGREP/icgrep-devel
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/QA/greptest.xml

    r4341 r4402  
    4949MazazazazazazazT
    5050</datafile>
     51
    5152
    5253<datafile id="StartEndAlt">
     
    397398<grepcase regexp="h.t" datafile="RangeAltSeqMatchStarKplusWhileNotOptAny" grepcount="3" />
    398399<grepcase regexp="do*?c|ez*?t" datafile="RangeAltSeqMatchStarKplusWhileNotOptAny" grepcount="4"/>
     400<grepcase regexp="^.....\b" datafile="RangeAltSeqMatchStarKplusWhileNotOptAny" grepcount="6"/>
    399401
    400402<grepcase regexp="[]]" datafile="special_characters" grepcount="9"/>
  • icGREP/icgrep-devel/icgrep/re/re_parser.cpp

    r4394 r4402  
    11/*
    2  *  Copyright (c) 2014 International Characters.
     2 *  Copyright (c) 2015 International Characters.
    33 *  This software is licensed to the public under the Open Software License 3.0.
    44 *  icgrep is a trademark of International Characters.
     
    349349
    350350#define bit40(x) (1ULL << ((x) - 0x40))
    351 const uint64_t setEscapeCharacters = bit40('p') | bit40('d') | bit40('w') | bit40('s') | bit40('P') | bit40('D') | bit40('W') | bit40('S');
     351const uint64_t setEscapeCharacters = bit40('b') | bit40('p') | bit40('d') | bit40('w') | bit40('s') |
     352                                     bit40('B') | bit40('P') | bit40('D') | bit40('W') | bit40('S');
    352353
    353354inline bool isSetEscapeChar(char c) {
     
    379380}
    380381
     382RE * makeWordBoundary () {
     383    RE * wordC = makeWordSet();
     384    std::vector<RE *> alts = {makeIntersect(makeLookAheadAssertion(wordC), makeNegativeLookBehindAssertion(wordC)),
     385        makeIntersect(makeNegativeLookAheadAssertion(wordC), makeLookBehindAssertion(wordC))};
     386    return makeAlt(alts.begin(), alts.end());
     387}
     388
     389RE * makeWordNonBoundary () {
     390    RE * wordC = makeWordSet();
     391    std::vector<RE *> alts = {makeIntersect(makeLookAheadAssertion(wordC), makeLookBehindAssertion(wordC)),
     392        makeIntersect(makeNegativeLookAheadAssertion(wordC), makeNegativeLookBehindAssertion(wordC))};
     393    return makeAlt(alts.begin(), alts.end());
     394}
     395
    381396RE * RE_Parser::parse_escaped_set() {
    382397    bool complemented = false;
    383398    RE * s;
    384399    switch (*_cursor) {
     400        case 'b':
     401            ++_cursor;
     402            return makeWordBoundary();
     403        case 'B':
     404            ++_cursor;
     405            return makeWordNonBoundary();
    385406        case 'd':
    386407            ++_cursor;
     
    698719
    699720// A backslash escape was found, and various special cases (back reference,
    700 // quoting with \Q, \E, sets (\p, \P, \d, \D, \w, \W, \s, \S), grapheme
     721// quoting with \Q, \E, sets (\p, \P, \d, \D, \w, \W, \s, \S, \b, \B), grapheme
    701722// cluster \X) have been ruled out.
    702723// It may be one of several possibilities or an error sequence.
    703 // 1. Special control codes (\a, \b, \e, \f, \n, \r, \t, \v)
     724// 1. Special control codes (\a, \e, \f, \n, \r, \t, \v)
    704725// 2. General control codes c[@-_a-z?]
    705726// 3. Restricted octal notation 0 - 0777
     
    714735    switch (*_cursor) {
    715736        case 'a': ++_cursor; return 0x07; // BEL
    716         case 'b': ++_cursor; return 0x08; // BS
    717737        case 'e': ++_cursor; return 0x1B; // ESC
    718738        case 'f': ++_cursor; return 0x0C; // FF
Note: See TracChangeset for help on using the changeset viewer.