Ignore:
Timestamp:
Jul 30, 2014, 9:40:39 AM (5 years ago)
Author:
daled
Message:

icGREP now accepts UTF-8-encoded characters in regular expressions.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/re_parser.cpp

    r3914 r3934  
    381381    }
    382382
     383    int next_byte = (s.operator [](0) & 0xFF);
     384    if ((next_byte >= 0xC0) && (next_byte <= 0xDF))
     385    {       
     386        cc_retVal = parse_utf8_bytes(1, s);
     387    }
     388    else if ((next_byte >= 0xE0) && (next_byte <= 0xEF))
     389    {
     390        cc_retVal = parse_utf8_bytes(2, s);
     391    }
     392    else if((next_byte >= 0xF0) && (next_byte <= 0xFF))
     393    {
     394        cc_retVal = parse_utf8_bytes(3, s);
     395    }
     396
    383397    return cc_retVal;
     398}
     399
     400parse_result_retVal RE_Parser::parse_utf8_bytes(int suffix_count, std::string s)
     401{
     402    CC* cc = new CC((s.operator [](0) & 0xFF));
     403    Seq* seq = new Seq();
     404    seq->setType(Seq::Byte);
     405    seq->AddREListItem(cc);
     406
     407    return parse_utf8_suffix_byte(suffix_count, s.substr(1, s.length() - 1), seq);
     408}
     409
     410parse_result_retVal RE_Parser::parse_utf8_suffix_byte(int suffix_byte_num, std::string s, Seq *seq_sofar)
     411{
     412    parse_result_retVal result_RetVal;
     413
     414    if (suffix_byte_num == 0)
     415    {
     416        result_RetVal.result = new ParseSuccess(seq_sofar);
     417        result_RetVal.remaining = s;
     418    }
     419    else if (s.length() == 0)
     420    {
     421        result_RetVal.result = new ParseFailure("Invalid UTF-8 encoding!");
     422        result_RetVal.remaining = "";
     423    }
     424    else
     425    {
     426        if ((s.operator [](0) & 0xC0) == 0x80)
     427        {
     428            CC* cc = new CC((s.operator [](0) & 0xFF));
     429            seq_sofar->AddREListItem(cc);
     430            suffix_byte_num--;
     431            result_RetVal = parse_utf8_suffix_byte(suffix_byte_num, s.substr(1, s.length() - 1), seq_sofar);
     432        }
     433        else
     434        {
     435            result_RetVal.result = new ParseFailure("Invalid UTF-8 encoding!");
     436            result_RetVal.remaining = s;
     437        }
     438    }
     439
     440    return result_RetVal;
    384441}
    385442
Note: See TracChangeset for help on using the changeset viewer.