Changeset 3934 for icGREP


Ignore:
Timestamp:
Jul 30, 2014, 9:40:39 AM (5 years ago)
Author:
daled
Message:

icGREP now accepts UTF-8-encoded characters in a regular expression.

Location:
icGREP/icgrep-devel/icgrep
Files:
5 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/re_parser.cpp

    r3914 r3934  
    381381    }
    382382
     383    int next_byte = (s.operator [](0) & 0xFF);
     384    if ((next_byte >= 0xC0) && (next_byte <= 0xDF))
     385    {       
     386        cc_retVal = parse_utf8_bytes(1, s);
     387    }
     388    else if ((next_byte >= 0xE0) && (next_byte <= 0xEF))
     389    {
     390        cc_retVal = parse_utf8_bytes(2, s);
     391    }
     392    else if((next_byte >= 0xF0) && (next_byte <= 0xFF))
     393    {
     394        cc_retVal = parse_utf8_bytes(3, s);
     395    }
     396
    383397    return cc_retVal;
     398}
     399
     400parse_result_retVal RE_Parser::parse_utf8_bytes(int suffix_count, std::string s)
     401{
     402    CC* cc = new CC((s.operator [](0) & 0xFF));
     403    Seq* seq = new Seq();
     404    seq->setType(Seq::Byte);
     405    seq->AddREListItem(cc);
     406
     407    return parse_utf8_suffix_byte(suffix_count, s.substr(1, s.length() - 1), seq);
     408}
     409
     410parse_result_retVal RE_Parser::parse_utf8_suffix_byte(int suffix_byte_num, std::string s, Seq *seq_sofar)
     411{
     412    parse_result_retVal result_RetVal;
     413
     414    if (suffix_byte_num == 0)
     415    {
     416        result_RetVal.result = new ParseSuccess(seq_sofar);
     417        result_RetVal.remaining = s;
     418    }
     419    else if (s.length() == 0)
     420    {
     421        result_RetVal.result = new ParseFailure("Invalid UTF-8 encoding!");
     422        result_RetVal.remaining = "";
     423    }
     424    else
     425    {
     426        if ((s.operator [](0) & 0xC0) == 0x80)
     427        {
     428            CC* cc = new CC((s.operator [](0) & 0xFF));
     429            seq_sofar->AddREListItem(cc);
     430            suffix_byte_num--;
     431            result_RetVal = parse_utf8_suffix_byte(suffix_byte_num, s.substr(1, s.length() - 1), seq_sofar);
     432        }
     433        else
     434        {
     435            result_RetVal.result = new ParseFailure("Invalid UTF-8 encoding!");
     436            result_RetVal.remaining = s;
     437        }
     438    }
     439
     440    return result_RetVal;
    384441}
    385442
  • icGREP/icgrep-devel/icgrep/re_parser.h

    r3914 r3934  
    6161    static parse_result_retVal parse_cc_body0(std::string s, CC* cc_sofar);
    6262    static parse_result_retVal parse_cc_body1(int chr, std::string s, CC* cc_sofar);
     63    static parse_result_retVal parse_utf8_bytes(int suffix_count, std::string s);
     64    static parse_result_retVal parse_utf8_suffix_byte(int suffix_byte_num, std::string s, Seq* seq_sofar);
    6365
    6466    static parse_int_retVal parse_hex(std::string s);
  • icGREP/icgrep-devel/icgrep/re_seq.cpp

    r3914 r3934  
    1010{
    1111    mList = new std::list<RE*>();
     12    mType = Seq::Normal;
    1213}
    1314
     
    1920    mList->assign(it, lst->end());
    2021    mList->reverse();
     22    mType = Seq::Normal;
    2123}
    2224
     
    2830    mList->assign(it, lst.end());
    2931    mList->reverse();
     32    mType = Seq::Normal;
    3033}
    3134
     
    4649}
    4750
     51Seq::Type Seq::getType()
     52{
     53    return mType;
     54}
     55
     56void Seq::setType(Seq::Type type)
     57{
     58    mType = type;
     59}
  • icGREP/icgrep-devel/icgrep/re_seq.h

    r3914 r3934  
    1414{
    1515public:
     16    typedef enum {Normal,Byte} Type;
    1617    Seq();
    1718    Seq(std::list<RE*>* lst);
     
    2021    std::list<RE*>* GetREList();
    2122    void AddREListItem(RE *re);
     23    Type getType();
     24    void setType(Type type);
    2225private:
    2326    std::list<RE*>* mList;
     27    Type mType;
    2428};
    2529
  • icGREP/icgrep-devel/icgrep/utf8_encoder.cpp

    r3916 r3934  
    2626    else if (Seq* re_seq = dynamic_cast<Seq*>(re))
    2727    {
     28
    2829        std::list<RE*> re_list;
    2930        std::list<RE*>::iterator it;
     
    3132        for (it = re_seq->GetREList()->begin(); it != re_seq->GetREList()->end(); ++it)
    3233        {
    33             re_list.push_front(toUTF8(*it));
    34         }
    35 
    36         retVal = new Seq(&re_list);
     34            //If this is a previously encoded Unicode byte sequence.
     35            if (re_seq->getType() == Seq::Byte)
     36            {
     37                if (CC* seq_cc = dynamic_cast<CC*>(*it))
     38                {
     39                    CharSetItem item = seq_cc->getItems().front();
     40                    re_list.push_front(new CC(item.lo_codepoint));
     41                }
     42            }
     43            else
     44            {
     45                re_list.push_front(toUTF8(*it));
     46            }
     47        }
     48
     49        Seq* new_seq = new Seq(&re_list);
     50        new_seq->setType(re_seq->getType());
     51        retVal = new_seq;
    3752    }
    3853    else if (Rep* re_rep = dynamic_cast<Rep*>(re))
Note: See TracChangeset for help on using the changeset viewer.