Changeset 3961 for icGREP


Ignore:
Timestamp:
Aug 5, 2014, 2:37:07 PM (5 years ago)
Author:
daled
Message:

Multibyte character code classes parsed from hex notation now use named byte sequences.

Location:
icGREP/icgrep-devel/icgrep
Files:
5 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/pbix_compiler.cpp

    r3958 r3961  
    224224            else //Name::unicode
    225225            {
    226                 std::string t_retVal = symgen.gensym("t");
    227                 std::string u_retVal = symgen.gensym("u");
    228                 std::string v_retVal = symgen.gensym("v");
    229                 std::string new_cur_retVal = symgen.gensym("new_cur");
    230 
    231                 cg_state.stmtsl.push_back(new Assign(t_retVal, new Or(new CharClass(m_name_map.find("internal.nonfinal")->second), new CharClass(rep_name->getName()))));
    232                 cg_state.stmtsl.push_back(new Assign(u_retVal, new MatchStar(new Var(cg_state.newsym), new Var(t_retVal))));
    233                 cg_state.stmtsl.push_back(new Assign(v_retVal, new And(new Var(u_retVal), new CharClass(m_name_map.find("internal.initial")->second))));
    234                 cg_state.stmtsl.push_back(new Assign(new_cur_retVal, new And(new Var(u_retVal), new Not(new Var(t_retVal)))));
    235 
    236                 cg_state.stmtsl.push_back(new Assign(gs_retVal, new Or(new Var(v_retVal), new Var(new_cur_retVal))));
     226                cg_state.stmtsl.push_back(new Assign(gs_retVal,
     227                    new And(new MatchStar(new Var(cg_state.newsym), new Or(new CharClass(m_name_map.find("internal.nonfinal")->second),
     228                                                                           new CharClass(rep_name->getName()))), new CharClass(m_name_map.find("internal.initial")->second))));
    237229            }
    238230
  • icGREP/icgrep-devel/icgrep/pe_scanthru.h

    r3955 r3961  
     1/*
     2 *  Copyright (c) 2014 International Characters.
     3 *  This software is licensed to the public under the Open Software License 3.0.
     4 *  icgrep is a trademark of International Characters.
     5 */
     6
    17#ifndef PS_SCANTHRU_H
    28#define PS_SCANTHRU_H
  • icGREP/icgrep-devel/icgrep/re_compiler.cpp

    r3956 r3961  
    7878    name_map.insert(make_pair("LineFeed", cc_name));
    7979
    80     CC* cc_utf8_single_byte = new CC(0x80, 0xBF);
     80    CC* cc_utf8_single_byte = new CC(0x00, 0x7F);
    8181    cc_name = cc_utf8_single_byte->getName();
    8282    re_map.insert(make_pair(cc_name, cc_utf8_single_byte));
  • icGREP/icgrep-devel/icgrep/utf8_encoder.cpp

    r3940 r3961  
    129129    {
    130130        Seq* seq = new Seq();
     131        seq->setType((u8Prefix(hbyte) ? Seq::Byte : Seq::Normal));
    131132        seq->AddREListItem(makeByteClass(hbyte));
    132133        seq->AddREListItem(rangeToUTF8_helper(lo, hi, n+1, hlen));
     
    165166}
    166167
     168bool UTF8_Encoder::u8Prefix(int cp)
     169{
     170    bool retVal = false;
     171
     172    if ((cp >= 0xC2) && (cp <= 0xDF))
     173    {
     174        retVal = true;
     175    }
     176    else if ((cp >= 0xE0) && (cp <= 0xEF))
     177    {
     178        retVal = true;
     179    }
     180    else if ((cp >= 0xF0) && (cp <= 0xF4))
     181    {
     182        retVal = true;
     183    }
     184
     185    return retVal;
     186}
     187
    167188CC* UTF8_Encoder::makeByteRange(int lo, int hi)
    168189{
  • icGREP/icgrep-devel/icgrep/utf8_encoder.h

    r3935 r3961  
    3030    static CC* makeByteRange(int lo, int hi);
    3131
     32    static bool u8Prefix(int cp);
    3233    static int u8len(int cp);
    3334    static int max_of_u8len(int lgth);
Note: See TracChangeset for help on using the changeset viewer.