Changeset 3935


Ignore:
Timestamp:
Jul 31, 2014, 11:07:49 AM (5 years ago)
Author:
daled
Message:

The parser is now able to parse unicode categories.

Location:
icGREP/icgrep-devel/icgrep
Files:
18 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/llvm_gen.cpp

    r3914 r3935  
    2424    m_lf_ccname = lf_ccname;
    2525    mBits = bits;
    26     mInWhile = false;
    2726}
    2827
  • icGREP/icgrep-devel/icgrep/llvm_gen.h

    r3914 r3935  
    131131    std::map<std::string, Value*> mMarkerMap;
    132132
    133     bool        mInWhile;
    134 
    135133    int         mCarryQueueIdx;
    136134    Value*      mptr_carry_q;
  • icGREP/icgrep-devel/icgrep/re_cc.cpp

    r3914 r3935  
    2323{
    2424    gensym_name();
    25     insert_range(lo_codepoint, hi_codepoint);
    26 }
    27 
    28 CC::CC(std::string name, int codepoint)
    29 {
    30     mName = name;
    31     insert1(codepoint);
    32 }
    33 
    34 CC::CC(std::string name, int lo_codepoint, int hi_codepoint)
    35 {
    36     mName = name;
    3725    insert_range(lo_codepoint, hi_codepoint);
    3826}
  • icGREP/icgrep-devel/icgrep/re_cc.h

    r3914 r3935  
    2929    CC(int codepoint);
    3030    CC(int lo_codepoint, int hi_codepoint);
    31     CC(std::string name, int codepoint);
    32     CC(std::string name, int lo_codepoint, int hi_codepoint);
    3331    CC(CC* cc1, CC* cc2);
    3432    ~CC();
     
    5755
    5856    std::vector<CharSetItem> mSparceCharSet;
    59     std::string mName;
    6057    std::string mId;
    6158};
  • icGREP/icgrep-devel/icgrep/re_compiler.cpp

    r3914 r3935  
    3636
    3737    //Print to the terminal the AST that was generated by the parser before adding the UTF encoding:
    38     //std::cout << "\nParser:\n" + Printer_RE::PrintRE(re_ast) + "\n" << std::endl;
     38    std::cout << "\nParser:\n" + Printer_RE::PrintRE(re_ast) + "\n" << std::endl;
    3939
    4040    //Add the UTF encoding.
     
    5353
    5454    //Print to the terminal the AST that was generated by the utf8 encoder.
    55     //std::cout << "\nUTF8-encoder:\n" + Printer_RE::PrintRE(re_ast) + "\n" << std::endl;
     55    std::cout << "\nUTF8-encoder:\n" + Printer_RE::PrintRE(re_ast) + "\n" << std::endl;
    5656
    5757    //Optimization passes to simplify the AST.
     
    5959
    6060    //Print to the terminal the AST that was generated by the simplifier.
    61     //std::cout << "\nSimplifier:\n" + Printer_RE::PrintRE(re_ast) + "\n" << std::endl;
     61    std::cout << "\nSimplifier:\n" + Printer_RE::PrintRE(re_ast) + "\n" << std::endl;
    6262
    6363    //Map all of the unique character classes in order to reduce redundancy.
     
    6666
    6767    //Print to the terminal the AST with the reduced REs.
    68     //std::cout << "\nReducer:\n" + Printer_RE::PrintRE(re_ast) + "\n" << std::endl;
     68    std::cout << "\nReducer:\n" + Printer_RE::PrintRE(re_ast) + "\n" << std::endl;
    6969
    7070    //Build our list of predefined characters.
    7171    std::list<CC*> predefined_characters;
    72     CC* cc_lf = new CC("lex.cclf", '\n');
     72    CC* cc_lf = new CC('\n');
    7373    std::string lf_ccname = cc_lf->getName();
    7474    re_map.insert(make_pair(lf_ccname, cc_lf));
  • icGREP/icgrep-devel/icgrep/re_name.cpp

    r3917 r3935  
    11#include "re_name.h"
     2
     3Name::Name()
     4{
     5    mName = "";
     6}
    27
    38Name::Name(std::string name)
     
    813Name::~Name(){}
    914
     15void Name::setName(std::string name)
     16{
     17    mName = name;
     18}
     19
    1020std::string Name::getName()
    1121{
  • icGREP/icgrep-devel/icgrep/re_name.h

    r3917 r3935  
    99{
    1010public:
     11    Name();
    1112    Name(std::string name);
     13    void setName(std::string name);
    1214    std::string getName();
    1315    ~Name();
  • icGREP/icgrep-devel/icgrep/re_nullable.cpp

    r3917 r3935  
    1818        std::list<RE*>* t1_list = new std::list<RE*>();
    1919        t1_list->assign(re_list->begin(), re_list->end());
    20 
    21         return new Seq(removeNullableSeqPrefix(t1_list));
     20        Seq* new_seq = new Seq(removeNullableSeqPrefix(t1_list));
     21        new_seq->setType(re_seq->getType());
     22
     23        return new_seq;
    2224    }
    2325    else if (Alt* re_alt = dynamic_cast<Alt*>(re))
     
    4648            seq_lst->push_back(removeNullablePrefix(re_rep->getRE()));
    4749
    48             return RE_Simplifier::mkSeq(seq_lst);
     50            return RE_Simplifier::mkSeq(Seq::Normal, seq_lst);
    4951        }
    5052        else
     
    8991        std::list<RE*>* t1_list = new std::list<RE*>();
    9092        t1_list->assign(re_list->begin(), re_list->end());
    91 
    92         return new Seq(removeNullableSeqSuffix(t1_list));
     93        Seq* new_seq = new Seq(removeNullableSeqSuffix(t1_list));
     94        new_seq->setType(re_seq->getType());
     95        return new_seq;
    9396    }
    9497    else if (Alt* re_alt = dynamic_cast<Alt*>(re))
     
    118121            seq_lst->push_back(new Rep(re_rep->getRE(), re_rep->getLB()-1, re_rep->getLB()-1));
    119122
    120             return RE_Simplifier::mkSeq(seq_lst);
     123            return RE_Simplifier::mkSeq(Seq::Normal, seq_lst);
    121124        }
    122125        else
  • icGREP/icgrep-devel/icgrep/re_parser.cpp

    r3934 r3935  
    329329            return cc_retVal;
    330330        }
     331        else if (s.operator [](1) == 'p')
     332        {
     333            return cc_retVal = parse_unicode_category(s.substr(2, s.length() - 2));
     334        }
    331335        else
    332336        {
     
    441445}
    442446
     447parse_result_retVal RE_Parser::parse_unicode_category(std::string s)
     448{
     449    parse_result_retVal result_retVal;
     450
     451    if (s.operator [](0) == '{')
     452    {
     453        Name* name = new Name();
     454        result_retVal = parse_unicode_category1(s.substr(1,1), s.substr(2, s.length() - 2), name);
     455    }
     456    else
     457    {
     458        result_retVal.result = new ParseFailure("Incorrect Unicode character class format!");
     459        result_retVal.remaining = "";
     460    }
     461
     462    return result_retVal;
     463}
     464
     465parse_result_retVal RE_Parser::parse_unicode_category1(std::string character, std::string s, Name* name_sofar)
     466{
     467    parse_result_retVal unicode_cat1_retVal;
     468
     469    if (s.length() == 0)
     470    {
     471        delete name_sofar;
     472        unicode_cat1_retVal.result = new ParseFailure("Unclosed Unicode character class!");
     473        unicode_cat1_retVal.remaining = "";
     474    }
     475    else if (s.operator [](0) == '}')
     476    {
     477        name_sofar->setName(name_sofar->getName() + character);
     478        if (isValidUnicodeCategoryName(name_sofar))
     479        {
     480            unicode_cat1_retVal.result = new ParseSuccess(name_sofar);
     481            unicode_cat1_retVal.remaining = s.substr(1, s.length() - 1);
     482        }
     483        else
     484        {
     485            unicode_cat1_retVal.result = new ParseFailure("Unknown Unicode character class!");
     486            unicode_cat1_retVal.remaining = s.substr(1, s.length() - 1);
     487        }
     488    }
     489    else
     490    {
     491        name_sofar->setName(name_sofar->getName() + character);
     492        unicode_cat1_retVal = parse_unicode_category1(s.substr(0,1), s.substr(1, s.length() - 1), name_sofar);
     493    }
     494
     495    return unicode_cat1_retVal;
     496}
     497
    443498parse_result_retVal RE_Parser::parse_cc_body(std::string s)
    444499{
     
    573628        int hexval_sofar = 0;
    574629        int_retVal = parse_hex_body(hexval_sofar, s.substr(1, s.length() - 1));
    575 
    576630    }
    577631    else
     
    688742}
    689743
    690 
    691 
    692 
    693 
    694 
    695 
    696 
    697 
    698 
    699 
    700 
    701 
    702 
    703 
    704 
    705 
    706 
    707 
    708 
    709 
    710 
    711 
     744bool RE_Parser::isValidUnicodeCategoryName(Name* name)
     745{
     746    std::string cat_name = name->getName();
     747
     748    if (cat_name == "Cc")
     749        return true;
     750    else if (cat_name == "Cf")
     751        return true;
     752    else if (cat_name == "Cn")
     753        return true;
     754    else if (cat_name == "Co")
     755        return true;
     756    else if (cat_name == "Cs")
     757        return true;
     758    else if (cat_name == "C")
     759        return true;
     760    else if (cat_name == "Ll")
     761        return true;
     762    else if (cat_name == "Lt")
     763        return true;
     764    else if (cat_name == "Lu")
     765        return true;
     766    else if (cat_name == "L&")
     767        return true;
     768    else if (cat_name == "Lc")
     769        return true;
     770    else if (cat_name == "Lm")
     771        return true;
     772    else if (cat_name == "Lo")
     773        return true;
     774    else if (cat_name == "L")
     775        return true;
     776    else if (cat_name == "Mc")
     777        return true;
     778    else if (cat_name == "Me")
     779        return true;
     780    else if (cat_name == "Mn")
     781        return true;
     782    else if (cat_name == "M")
     783        return true;
     784    else if (cat_name == "Nd")
     785        return true;
     786    else if (cat_name == "Nl")
     787        return true;
     788    else if (cat_name == "No")
     789        return true;
     790    else if (cat_name == "N")
     791        return true;
     792    else if (cat_name == "Pc")
     793        return true;
     794    else if (cat_name == "Pd")
     795        return true;
     796    else if (cat_name == "Pe")
     797        return true;
     798    else if (cat_name == "Pf")
     799        return true;
     800    else if (cat_name == "Pi")
     801        return true;
     802    else if (cat_name == "Po")
     803        return true;
     804    else if (cat_name == "Ps")
     805        return true;
     806    else if (cat_name == "P")
     807        return true;
     808    else if (cat_name == "Sc")
     809        return true;
     810    else if (cat_name == "Sk")
     811        return true;
     812    else if (cat_name == "Sm")
     813        return true;
     814    else if (cat_name == "So")
     815        return true;
     816    else if (cat_name == "S")
     817        return true;
     818    else if (cat_name == "Zl")
     819        return true;
     820    else if (cat_name == "Zp")
     821        return true;
     822    else if (cat_name == "Zs")
     823        return true;
     824    else if (cat_name == "Z")
     825        return true;
     826    else
     827        return false;
     828}
     829
     830
     831
     832
     833
     834
     835
     836
     837
     838
     839
     840
     841
     842
     843
     844
     845
     846
     847
     848
     849
  • icGREP/icgrep-devel/icgrep/re_parser.h

    r3934 r3935  
    6464    static parse_result_retVal parse_utf8_suffix_byte(int suffix_byte_num, std::string s, Seq* seq_sofar);
    6565
     66    static parse_result_retVal parse_unicode_category(std::string s);
     67    static parse_result_retVal parse_unicode_category1(std::string character, std::string s, Name* name_sofar);
     68    static bool isValidUnicodeCategoryName(Name* name);
     69
    6670    static parse_int_retVal parse_hex(std::string s);
    6771    static parse_int_retVal parse_hex_body(int i, std::string s);
  • icGREP/icgrep-devel/icgrep/re_reducer.cpp

    r3917 r3935  
    2020    else if (Seq* re_seq = dynamic_cast<Seq*>(re))
    2121    {
    22         std::list<RE*> re_list;
    23         std::list<RE*>::iterator it;
     22/*
     23        if (re_seq->getType() == Seq::Byte)
     24        {
     25            //If this is a sequence of byte classes then this is a multibyte sequence for a Unicode character class.
     26            std::string seqname = re_seq->getName();
     27            re_map.insert(make_pair(seqname, re_seq));
     28            retVal = new Name(seqname);
     29        }
     30        else
     31        {
     32*/
     33            std::list<RE*> re_list;
     34            std::list<RE*>::iterator it;
    2435
    25         for (it = re_seq->GetREList()->begin(); it != re_seq->GetREList()->end(); ++it)
    26         {
    27             re_list.push_front(reduce(*it, re_map));
    28         }
     36            for (it = re_seq->GetREList()->begin(); it != re_seq->GetREList()->end(); ++it)
     37            {
     38                re_list.push_front(reduce(*it, re_map));
     39            }
    2940
    30         retVal = new Seq(&re_list);
     41            retVal = new Seq(&re_list);
     42//        }
    3143    }
    3244    else if (Rep* re_rep = dynamic_cast<Rep*>(re))
     
    4254        retVal = new Name(ccname);
    4355    }
     56    else if (Name* re_name = dynamic_cast<Name*>(re))
     57    {
     58        retVal = new Name(re_name->getName());
     59    }
    4460    else if (Start* re_start = dynamic_cast<Start*>(re))
    4561    {
  • icGREP/icgrep-devel/icgrep/re_reducer.h

    r3917 r3935  
    2323
    2424#endif // RE_REDUCER_H
    25 
    26 /*
    27 
    28 #ifndef RE_SIMPLIFIER_H
    29 #define RE_SIMPLIFIER_H
    30 
    31 //Regular Expressions
    32 #include "re_re.h"
    33 #include "re_cc.h"
    34 #include "re_start.h"
    35 #include "re_end.h"
    36 #include "re_seq.h"
    37 #include "re_alt.h"
    38 #include "re_rep.h"
    39 
    40 #include <algorithm>
    41 #include <list>
    42 
    43 class RE_Simplifier
    44 {
    45 public:
    46     static RE* mkSeq(std::list<RE*>* re_list);
    47     static RE* mkRep(RE* re, int lb2, int ub2);
    48     static RE* mkAlt(std::list<RE*>* re_list);
    49     static RE* simplify(RE* re);
    50 private:
    51     static std::list<RE*>* mkSeqList(std::list<RE*>* re_list);
    52     static std::list<RE*>* mkSeqList_helper(std::list<RE*>* ret_list, std::list<RE*>* re_list);
    53     static std::list<RE*>* mkAltList(std::list<RE*>* re_list);
    54     static std::list<RE*>* mkAltList_helper(std::list<RE*>* ret_list, std::list<RE*>* re_list);
    55     static int ubCombine(int h1, int h2);
    56 };
    57 
    58 #endif // RE_SIMPLIFIER_H
    59 
    60 */
  • icGREP/icgrep-devel/icgrep/re_seq.cpp

    r3934 r3935  
    3939}
    4040
     41std::string Seq::getName()
     42{
     43    if (mType == Seq::Byte)
     44    {
     45        std::string name = "Seq";
     46
     47        std::list<RE*> re_list;
     48        std::list<RE*>::iterator it = mList->begin();
     49
     50        for (it = mList->begin(); it != mList->end(); ++it)
     51        {
     52            if (CC* seq_cc = dynamic_cast<CC*>(*it))
     53            {
     54                name += seq_cc->getName();
     55            }
     56            else
     57            {
     58                return "Bad Byte Sequence!";
     59            }
     60        }
     61
     62        return name;
     63    }
     64    else
     65    {
     66        return "Unnamed Sequence";
     67    }
     68}
     69
    4170std::list<RE*>* Seq::GetREList()
    4271{
  • icGREP/icgrep-devel/icgrep/re_seq.h

    r3934 r3935  
    99
    1010#include "re_re.h"
     11#include "re_cc.h"
    1112#include <list>
     13#include <sstream>
     14#include <utility>
    1215
    1316class Seq : public RE
     
    2124    std::list<RE*>* GetREList();
    2225    void AddREListItem(RE *re);
     26    std::string getName();
    2327    Type getType();
    2428    void setType(Type type);
  • icGREP/icgrep-devel/icgrep/re_simplifier.cpp

    r3917 r3935  
    2828        }
    2929
    30         retVal = mkSeq(&re_list);
     30        retVal = mkSeq(re_seq->getType(), &re_list);
    3131    }
    3232    else if (CC* re_cc = dynamic_cast<CC*>(re))
     
    3434        retVal = re_cc;
    3535    }
     36    else if (Name* re_name = dynamic_cast<Name*>(re))
     37    {
     38        retVal = new Name(re_name->getName());
     39    }
    3640    else if (Rep* re_rep = dynamic_cast<Rep*>(re))
    3741    {
     
    5054}
    5155
    52 RE* RE_Simplifier::mkSeq(std::list<RE*>* re_list)
     56RE* RE_Simplifier::mkSeq(Seq::Type type, std::list<RE*>* re_list)
    5357{
    5458    /*
     
    6670        if (t2_list->size() > 1)
    6771        {
    68             return new Seq(t2_list);
     72            Seq* new_seq = new Seq(t2_list);
     73            new_seq->setType(type);
     74            return new_seq;
    6975        }
    7076        else
  • icGREP/icgrep-devel/icgrep/re_simplifier.h

    r3917 r3935  
    55#include "re_re.h"
    66#include "re_cc.h"
     7#include "re_name.h"
    78#include "re_start.h"
    89#include "re_end.h"
     
    1718{
    1819public:
    19     static RE* mkSeq(std::list<RE*>* re_list);
     20    static RE* mkSeq(Seq::Type type, std::list<RE*>* re_list);
    2021    static RE* mkRep(RE* re, int lb2, int ub2);
    2122    static RE* mkAlt(std::list<RE*>* re_list);
  • icGREP/icgrep-devel/icgrep/utf8_encoder.cpp

    r3934 r3935  
    7272        }
    7373    }
     74    else if (Name* re_name = dynamic_cast<Name*>(re))
     75    {
     76        retVal = new Name(re_name->getName());
     77    }
    7478    else if (Start* re_start = dynamic_cast<Start*>(re))
    7579    {
  • icGREP/icgrep-devel/icgrep/utf8_encoder.h

    r3914 r3935  
    1111#include "re_re.h"
    1212#include "re_cc.h"
     13#include "re_name.h"
    1314#include "re_start.h"
    1415#include "re_end.h"
Note: See TracChangeset for help on using the changeset viewer.