source: icGREP/icgrep-devel/icgrep/re/re_parser.h @ 4809

Last change on this file since 4809 was 4809, checked in by nmedfort, 4 years ago

Refactored UCD property resolution.

File size: 3.0 KB
Line 
/*
 *  Copyright (c) 2014 International Characters.
 *  This software is licensed to the public under the Open Software License 3.0.
 *  icgrep is a trademark of International Characters.
 */
6
7#ifndef RE_PARSER_H
8#define RE_PARSER_H
9
10#include <re/re_re.h>
11#include <re/re_any.h>
12#include <re/re_name.h>
13#include <UCD/resolve_properties.h>
14#include <string>
15#include <list>
16#include <memory>
17#include <map>
18
19
20namespace re {
21
// Operator kinds reported by RE_Parser::getCharsetOperator().
// Enumerator values are implicit: 0 (intersectOp) through 9 (emptyOperator),
// in declaration order, so the order below must not be changed casually.
enum CharsetOperatorKind {
    intersectOp,
    setDiffOp,
    ampChar,
    hyphenChar,
    rangeHyphen,
    posixPropertyOpener,
    setOpener,
    setCloser,
    backSlash,
    emptyOperator
};
24
// Parser mode flags. Every enumerator is a distinct power of two, so several
// flags can be OR-ed together into a single unsigned value (presumably a
// ModeFlagSet -- confirm against the parser implementation).
enum ModeFlagType {
    CASE_INSENSITIVE_MODE_FLAG = 1,
    MULTILINE_MODE_FLAG        = 2,    // not currently implemented
    DOTALL_MODE_FLAG           = 4,    // not currently implemented
    IGNORE_SPACE_MODE_FLAG     = 8,    // not currently implemented
    UNIX_LINES_MODE_FLAG       = 16    // not currently implemented
};
31
// Repetition-bound limits. Presumably caps on the lower and upper counts of a
// bounded repetition; where they are enforced is not visible in this header.
const int MAX_REPETITION_LOWER_BOUND = 1024;
const int MAX_REPETITION_UPPER_BOUND = 2048;
34
// Unsigned integer type used for the initialFlags argument of
// RE_Parser::parse() and for the parser's fModeFlagSet member.
using ModeFlagSet = unsigned;
36
37class RE_Parser
38{
39public:
40
41    static RE * parse(const std::string &input_string, ModeFlagSet initialFlags);
42
43private:
44
45    using NameMap = std::map<std::pair<std::string, std::string>, re::Name *>;
46
47    typedef std::string::const_iterator cursor_t;
48
49    RE_Parser(const std::string & regular_expression);
50
51    RE_Parser(const std::string & regular_expression, ModeFlagSet initialFlags);
52
53    RE * parse_RE();
54
55    RE * parse_alt();
56
57    RE * parse_seq();
58
59    RE * parse_next_item();
60
61    RE * parse_group();
62
63    RE * extend_item(RE * re);
64
65    void parse_range_bound(int & lo_codepoint, int & hi_codepoint);
66
67    unsigned parse_int();
68
69    RE * parse_escaped();
70
71    RE * parseEscapedSet();
72
73    codepoint_t parse_utf8_codepoint();
74
75    Name * parsePropertyExpression();
76
77    CC * parseNamePatternExpression();
78
79    RE * makeComplement(RE * s);
80    RE * makeWordBoundary();
81    RE * makeWordNonBoundary();
82    Name * makeDigitSet();
83    Name * makeAlphaNumeric();
84    Name * makeWhitespaceSet();
85    Name * makeWordSet();
86
87    Name * createName(const std::string value);
88    Name * createName(const std::string prop, const std::string value);
89
90    CharsetOperatorKind getCharsetOperator();
91
92    RE * parse_charset();
93
94    codepoint_t parse_codepoint();
95
96    codepoint_t parse_escaped_codepoint();
97
98    codepoint_t parse_hex_codepoint(int mindigits, int maxdigits);
99
100    codepoint_t parse_octal_codepoint(int mindigits, int maxdigits);
101
102    inline void throw_incomplete_expression_error_if_end_of_stream() const;
103
104    // CC insertion dependent on case-insensitive flag.
105    CC * build_CC(codepoint_t cp);
106
107    void CC_add_codepoint(CC * cc, codepoint_t cp);
108
109    void CC_add_range(CC * cc, codepoint_t lo, codepoint_t hi);
110
111    static std::string canonicalize(const cursor_t begin, const cursor_t end);
112
113private:
114
115    cursor_t                    _cursor;
116    const cursor_t              _end;
117    ModeFlagSet                 fModeFlagSet;
118    bool                        fNested;
119    NameMap                     mNameMap;
120};
121
122}
123
124#endif // RE_PARSER_H
Note: See TracBrowser for help on using the repository browser.