source: icGREP/icgrep-devel/icgrep/re/re_parser.h @ 4671

Last change on this file since 4671 was 4671, checked in by nmedfort, 4 years ago

Moved responsibility of handling 'special cases of Unicode TR #18' and 'compatibility properties of UTR #18 Annex C' into RE_Parser.

File size: 2.9 KB
Line 
1/*
2 *  Copyright (c) 2014 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#ifndef RE_PARSER_H
8#define RE_PARSER_H
9
10#include "re_re.h"
11#include "re_any.h"
12#include "re_name.h"
13
14#include <string>
15#include <list>
16#include <memory>
17#include <map>
18
19namespace re {
20       
/// Operator kinds reported by RE_Parser::getCharsetOperator().
/// Enumerator values are implicit: 0..9 in declaration order.
/// NOTE(review): the exact input text each kind corresponds to is not visible
/// in this header; the per-enumerator meaning is suggested by the names only.
enum CharsetOperatorKind {
    intersectOp,
    setDiffOp,
    ampChar,
    hyphenChar,
    rangeHyphen,
    posixPropertyOpener,
    setOpener,
    setCloser,
    backSlash,
    emptyOperator
};
23
/// Parser mode flags. Each enumerator occupies its own bit, so several flags
/// can be combined in a single unsigned value.
enum ModeFlagType {
    CASE_INSENSITIVE_MODE_FLAG = 1 << 0,
    MULTILINE_MODE_FLAG        = 1 << 1,   // not currently implemented
    DOTALL_MODE_FLAG           = 1 << 2,   // not currently implemented
    IGNORE_SPACE_MODE_FLAG     = 1 << 3,   // not currently implemented
    UNIX_LINES_MODE_FLAG       = 1 << 4    // not currently implemented
};
30   
/// Largest values accepted for repetition bounds.
/// NOTE(review): where these limits are enforced is not visible in this
/// header — presumably when a bounded repetition is parsed; confirm in the
/// implementation file.
constexpr int MAX_REPETITION_LOWER_BOUND = 1024;
constexpr int MAX_REPETITION_UPPER_BOUND = 2048;
33
/// Unsigned integer type used to carry a set of mode flags.
using ModeFlagSet = unsigned;
35   
36class RE_Parser
37{
38public:
39
40    static RE * parse(const std::string &input_string, ModeFlagSet initialFlags);
41
42private:
43
44    using NameMap = std::map<std::pair<std::string, std::string>, re::Name *>;
45
46    typedef std::string::const_iterator cursor_t;
47
48    RE_Parser(const std::string & regular_expression);
49   
50    RE_Parser(const std::string & regular_expression, ModeFlagSet initialFlags);
51
52    RE * parse_RE();
53   
54    RE * parse_alt();
55   
56    RE * parse_seq();
57
58    RE * parse_next_item();
59   
60    RE * parse_group();
61   
62    RE * extend_item(RE * re);
63
64    void parse_range_bound(int & lo_codepoint, int & hi_codepoint);
65
66    unsigned parse_int();
67   
68    RE * parse_escaped();
69
70    RE * parseEscapedSet();
71
72    codepoint_t parse_utf8_codepoint();
73
74    Name * parsePropertyExpression();
75       
76    RE * makeComplement(RE * s);
77    RE * makeWordBoundary ();
78    RE * makeWordNonBoundary ();
79    Name * makeDigitSet();
80    Name * makeAlphaNumeric();
81    Name * makeWhitespaceSet();
82    Name * makeWordSet();
83    Name * resolvePropertyExpression(std::string nameValue);
84
85    Name * resolvePropertyExpression(std::string namespaceValue, std::string nameValue);
86
87        CharsetOperatorKind getCharsetOperator();
88
89    RE * parse_charset();
90
91    codepoint_t parse_codepoint();
92
93    codepoint_t parse_escaped_codepoint();
94
95    codepoint_t parse_hex_codepoint(int mindigits, int maxdigits);
96
97    codepoint_t parse_octal_codepoint(int mindigits, int maxdigits);
98
99    inline void throw_incomplete_expression_error_if_end_of_stream() const;
100   
101    // CC insertion dependent on case-insensitive flag.
102    CC * build_CC(codepoint_t cp);
103   
104    void CC_add_codepoint(CC * cc, codepoint_t cp);
105   
106    void CC_add_range(CC * cc, codepoint_t lo, codepoint_t hi);
107
108    static std::string canonicalize(const cursor_t begin, const cursor_t end);
109
110private:
111
112    cursor_t                    _cursor;
113    const cursor_t              _end;
114    ModeFlagSet                 fModeFlagSet;
115    NameMap                     mNameMap;
116};
117
118}
119
120#endif // RE_PARSER_H
Note: See TracBrowser for help on using the repository browser.