1 | /* |
---|
2 | * Copyright (c) 2016 International Characters. |
---|
3 | * This software is licensed to the public under the Open Software License 3.0. |
---|
4 | * icgrep is a trademark of International Characters. |
---|
5 | */ |
---|
6 | |
---|
7 | #include <re/re_parser_prosite.h> |
---|
8 | #include <re/re_parser_helper.h> |
---|
9 | #include <re/re_alt.h> |
---|
10 | #include <re/re_any.h> |
---|
11 | #include <re/re_seq.h> |
---|
12 | #include <re/re_start.h> |
---|
13 | #include <re/re_end.h> |
---|
14 | #include <re/re_diff.h> |
---|
15 | #include <re/re_rep.h> |
---|
16 | |
---|
17 | namespace re{ |
---|
18 | |
---|
19 | RE * RE_Parser_PROSITE::parse_RE() { |
---|
20 | return parse_seq(); |
---|
21 | } |
---|
22 | |
---|
23 | RE * RE_Parser_PROSITE::parse_seq() { |
---|
24 | std::vector<RE *> seq; |
---|
25 | for (;;) { |
---|
26 | RE * re = parse_next_item(); |
---|
27 | if (re == nullptr) { |
---|
28 | break; |
---|
29 | } |
---|
30 | re = extend_item(re); |
---|
31 | seq.push_back(re); |
---|
32 | } |
---|
33 | return makeSeq(seq.begin(), seq.end()); |
---|
34 | } |
---|
35 | |
---|
36 | RE * RE_Parser_PROSITE::parse_next_item() { |
---|
37 | RE * re = nullptr; |
---|
38 | if (mCursor.more()) { |
---|
39 | if (*mCursor == '-') { |
---|
40 | mCursor++; |
---|
41 | } |
---|
42 | switch (*mCursor) { |
---|
43 | case ']': case '}': |
---|
44 | ParseFailure("Illegal Input"); |
---|
45 | case '<': { // the N-terminal of the sequence ('<') |
---|
46 | mCursor++; |
---|
47 | return makeStart(); |
---|
48 | } |
---|
49 | case '>': { // the C-terminal of the sequence ('>') |
---|
50 | mCursor++; |
---|
51 | return makeEnd(); |
---|
52 | } |
---|
53 | case '[': { // Ambiguities are indicated by listing between '[ ]' the acceptable amino acids for a given position. |
---|
54 | mCursor++; |
---|
55 | return parse_prosite_alt(); |
---|
56 | } |
---|
57 | case '{': { // Ambiguities are also indicated by listing between '{ }' the amino acids that are not accepted at a given position. |
---|
58 | mCursor++; |
---|
59 | RE * re_temp = parse_prosite_not(); |
---|
60 | return makeDiff(makeAny(), re_temp); |
---|
61 | } |
---|
62 | case 'x': // the 'any' metacharacter |
---|
63 | mCursor++; |
---|
64 | return makeAny(); |
---|
65 | case '.': // ends the pattern |
---|
66 | break; |
---|
67 | default: |
---|
68 | re = createCC(parse_utf8_codepoint()); |
---|
69 | return re; |
---|
70 | } |
---|
71 | } |
---|
72 | return nullptr; |
---|
73 | } |
---|
74 | |
---|
75 | RE * RE_Parser_PROSITE::parse_prosite_alt() { |
---|
76 | std::vector<RE *> alt; |
---|
77 | while (*mCursor != ']') { |
---|
78 | RE * re = nullptr; |
---|
79 | if (*mCursor == '>') { |
---|
80 | re = makeEnd(); |
---|
81 | mCursor++; |
---|
82 | } else { |
---|
83 | re = createCC(parse_utf8_codepoint()); |
---|
84 | } |
---|
85 | alt.push_back(re); |
---|
86 | } |
---|
87 | mCursor++; |
---|
88 | return makeAlt(alt.begin(), alt.end()); |
---|
89 | } |
---|
90 | |
---|
91 | RE * RE_Parser_PROSITE::parse_prosite_not() { |
---|
92 | std::vector<RE *> alt; |
---|
93 | while (*mCursor != '}') { |
---|
94 | RE * re = createCC(parse_utf8_codepoint()); |
---|
95 | alt.push_back(re); |
---|
96 | } |
---|
97 | mCursor++; |
---|
98 | return makeAlt(alt.begin(), alt.end()); |
---|
99 | } |
---|
100 | |
---|
101 | RE * RE_Parser_PROSITE::extend_item(RE * re) { |
---|
102 | if (LLVM_LIKELY(mCursor.more())) { |
---|
103 | if (*mCursor == '(') { |
---|
104 | int lb = 0, ub = 0; |
---|
105 | std::tie(lb, ub) = parse_range_bound(); |
---|
106 | if ((ub != Rep::UNBOUNDED_REP) && (lb > ub)) { |
---|
107 | ParseFailure("Lower bound cannot exceed upper bound in bounded repetition"); |
---|
108 | } |
---|
109 | ++mCursor; |
---|
110 | re = makeRep(re, lb, ub); |
---|
111 | } |
---|
112 | |
---|
113 | } |
---|
114 | return re; |
---|
115 | } |
---|
116 | |
---|
117 | std::pair<int, int> RE_Parser_PROSITE::parse_range_bound() { |
---|
118 | int lower_bound = 0, upper_bound = 0; |
---|
119 | mCursor++; |
---|
120 | lower_bound = RE_Parser::parse_int(); |
---|
121 | if (*mCursor == ')') { |
---|
122 | upper_bound = lower_bound; |
---|
123 | } else if (*mCursor != ',') { |
---|
124 | ParseFailure("Bad lower bound!"); |
---|
125 | } else { |
---|
126 | mCursor++; |
---|
127 | upper_bound = parse_int(); |
---|
128 | if (*mCursor != ')') { |
---|
129 | ParseFailure("Bad upper bound!"); |
---|
130 | } |
---|
131 | } |
---|
132 | return std::make_pair(lower_bound, upper_bound); |
---|
133 | } |
---|
134 | |
---|
135 | } |
---|