source: icXML/icXML-devel/src/xercesc/util/regx/UnicodeRangeFactory.cpp @ 2722

Last change on this file since 2722 was 2722, checked in by cameron, 6 years ago

Original Xerces files with import mods for icxercesc

File size: 10.2 KB
Line 
1/*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements.  See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License.  You may obtain a copy of the License at
8 *
9 *      http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18/*
19 * $Id: UnicodeRangeFactory.cpp 678879 2008-07-22 20:05:05Z amassari $
20 */
21
22// ---------------------------------------------------------------------------
23//  Includes
24// ---------------------------------------------------------------------------
25#include <xercesc/util/regx/UnicodeRangeFactory.hpp>
26#include <xercesc/util/regx/TokenFactory.hpp>
27#include <xercesc/util/regx/RangeToken.hpp>
28#include <xercesc/util/regx/RangeTokenMap.hpp>
29#include <xercesc/util/regx/RegxDefs.hpp>
30#include <xercesc/util/regx/XMLUniCharacter.hpp>
31
32XERCES_CPP_NAMESPACE_BEGIN
33
34// ---------------------------------------------------------------------------
35//  Local data
36// ---------------------------------------------------------------------------
37
38const XMLCh uniCategNames[][3] =
39{
40    {chLatin_C, chLatin_n, chNull},     // UNASSIGNED
41    {chLatin_L, chLatin_u, chNull},     // UPPERCASE_LETTER
42    {chLatin_L, chLatin_l, chNull},     // LOWERCASE_LETTER
43    {chLatin_L, chLatin_t, chNull},     // TITLECASE_LETTER
44    {chLatin_L, chLatin_m, chNull},     // MODIFIER_LETTER
45    {chLatin_L, chLatin_o, chNull},     // OTHER_LETTER
46    {chLatin_M, chLatin_n, chNull},     // NON_SPACING_MARK
47    {chLatin_M, chLatin_e, chNull},     // ENCLOSING_MARK
48    {chLatin_M, chLatin_c, chNull},     // COMBINING_SPACING_MARK
49    {chLatin_N, chLatin_d, chNull},     // DECIMAL_DIGIT_NUMBER
50    {chLatin_N, chLatin_l, chNull},     // LETTER_NUMBER
51    {chLatin_N, chLatin_o, chNull},     // OTHER_NUMBER
52    {chLatin_Z, chLatin_s, chNull},     // SPACE_SEPARATOR
53    {chLatin_Z, chLatin_l, chNull},     // LINE_SEPARATOR
54    {chLatin_Z, chLatin_p, chNull},     // PARAGRAPH_SEPARATOR
55    {chLatin_C, chLatin_c, chNull},     // CONTROL
56    {chLatin_C, chLatin_f, chNull},     // FORMAT
57    {chLatin_C, chLatin_o, chNull},     // PRIVATE_USE
58    {chLatin_C, chLatin_s, chNull},     // SURROGATE
59    {chLatin_P, chLatin_d, chNull},     // DASH_PUNCTUATION
60    {chLatin_P, chLatin_s, chNull},     // START_PUNCTUATION
61    {chLatin_P, chLatin_e, chNull},     // END_PUNCTUATION
62    {chLatin_P, chLatin_c, chNull},     // CONNECTOR_PUNCTUATION
63    {chLatin_P, chLatin_o, chNull},     // OTHER_PUNCTUATION
64    {chLatin_S, chLatin_m, chNull},     // MATH_SYMBOL
65    {chLatin_S, chLatin_c, chNull},     // CURRENCY_SYMBOL
66    {chLatin_S, chLatin_k, chNull},     // MODIFIER_SYMBOL
67    {chLatin_S, chLatin_o, chNull},     // OTHER_SYMBOL
68    {chLatin_P, chLatin_i, chNull},     // INITIAL_PUNCTUATION
69    {chLatin_P, chLatin_f, chNull},     // FINAL_PUNCTUATION
70    {chLatin_L, chNull},                // CHAR_LETTER
71    {chLatin_M, chNull},                // CHAR_MARK
72    {chLatin_N, chNull},                // CHAR_NUMBER
73    {chLatin_Z, chNull},                // CHAR_SEPARATOR
74    {chLatin_C, chNull},                // CHAR_OTHER
75    {chLatin_P, chNull},                // CHAR_PUNCTUATION
76    {chLatin_S, chNull},                // CHAR_SYMBOL
77};
78
79// ---------------------------------------------------------------------------
80//  UnicodeRangeFactory: Constructors and Destructor
81// ---------------------------------------------------------------------------
82UnicodeRangeFactory::UnicodeRangeFactory()
83{
84}
85
86UnicodeRangeFactory::~UnicodeRangeFactory() {
87
88}
89
90// ---------------------------------------------------------------------------
91//  UnicodeRangeFactory: Range creation methods
92// ---------------------------------------------------------------------------
93void UnicodeRangeFactory::buildRanges(RangeTokenMap *rangeTokMap) {
94
95    if (fRangesCreated)
96        return;
97
98    if (!fKeywordsInitialized) {
99        initializeKeywordMap(rangeTokMap);
100    }
101
102    TokenFactory* tokFactory = rangeTokMap->getTokenFactory();
103    RangeToken* ranges[UNICATEGSIZE];
104    RangeToken* tok;
105
106    for (int i=0; i < UNICATEGSIZE; i++) {
107        ranges[i] = tokFactory->createRange();
108    }
109
110    for (int j=0; j < 0x10000; j++) {
111
112        unsigned short charType = XMLUniCharacter::getType(j);
113
114        ranges[charType]->addRange(j, j);
115        charType = getUniCategory(charType);
116        ranges[charType]->addRange(j, j);
117    }
118
119    ranges[XMLUniCharacter::UNASSIGNED]->addRange(0x10000, Token::UTF16_MAX);
120
121    for (int k=0; k < UNICATEGSIZE; k++) {
122        tok = RangeToken::complementRanges(ranges[k], tokFactory);
123        // build the internal map.
124        tok->createMap();
125        rangeTokMap->setRangeToken(uniCategNames[k], ranges[k]);
126        rangeTokMap->setRangeToken(uniCategNames[k], tok , true);
127    }
128
129    // Create all range
130    tok = tokFactory->createRange();
131    tok->addRange(0, Token::UTF16_MAX);
132    // build the internal map.
133    tok->createMap();
134    rangeTokMap->setRangeToken(fgUniAll, tok);
135
136    // Create alpha range
137    tok = tokFactory->createRange();
138    tok->mergeRanges(ranges[XMLUniCharacter::UPPERCASE_LETTER]);
139    tok->mergeRanges(ranges[XMLUniCharacter::LOWERCASE_LETTER]);
140    tok->mergeRanges(ranges[XMLUniCharacter::OTHER_LETTER]);
141    // build the internal map.
142    tok->createMap();
143    rangeTokMap->setRangeToken(fgUniIsAlpha, tok);
144
145    // Create alpha-num range
146    RangeToken* alnumTok = tokFactory->createRange();
147    alnumTok->mergeRanges(tok);
148    alnumTok->mergeRanges(ranges[XMLUniCharacter::DECIMAL_DIGIT_NUMBER]);
149    // build the internal map.
150    alnumTok->createMap();
151    rangeTokMap->setRangeToken(fgUniIsAlnum, alnumTok);
152
153    // Create word range
154    tok = tokFactory->createRange();
155    tok->mergeRanges(alnumTok);
156    tok->addRange(chUnderscore, chUnderscore);
157    // build the internal map.
158    tok->createMap();
159    rangeTokMap->setRangeToken(fgUniIsWord, tok);
160
161    tok = RangeToken::complementRanges(tok, tokFactory);
162    // build the internal map.
163    tok->createMap();
164    rangeTokMap->setRangeToken(fgUniIsWord, tok , true);
165
166    // Create assigned range
167    tok = RangeToken::complementRanges(
168                ranges[XMLUniCharacter::UNASSIGNED],
169                tokFactory,
170                tokFactory->getMemoryManager());
171    // build the internal map.
172    tok->createMap();
173    rangeTokMap->setRangeToken(fgUniAssigned,tok);
174
175    // Create space range
176    tok = tokFactory->createRange();
177    tok->mergeRanges(ranges[XMLUniCharacter::SPACE_SEPARATOR]);
178    tok->mergeRanges(ranges[XMLUniCharacter::LINE_SEPARATOR]);
179    //tok->mergeRanges(ranges[XMLUniCharacter::PARAGRAPH_SEPARATOR]);
180    // build the internal map.
181    tok->createMap();
182    rangeTokMap->setRangeToken(fgUniIsSpace, tok);
183
184    tok = RangeToken::complementRanges(tok, tokFactory);
185    // build the internal map.
186    tok->createMap();
187    rangeTokMap->setRangeToken(fgUniIsSpace, tok , true);
188
189    RangeToken* const dummyToken =
190        tokFactory->createRange();
191
192    dummyToken->addRange(-1, -2);
193    dummyToken->createMap();
194
195    // build the internal maps.
196    for (int l=0; l < UNICATEGSIZE; l++) {
197        ranges[l]->createMap();
198        ranges[l]->setCaseInsensitiveToken(dummyToken);
199    }
200
201    fRangesCreated = true;
202}
203
204// ---------------------------------------------------------------------------
205//  UnicodeRangeFactory: Initialization methods
206// ---------------------------------------------------------------------------
207void UnicodeRangeFactory::initializeKeywordMap(RangeTokenMap *rangeTokMap) {
208
209    if (fKeywordsInitialized)
210        return;
211
212    for (int k=0; k < UNICATEGSIZE; k++) {
213        rangeTokMap->addKeywordMap(uniCategNames[k], fgUnicodeCategory);
214    }
215
216    rangeTokMap->addKeywordMap(fgUniAll, fgUnicodeCategory);
217    rangeTokMap->addKeywordMap(fgUniIsAlpha, fgUnicodeCategory);
218    rangeTokMap->addKeywordMap(fgUniIsAlnum, fgUnicodeCategory);
219    rangeTokMap->addKeywordMap(fgUniIsWord, fgUnicodeCategory);
220    rangeTokMap->addKeywordMap(fgUniAssigned, fgUnicodeCategory);
221    rangeTokMap->addKeywordMap(fgUniIsSpace, fgUnicodeCategory);
222
223    fKeywordsInitialized = true;
224}
225
226// ---------------------------------------------------------------------------
227//  UnicodeRangeFactory: Helper methods
228// ---------------------------------------------------------------------------
229unsigned short UnicodeRangeFactory::getUniCategory(const unsigned short type)
230{
231    switch(type) {
232    case XMLUniCharacter::UPPERCASE_LETTER:
233    case XMLUniCharacter::LOWERCASE_LETTER:
234    case XMLUniCharacter::TITLECASE_LETTER:
235    case XMLUniCharacter::MODIFIER_LETTER:
236    case XMLUniCharacter::OTHER_LETTER:
237        return CHAR_LETTER;
238    case XMLUniCharacter::NON_SPACING_MARK:
239    case XMLUniCharacter::COMBINING_SPACING_MARK:
240    case XMLUniCharacter::ENCLOSING_MARK:
241        return CHAR_MARK;
242    case XMLUniCharacter::DECIMAL_DIGIT_NUMBER:
243    case XMLUniCharacter::LETTER_NUMBER:
244    case XMLUniCharacter::OTHER_NUMBER:
245        return CHAR_NUMBER;
246    case XMLUniCharacter::SPACE_SEPARATOR:
247    case XMLUniCharacter::LINE_SEPARATOR:
248    case XMLUniCharacter::PARAGRAPH_SEPARATOR:
249        return CHAR_SEPARATOR;
250    case XMLUniCharacter::CONTROL:
251    case XMLUniCharacter::FORMAT:
252    case XMLUniCharacter::SURROGATE:
253    case XMLUniCharacter::PRIVATE_USE:
254    case XMLUniCharacter::UNASSIGNED:
255        return CHAR_OTHER;
256    case XMLUniCharacter::CONNECTOR_PUNCTUATION:
257    case XMLUniCharacter::DASH_PUNCTUATION:
258    case XMLUniCharacter::START_PUNCTUATION:
259    case XMLUniCharacter::END_PUNCTUATION:
260    case XMLUniCharacter::OTHER_PUNCTUATION:
261    case XMLUniCharacter::INITIAL_PUNCTUATION:
262    case XMLUniCharacter::FINAL_PUNCTUATION:
263        return CHAR_PUNCTUATION;
264    case XMLUniCharacter::MATH_SYMBOL:
265    case XMLUniCharacter::CURRENCY_SYMBOL:
266    case XMLUniCharacter::MODIFIER_SYMBOL:
267    case XMLUniCharacter::OTHER_SYMBOL:
268        return CHAR_SYMBOL;
269    }
270
271    return 0;
272}
273
274XERCES_CPP_NAMESPACE_END
275
276/**
277  * End of file UnicodeRangeFactory.cpp
278  */
Note: See TracBrowser for help on using the repository browser.