1 | /* |
---|
2 | * Licensed to the Apache Software Foundation (ASF) under one or more |
---|
3 | * contributor license agreements. See the NOTICE file distributed with |
---|
4 | * this work for additional information regarding copyright ownership. |
---|
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 |
---|
6 | * (the "License"); you may not use this file except in compliance with |
---|
7 | * the License. You may obtain a copy of the License at |
---|
8 | * |
---|
9 | * http://www.apache.org/licenses/LICENSE-2.0 |
---|
10 | * |
---|
11 | * Unless required by applicable law or agreed to in writing, software |
---|
12 | * distributed under the License is distributed on an "AS IS" BASIS, |
---|
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
---|
14 | * See the License for the specific language governing permissions and |
---|
15 | * limitations under the License. |
---|
16 | */ |
---|
17 | |
---|
18 | /** |
---|
19 | * $Id: XMLRecognizer.cpp 555320 2007-07-11 16:05:13Z amassari $ |
---|
20 | */ |
---|
21 | |
---|
22 | |
---|
23 | // --------------------------------------------------------------------------- |
---|
24 | // Includes |
---|
25 | // --------------------------------------------------------------------------- |
---|
26 | #include <icxercesc/framework/XMLRecognizer.hpp> |
---|
27 | #include <xercesc/util/RuntimeException.hpp> |
---|
28 | #include <icxercesc/util/XMLString.hpp> |
---|
29 | |
---|
30 | XERCES_CPP_NAMESPACE_BEGIN |
---|
31 | |
---|
32 | // --------------------------------------------------------------------------- |
---|
33 | // Local data |
---|
34 | // |
---|
35 | // gEncodingNameMap |
---|
36 | // This array maps the Encodings enum values to their canonical names. |
---|
37 | // Be sure to keep this in sync with that enum! |
---|
38 | // --------------------------------------------------------------------------- |
---|
39 | static const XMLCh* gEncodingNameMap[XMLRecognizer::Encodings_Count] = |
---|
40 | { |
---|
41 | XMLUni::fgEBCDICEncodingString |
---|
42 | , XMLUni::fgUCS4BEncodingString |
---|
43 | , XMLUni::fgUCS4LEncodingString |
---|
44 | , XMLUni::fgUSASCIIEncodingString |
---|
45 | , XMLUni::fgUTF8EncodingString |
---|
46 | , XMLUni::fgUTF16BEncodingString |
---|
47 | , XMLUni::fgUTF16LEncodingString |
---|
48 | , XMLUni::fgXMLChEncodingString |
---|
49 | }; |
---|
50 | |
---|
51 | |
---|
52 | |
---|
53 | // --------------------------------------------------------------------------- |
---|
54 | // XMLRecognizer: Public, const static data |
---|
55 | // |
---|
56 | // gXXXPre |
---|
57 | // gXXXPreLen |
---|
58 | // The byte sequence prefixes for all of the encodings that we can |
---|
59 | // auto sense. Also included is the length of each sequence. |
---|
60 | // --------------------------------------------------------------------------- |
---|
61 | const char XMLRecognizer::fgASCIIPre[] = { 0x3C, 0x3F, 0x78, 0x6D, 0x6C, 0x20 }; |
---|
62 | const XMLSize_t XMLRecognizer::fgASCIIPreLen = 6; |
---|
63 | const XMLByte XMLRecognizer::fgEBCDICPre[] = { 0x4C, 0x6F, 0xA7, 0x94, 0x93, 0x40 }; |
---|
64 | const XMLSize_t XMLRecognizer::fgEBCDICPreLen = 6; |
---|
65 | const XMLByte XMLRecognizer::fgUTF16BPre[] = { 0x00, 0x3C, 0x00, 0x3F, 0x00, 0x78, 0x00, 0x6D, 0x00, 0x6C, 0x00, 0x20 }; |
---|
66 | const XMLByte XMLRecognizer::fgUTF16LPre[] = { 0x3C, 0x00, 0x3F, 0x00, 0x78, 0x00, 0x6D, 0x00, 0x6C, 0x00, 0x20, 0x00 }; |
---|
67 | const XMLSize_t XMLRecognizer::fgUTF16PreLen = 12; |
---|
68 | const XMLByte XMLRecognizer::fgUCS4BPre[] = |
---|
69 | { |
---|
70 | 0x00, 0x00, 0x00, 0x3C, 0x00, 0x00, 0x00, 0x3F |
---|
71 | , 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6D |
---|
72 | , 0x00, 0x00, 0x00, 0x6C, 0x00, 0x00, 0x00, 0x20 |
---|
73 | }; |
---|
74 | const XMLByte XMLRecognizer::fgUCS4LPre[] = |
---|
75 | { |
---|
76 | 0x3C, 0x00, 0x00, 0x00, 0x3F, 0x00, 0x00, 0x00 |
---|
77 | , 0x78, 0x00, 0x00, 0x00, 0x6D, 0x00, 0x00, 0x00 |
---|
78 | , 0x6C, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00 |
---|
79 | }; |
---|
80 | const XMLSize_t XMLRecognizer::fgUCS4PreLen = 24; |
---|
81 | |
---|
82 | const char XMLRecognizer::fgUTF8BOM[] = {(char)0xEF, (char)0xBB, (char)0xBF}; |
---|
83 | const XMLSize_t XMLRecognizer::fgUTF8BOMLen = 3; |
---|
84 | |
---|
85 | // --------------------------------------------------------------------------- |
---|
86 | // XMLRecognizer: Encoding recognition methods |
---|
87 | // --------------------------------------------------------------------------- |
---|
88 | XMLRecognizer::Encodings |
---|
89 | XMLRecognizer::basicEncodingProbe( const XMLByte* const rawBuffer |
---|
90 | , const XMLSize_t rawByteCount) |
---|
91 | { |
---|
92 | // |
---|
93 | // As an optimization to check the 90% case, check first for the ASCII |
---|
94 | // sequence '<?xml', which means its either US-ASCII, UTF-8, or some |
---|
95 | // other encoding that we don't do manually but which happens to share |
---|
96 | // the US-ASCII code points for these characters. So just return UTF-8 |
---|
97 | // to get us through the first line. |
---|
98 | // |
---|
99 | if (rawByteCount >= fgASCIIPreLen) |
---|
100 | { |
---|
101 | if (!memcmp(rawBuffer, fgASCIIPre, fgASCIIPreLen)) |
---|
102 | return UTF_8; |
---|
103 | } |
---|
104 | |
---|
105 | // |
---|
106 | // If the count of raw bytes is less than 2, it cannot be anything |
---|
107 | // we understand, so return UTF-8 as a fallback. |
---|
108 | // |
---|
109 | if (rawByteCount < 2) |
---|
110 | return UTF_8; |
---|
111 | |
---|
112 | // |
---|
113 | // We have two to four bytes, so lets check for a UTF-16 BOM. That |
---|
114 | // is quick to check and enough to identify two major encodings. |
---|
115 | // |
---|
116 | |
---|
117 | if (rawByteCount < 4) |
---|
118 | { |
---|
119 | if ((rawBuffer[0] == 0xFE) && (rawBuffer[1] == 0xFF)) |
---|
120 | return UTF_16B; |
---|
121 | else if ((rawBuffer[0] == 0xFF) && (rawBuffer[1] == 0xFE)) |
---|
122 | return UTF_16L; |
---|
123 | else |
---|
124 | return UTF_8; |
---|
125 | } |
---|
126 | |
---|
127 | /*** |
---|
128 | * F.1 Detection Without External Encoding Information |
---|
129 | * |
---|
130 | * Because each XML entity not accompanied by external encoding information and |
---|
131 | * not in UTF-8 or UTF-16 encoding must begin with an XML encoding declaration, |
---|
132 | * in which the first characters must be '<?xml', any conforming processor can detect, |
---|
133 | * after two to four octets of input, which of the following cases apply. |
---|
134 | * |
---|
135 | * In reading this list, it may help to know that in UCS-4, '<' is "#x0000003C" and |
---|
136 | * '?' is "#x0000003F", and the Byte Order Mark required of UTF-16 data streams is |
---|
137 | * "#xFEFF". The notation ## is used to denote any byte value except that two consecutive |
---|
138 | * ##s cannot be both 00. |
---|
139 | * |
---|
140 | * With a Byte Order Mark: |
---|
141 | * |
---|
142 | * 00 00 FE FF UCS-4, big-endian machine (1234 order) |
---|
143 | * FF FE 00 00 UCS-4, little-endian machine (4321 order) |
---|
144 | * 00 00 FF FE UCS-4, unusual octet order (2143) |
---|
145 | * FE FF 00 00 UCS-4, unusual octet order (3412) |
---|
146 | * FE FF ## ## UTF-16, big-endian |
---|
147 | * FF FE ## ## UTF-16, little-endian |
---|
148 | * EF BB BF UTF-8 |
---|
149 | * |
---|
150 | ***/ |
---|
151 | |
---|
152 | // |
---|
153 | // We have at least four bytes, so we can check all BOM |
---|
154 | // for UCS-4BE, UCS-4LE, UTF-16BE and UTF-16LE as well. |
---|
155 | // |
---|
156 | if ((rawBuffer[0] == 0x00) && (rawBuffer[1] == 0x00) && (rawBuffer[2] == 0xFE) && (rawBuffer[3] == 0xFF)) |
---|
157 | return UCS_4B; |
---|
158 | else if ((rawBuffer[0] == 0xFF) && (rawBuffer[1] == 0xFE) && (rawBuffer[2] == 0x00) && (rawBuffer[3] == 0x00)) |
---|
159 | return UCS_4L; |
---|
160 | else if ((rawBuffer[0] == 0xFE) && (rawBuffer[1] == 0xFF)) |
---|
161 | return UTF_16B; |
---|
162 | else if ((rawBuffer[0] == 0xFF) && (rawBuffer[1] == 0xFE)) |
---|
163 | return UTF_16L; |
---|
164 | |
---|
165 | // |
---|
166 | // We have at least 4 bytes. So lets check the 4 byte sequences that |
---|
167 | // indicate other UTF-16 and UCS encodings. |
---|
168 | // |
---|
169 | if ((rawBuffer[0] == 0x00) || (rawBuffer[0] == 0x3C)) |
---|
170 | { |
---|
171 | if (rawByteCount >= fgUCS4PreLen && !memcmp(rawBuffer, fgUCS4BPre, fgUCS4PreLen)) |
---|
172 | return UCS_4B; |
---|
173 | else if (rawByteCount >= fgUCS4PreLen && !memcmp(rawBuffer, fgUCS4LPre, fgUCS4PreLen)) |
---|
174 | return UCS_4L; |
---|
175 | else if (rawByteCount >= fgUTF16PreLen && !memcmp(rawBuffer, fgUTF16BPre, fgUTF16PreLen)) |
---|
176 | return UTF_16B; |
---|
177 | else if (rawByteCount >= fgUTF16PreLen && !memcmp(rawBuffer, fgUTF16LPre, fgUTF16PreLen)) |
---|
178 | return UTF_16L; |
---|
179 | } |
---|
180 | |
---|
181 | // |
---|
182 | // See if we have enough bytes to possibly match the EBCDIC prefix. |
---|
183 | // If so, try it. |
---|
184 | // |
---|
185 | if (rawByteCount > fgEBCDICPreLen) |
---|
186 | { |
---|
187 | if (!memcmp(rawBuffer, fgEBCDICPre, fgEBCDICPreLen)) |
---|
188 | return EBCDIC; |
---|
189 | } |
---|
190 | |
---|
191 | // |
---|
192 | // Does not seem to be anything we know, so go with UTF-8 to get at |
---|
193 | // least through the first line and see what it really is. |
---|
194 | // |
---|
195 | return UTF_8; |
---|
196 | } |
---|
197 | |
---|
198 | |
---|
199 | XMLRecognizer::Encodings |
---|
200 | XMLRecognizer::encodingForName(const XMLCh* const encName) |
---|
201 | { |
---|
202 | // |
---|
203 | // Compare the passed string, assume input string is already uppercased, |
---|
204 | // to the variations that we recognize. |
---|
205 | // |
---|
206 | // !!NOTE: Note that we don't handle EBCDIC here because we don't handle |
---|
207 | // that one ourselves. It is allowed to fall into 'other'. |
---|
208 | // |
---|
209 | if (encName == XMLUni::fgXMLChEncodingString || |
---|
210 | !XMLString::compareString(encName, XMLUni::fgXMLChEncodingString)) |
---|
211 | { |
---|
212 | return XMLRecognizer::XERCES_XMLCH; |
---|
213 | } |
---|
214 | else if (!XMLString::compareString(encName, XMLUni::fgUTF8EncodingString) |
---|
215 | || !XMLString::compareString(encName, XMLUni::fgUTF8EncodingString2)) |
---|
216 | { |
---|
217 | return XMLRecognizer::UTF_8; |
---|
218 | } |
---|
219 | else if (!XMLString::compareString(encName, XMLUni::fgUSASCIIEncodingString) |
---|
220 | || !XMLString::compareString(encName, XMLUni::fgUSASCIIEncodingString2) |
---|
221 | || !XMLString::compareString(encName, XMLUni::fgUSASCIIEncodingString3) |
---|
222 | || !XMLString::compareString(encName, XMLUni::fgUSASCIIEncodingString4)) |
---|
223 | { |
---|
224 | return XMLRecognizer::US_ASCII; |
---|
225 | } |
---|
226 | else if (!XMLString::compareString(encName, XMLUni::fgUTF16LEncodingString) |
---|
227 | || !XMLString::compareString(encName, XMLUni::fgUTF16LEncodingString2)) |
---|
228 | { |
---|
229 | return XMLRecognizer::UTF_16L; |
---|
230 | } |
---|
231 | else if (!XMLString::compareString(encName, XMLUni::fgUTF16BEncodingString) |
---|
232 | || !XMLString::compareString(encName, XMLUni::fgUTF16BEncodingString2)) |
---|
233 | { |
---|
234 | return XMLRecognizer::UTF_16B; |
---|
235 | } |
---|
236 | else if (!XMLString::compareString(encName, XMLUni::fgUTF16EncodingString)) |
---|
237 | { |
---|
238 | return XMLPlatformUtils::fgXMLChBigEndian?XMLRecognizer::UTF_16B:XMLRecognizer::UTF_16L; |
---|
239 | } |
---|
240 | else if (!XMLString::compareString(encName, XMLUni::fgUCS4LEncodingString) |
---|
241 | || !XMLString::compareString(encName, XMLUni::fgUCS4LEncodingString2)) |
---|
242 | { |
---|
243 | return XMLRecognizer::UCS_4L; |
---|
244 | } |
---|
245 | else if (!XMLString::compareString(encName, XMLUni::fgUCS4BEncodingString) |
---|
246 | || !XMLString::compareString(encName, XMLUni::fgUCS4BEncodingString2)) |
---|
247 | { |
---|
248 | return XMLRecognizer::UCS_4B; |
---|
249 | } |
---|
250 | else if (!XMLString::compareString(encName, XMLUni::fgUCS4EncodingString)) |
---|
251 | { |
---|
252 | return XMLPlatformUtils::fgXMLChBigEndian?XMLRecognizer::UCS_4B:XMLRecognizer::UCS_4L; |
---|
253 | } |
---|
254 | |
---|
255 | // Return 'other' since we don't recognizer it |
---|
256 | return XMLRecognizer::OtherEncoding; |
---|
257 | } |
---|
258 | |
---|
259 | |
---|
260 | const XMLCh* |
---|
261 | XMLRecognizer::nameForEncoding(const XMLRecognizer::Encodings theEncoding |
---|
262 | , MemoryManager* const manager) |
---|
263 | { |
---|
264 | if (theEncoding >= Encodings_Count) |
---|
265 | ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::XMLRec_UnknownEncoding, manager); |
---|
266 | |
---|
267 | return gEncodingNameMap[theEncoding]; |
---|
268 | } |
---|
269 | |
---|
270 | XERCES_CPP_NAMESPACE_END |
---|