source: icXML/icXML-devel/src/xercesc/framework/XMLRecognizer.cpp @ 2722

Last change on this file since 2722 was 2722, checked in by cameron, 6 years ago

Original Xerces files with import mods for icxercesc

File size: 10.6 KB
Line 
1/*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements.  See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License.  You may obtain a copy of the License at
8 *
9 *      http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18/**
19 *  $Id: XMLRecognizer.cpp 555320 2007-07-11 16:05:13Z amassari $
20 */
21
22
23// ---------------------------------------------------------------------------
24//  Includes
25// ---------------------------------------------------------------------------
26#include <icxercesc/framework/XMLRecognizer.hpp>
27#include <xercesc/util/RuntimeException.hpp>
28#include <icxercesc/util/XMLString.hpp>
29
30XERCES_CPP_NAMESPACE_BEGIN
31
32// ---------------------------------------------------------------------------
33//  Local data
34//
35//  gEncodingNameMap
36//      This array maps the Encodings enum values to their canonical names.
37//      Be sure to keep this in sync with that enum!
38// ---------------------------------------------------------------------------
39static const XMLCh* gEncodingNameMap[XMLRecognizer::Encodings_Count] =
40{
41    XMLUni::fgEBCDICEncodingString
42    , XMLUni::fgUCS4BEncodingString
43    , XMLUni::fgUCS4LEncodingString
44    , XMLUni::fgUSASCIIEncodingString
45    , XMLUni::fgUTF8EncodingString
46    , XMLUni::fgUTF16BEncodingString
47    , XMLUni::fgUTF16LEncodingString
48    , XMLUni::fgXMLChEncodingString
49};
50
51
52
53// ---------------------------------------------------------------------------
54//  XMLRecognizer: Public, const static data
55//
56//  gXXXPre
57//  gXXXPreLen
58//      The byte sequence prefixes for all of the encodings that we can
59//      auto sense. Also included is the length of each sequence.
60// ---------------------------------------------------------------------------
61const char           XMLRecognizer::fgASCIIPre[]  = { 0x3C, 0x3F, 0x78, 0x6D, 0x6C, 0x20 };
62const XMLSize_t      XMLRecognizer::fgASCIIPreLen = 6;
63const XMLByte        XMLRecognizer::fgEBCDICPre[] = { 0x4C, 0x6F, 0xA7, 0x94, 0x93, 0x40 };
64const XMLSize_t      XMLRecognizer::fgEBCDICPreLen = 6;
65const XMLByte        XMLRecognizer::fgUTF16BPre[] = { 0x00, 0x3C, 0x00, 0x3F, 0x00, 0x78, 0x00, 0x6D, 0x00, 0x6C, 0x00, 0x20 };
66const XMLByte        XMLRecognizer::fgUTF16LPre[] = { 0x3C, 0x00, 0x3F, 0x00, 0x78, 0x00, 0x6D, 0x00, 0x6C, 0x00, 0x20, 0x00 };
67const XMLSize_t      XMLRecognizer::fgUTF16PreLen = 12;
68const XMLByte        XMLRecognizer::fgUCS4BPre[]  =
69{
70        0x00, 0x00, 0x00, 0x3C, 0x00, 0x00, 0x00, 0x3F
71    ,   0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6D
72    ,   0x00, 0x00, 0x00, 0x6C, 0x00, 0x00, 0x00, 0x20
73};
74const XMLByte        XMLRecognizer::fgUCS4LPre[]  =
75{
76        0x3C, 0x00, 0x00, 0x00, 0x3F, 0x00, 0x00, 0x00
77    ,   0x78, 0x00, 0x00, 0x00, 0x6D, 0x00, 0x00, 0x00
78    ,   0x6C, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00
79};
80const XMLSize_t      XMLRecognizer::fgUCS4PreLen = 24;
81
82const char           XMLRecognizer::fgUTF8BOM[] = {(char)0xEF, (char)0xBB, (char)0xBF};
83const XMLSize_t      XMLRecognizer::fgUTF8BOMLen = 3;
84
85// ---------------------------------------------------------------------------
86//  XMLRecognizer: Encoding recognition methods
87// ---------------------------------------------------------------------------
88XMLRecognizer::Encodings
89XMLRecognizer::basicEncodingProbe(  const   XMLByte* const  rawBuffer
90                                    , const XMLSize_t       rawByteCount)
91{
92    //
93    //  As an optimization to check the 90% case, check first for the ASCII
94    //  sequence '<?xml', which means its either US-ASCII, UTF-8, or some
95    //  other encoding that we don't do manually but which happens to share
96    //  the US-ASCII code points for these characters. So just return UTF-8
97    //  to get us through the first line.
98    //
99    if (rawByteCount >= fgASCIIPreLen)
100    {
101        if (!memcmp(rawBuffer, fgASCIIPre, fgASCIIPreLen))
102            return UTF_8;
103    }
104
105    //
106    //  If the count of raw bytes is less than 2, it cannot be anything
107    //  we understand, so return UTF-8 as a fallback.
108    //
109    if (rawByteCount < 2)
110        return UTF_8;
111         
112    // 
113    //  We have two to four bytes, so lets check for a UTF-16 BOM. That
114    //  is quick to check and enough to identify two major encodings.   
115    //
116
117    if (rawByteCount < 4)
118    {
119        if ((rawBuffer[0] == 0xFE) && (rawBuffer[1] == 0xFF))
120            return UTF_16B;
121        else if ((rawBuffer[0] == 0xFF) && (rawBuffer[1] == 0xFE))
122            return UTF_16L;
123        else 
124            return UTF_8;
125    }
126
127    /***
128     *    F.1 Detection Without External Encoding Information
129     *
130     *    Because each XML entity not accompanied by external encoding information and
131     *    not in UTF-8 or UTF-16 encoding must begin with an XML encoding declaration,
132     *    in which the first characters must be '<?xml', any conforming processor can detect,
133     *    after two to four octets of input, which of the following cases apply.
134     *
135     *    In reading this list, it may help to know that in UCS-4, '<' is "#x0000003C" and
136     *    '?' is "#x0000003F", and the Byte Order Mark required of UTF-16 data streams is
137     *    "#xFEFF". The notation ## is used to denote any byte value except that two consecutive
138     *    ##s cannot be both 00.
139     *
140     *    With a Byte Order Mark:
141     *
142     *    00 00 FE FF           UCS-4,    big-endian machine    (1234 order)
143     *    FF FE 00 00           UCS-4,    little-endian machine (4321 order)
144     *    00 00 FF FE           UCS-4,    unusual octet order   (2143)
145     *    FE FF 00 00           UCS-4,    unusual octet order   (3412)
146     *    FE FF ## ##           UTF-16,   big-endian
147     *    FF FE ## ##           UTF-16,   little-endian
148     *    EF BB BF              UTF-8
149     *
150     ***/
151
152    //
153    //  We have at least four bytes, so we can check all BOM
154    //  for UCS-4BE, UCS-4LE, UTF-16BE and UTF-16LE as well.
155    //
156    if ((rawBuffer[0] == 0x00) && (rawBuffer[1] == 0x00) && (rawBuffer[2] == 0xFE) && (rawBuffer[3] == 0xFF))
157        return UCS_4B;
158    else if ((rawBuffer[0] == 0xFF) && (rawBuffer[1] == 0xFE) && (rawBuffer[2] == 0x00) && (rawBuffer[3] == 0x00))
159        return UCS_4L;
160    else if ((rawBuffer[0] == 0xFE) && (rawBuffer[1] == 0xFF))
161        return UTF_16B;
162    else if ((rawBuffer[0] == 0xFF) && (rawBuffer[1] == 0xFE))
163        return UTF_16L;
164
165    //
166    //  We have at least 4 bytes. So lets check the 4 byte sequences that
167    //  indicate other UTF-16 and UCS encodings.
168    //
169    if ((rawBuffer[0] == 0x00) || (rawBuffer[0] == 0x3C))
170    {
171        if (rawByteCount >= fgUCS4PreLen && !memcmp(rawBuffer, fgUCS4BPre, fgUCS4PreLen))
172            return UCS_4B;
173        else if (rawByteCount >= fgUCS4PreLen && !memcmp(rawBuffer, fgUCS4LPre, fgUCS4PreLen))
174            return UCS_4L;
175        else if (rawByteCount >= fgUTF16PreLen && !memcmp(rawBuffer, fgUTF16BPre, fgUTF16PreLen))
176            return UTF_16B;
177        else if (rawByteCount >= fgUTF16PreLen && !memcmp(rawBuffer, fgUTF16LPre, fgUTF16PreLen))
178            return UTF_16L;
179    }
180
181    //
182    //  See if we have enough bytes to possibly match the EBCDIC prefix.
183    //  If so, try it.
184    //
185    if (rawByteCount > fgEBCDICPreLen)
186    {
187        if (!memcmp(rawBuffer, fgEBCDICPre, fgEBCDICPreLen))
188            return EBCDIC;
189    }
190
191    //
192    //  Does not seem to be anything we know, so go with UTF-8 to get at
193    //  least through the first line and see what it really is.
194    //
195    return UTF_8;
196}
197
198
199XMLRecognizer::Encodings
200XMLRecognizer::encodingForName(const XMLCh* const encName)
201{
202    //
203    //  Compare the passed string, assume input string is already uppercased,
204    //  to the variations that we recognize.
205    //
206    //  !!NOTE: Note that we don't handle EBCDIC here because we don't handle
207    //  that one ourselves. It is allowed to fall into 'other'.
208    //
209    if (encName == XMLUni::fgXMLChEncodingString ||
210        !XMLString::compareString(encName, XMLUni::fgXMLChEncodingString))
211    {
212        return XMLRecognizer::XERCES_XMLCH;
213    }
214    else if (!XMLString::compareString(encName, XMLUni::fgUTF8EncodingString)
215         ||  !XMLString::compareString(encName, XMLUni::fgUTF8EncodingString2))
216    {
217        return XMLRecognizer::UTF_8;
218    }
219    else if (!XMLString::compareString(encName, XMLUni::fgUSASCIIEncodingString)
220         ||  !XMLString::compareString(encName, XMLUni::fgUSASCIIEncodingString2)
221         ||  !XMLString::compareString(encName, XMLUni::fgUSASCIIEncodingString3)
222         ||  !XMLString::compareString(encName, XMLUni::fgUSASCIIEncodingString4))
223    {
224        return XMLRecognizer::US_ASCII;
225    }
226    else if (!XMLString::compareString(encName, XMLUni::fgUTF16LEncodingString)
227         ||  !XMLString::compareString(encName, XMLUni::fgUTF16LEncodingString2))
228    {
229        return XMLRecognizer::UTF_16L;
230    }
231    else if (!XMLString::compareString(encName, XMLUni::fgUTF16BEncodingString)
232         ||  !XMLString::compareString(encName, XMLUni::fgUTF16BEncodingString2))
233    {
234        return XMLRecognizer::UTF_16B;
235    }
236    else if (!XMLString::compareString(encName, XMLUni::fgUTF16EncodingString))
237    {
238        return XMLPlatformUtils::fgXMLChBigEndian?XMLRecognizer::UTF_16B:XMLRecognizer::UTF_16L;
239    }
240    else if (!XMLString::compareString(encName, XMLUni::fgUCS4LEncodingString)
241         ||  !XMLString::compareString(encName, XMLUni::fgUCS4LEncodingString2))
242    {
243        return XMLRecognizer::UCS_4L;
244    }
245    else if (!XMLString::compareString(encName, XMLUni::fgUCS4BEncodingString)
246         ||  !XMLString::compareString(encName, XMLUni::fgUCS4BEncodingString2))
247    {
248        return XMLRecognizer::UCS_4B;
249    }
250    else if (!XMLString::compareString(encName, XMLUni::fgUCS4EncodingString))
251    {
252        return XMLPlatformUtils::fgXMLChBigEndian?XMLRecognizer::UCS_4B:XMLRecognizer::UCS_4L;
253    }
254
255    // Return 'other' since we don't recognizer it
256    return XMLRecognizer::OtherEncoding;
257}
258
259
260const XMLCh*
261XMLRecognizer::nameForEncoding(const XMLRecognizer::Encodings theEncoding
262                               , MemoryManager* const manager)
263{
264    if (theEncoding >= Encodings_Count)
265        ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::XMLRec_UnknownEncoding, manager);
266
267    return gEncodingNameMap[theEncoding];
268}
269
270XERCES_CPP_NAMESPACE_END
Note: See TracBrowser for help on using the repository browser.