source: trunk/src/xmldecl.h @ 1932

Last change on this file since 1932 was 529, checked in by cameron, 9 years ago

Encoding Name validation.

File size: 4.2 KB
Line 
1/*  xmldecl.h - Parsing XML and text declarations.
2    Copyright (c) 2008,  Robert D. Cameron.
3    Licensed to the public under the Open Software License 3.0.
4    Licensed to International Characters, Inc., under the Academic
5    Free License 3.0.
6
7*/
8#ifndef XML_DECL_H
9#define XML_DECL_H
10
/* XML version named by a document's declaration.  Enumerators take the
   implicit values 0, 1, 2 in the order listed; no_XML_version_value is
   the "no version declared" case. */
11enum XML_version {XML_1_0, XML_1_1, no_XML_version_value};
12/* Documents may be encoded in accord with either XML 1.0 or XML 1.1,
13   or there may be no XML version declared ("no value" in the
14   XML infoset parlance). */
15
/* Character-set family that the code units of an entity are compatible
   with.  Implicit values: ASCII = 0, EBCDIC = 1.  Also used as the
   template parameter of XML_Decl_Parser. */
16enum CodeUnit_Base {ASCII, EBCDIC};
17
18/* Code units of the underlying character set may be either ASCII-compatible
19   or EBCDIC-compatible.
20   ASCII-compatibility means that any code units satisfy the following properties.
21     (1) Any code unit whose numeric value is in the ASCII range (0 to 0x7F)
22         is a complete character sequence (single code unit sequence) representing
23         that ASCII character.
24     (2) Any code units above the ASCII range are non-ASCII code units.
25         No code units or code unit sequences containing a non-ASCII code unit
26         may represent an ASCII character.  (This property ensures that
27         non-ASCII code units may be ignored in making ASCII-based parsing decisions).
28   EBCDIC-compatible, for the purposes of XML, means that the following property
29         applies.
30
31     (*) Code units may form all or part of a code unit sequence representing
32         a character in the Unicode range 0 to 0x9F if and only if that code
33         unit has the same interpretation under the basic EBCDIC code page cp037.
34*/
35
/* Width of one code unit.  Each enumerator's explicit value is the
   width in bytes (1, 2 or 4), so the enumerator can be used directly
   as a byte count. */
36enum CodeUnit_Size {SingleByte = 1, DoubleByte = 2, QuadByte = 4};
37/* ASCII, EBCDIC, ISO-8859-X and UTF-8 have 8-bit code units (singlebytes);
38   The UTF-16 and UCS-2 families have 16-bit code units (doublebyte);
39   The UTF-32/UCS-4 family has 32-bit code units. */
40
/* Octet order of multi-byte (16-bit or 32-bit) code units.  Enumerators
   take the implicit values 0..3 in the order listed. */
41enum CodeUnit_ByteOrder {BigEndian, LittleEndian, Unusual_3412, Unusual_2143};
42/* The byte order of 16-bit or 32-bit code units.  The possibilities are:
43   BigEndian:  UTF-16BE, UCS-2BE, UTF-16 or UCS-2 with a BigEndian byte order mark,
44               UTF-16 without a byte order mark,
45               UTF-32BE/UCS-4BE, or UTF-32/UCS-4 with a BigEndian byte order mark.
46   LittleEndian: UTF-16LE, UCS-2LE, UTF-16 or UCS-2 with a LittleEndian byte order mark,
47                 UTF-32LE/UCS-4LE, or UTF-32/UCS-4 with a LittleEndian byte order mark.
48   Unusual_3412: Unusual octet order of UTF-32/UCS-4 with byte order mark FE FF 00 00
49   Unusual_2143: Unusual octet order of UTF-32/UCS-4 with byte order mark 00 00 FF FE.
50*/
51
/* State of the standalone component of an XML declaration.  Implicit
   values 0, 1, 2 in the order listed.  Standalone_no_value is presumably
   the state when the optional component is absent -- confirm against
   the parser implementation. */
52enum XML_standalone {Standalone_yes, Standalone_no, Standalone_no_value};
53/* Possible values depending on the optional standalone component of an
54   XML declaration. */
55
/* Entity_Info - record of what is known about the encoding and
   declaration of one XML entity.  It has two groups of public fields:
   those derived from the initial byte signature and those derived from
   the XML or text declaration.  The XML_Decl_Parser Read* methods take
   an Entity_Info by non-const reference.
   NOTE(review): the member definitions are not in this header, so the
   notes on individual members below are inferred from the declarations
   and should be confirmed against the implementation. */
56class Entity_Info {
57       
58public: 
59        Entity_Info();
60        ~Entity_Info();
61
62        /*  Information computed by analyzing the 4-byte initial signature
63            of an XML document. */
64        int BOM_units; /* no of initial code units for a Byte Order Mark */
65
66        CodeUnit_Base code_unit_base;   /* ASCII- or EBCDIC-compatible family */
67        CodeUnit_Size code_unit_size;   /* code unit width: 1, 2 or 4 bytes */
68        CodeUnit_ByteOrder byte_order;  /* octet order of multi-byte code units */
69
        /* signature: pointer to the initial bytes of the entity; the
           section comment above speaks of a 4-byte signature.
           Presumably fills in BOM_units, code_unit_base, code_unit_size
           and byte_order -- confirm in the definition. */
70        void AnalyzeSignature(unsigned char * signature);
71
72        /* Information computed from the XML or text declaration. */
73        XML_version version;
74        bool has_encoding_decl; /* presumably true when an encoding declaration was found -- confirm */
75        unsigned char * encoding; /* NOTE(review): ownership and lifetime of this buffer are not visible here -- confirm who frees it */
76        XML_standalone standalone;
77        int content_start;  /* position after BOM and XML/text decl.*/
78       
79private:
        /* Parameter types match the four signature-derived fields
           (base, size, byte order, BOM unit count); presumably assigns
           them in one step -- confirm in the definition. */
80        void set_charset_family(CodeUnit_Base C, CodeUnit_Size S, CodeUnit_ByteOrder O, int B);
81};
82
83
84#include "byteplex.h"
85
/* XML_Decl_Parser - parser for XML declarations and text declarations,
   instantiated per code unit base (C is ASCII or EBCDIC).  It is
   constructed over a Byteplex and its Read* methods take the
   Entity_Info to work on by non-const reference.
   NOTE(review): only declarations are visible in this header; the
   behavioural notes below are inferred from names and signatures and
   should be confirmed against the implementation. */
86template <CodeUnit_Base C>
87class XML_Decl_Parser {
88public:
89        XML_Decl_Parser (Byteplex * b);   /* b: input source; presumably stored in byteplex -- confirm */
90        ~XML_Decl_Parser ();
91       
        /* Entry points.  Each receives the Entity_Info by non-const
           reference, so the callee is able to write parse results
           into it. */
92        void ReadXMLInfo(Entity_Info & e);
93        void ReadTextDeclaration(Entity_Info & e);
94        // Generic version if type of external entity unknown.
95        void ReadXMLorTextDecl(Entity_Info & e);
96       
97
98protected:
99
        /* Input source and buffer cursor state.  Judging by the names,
           positions are kept as a base plus a relative offset with a
           limit; the exact invariants are not visible here. */
100        Byteplex * byteplex;
101        unsigned char * x8data;
102        int buffer_base_pos;
103        int buffer_rel_pos;
104        int buffer_limit_pos;
105       
        /* Cursor helpers.  AbsPos and cur are const, so they cannot
           modify the parser's own data members. */
106        void Advance(int n);
107        int AbsPos() const;
108        unsigned char * cur() const;
109       
110private:
111        /* Bytespace parsing routines for internal use in ReadXMLInfo and
112           ReadTextDeclaration. */
113        void DeclError();
114        void Scan_WS();
115        void ScanToEncodingName();
116        void ParseVersion(Entity_Info & e);
117        void ParseEncoding(Entity_Info & e);
118        void ParseStandalone(Entity_Info & e);
119
120};
121#endif
Note: See TracBrowser for help on using the repository browser.