source: trunk/src/xmlmodel.h @ 87

Last change on this file since 87 was 67, checked in by cameron, 11 years ago

Model Processor refactoring

File size: 3.9 KB
Line 
1/*  xmlmodel.h - XML Model Processor
2    Copyright (c) 2007, 2008 Robert D. Cameron
3    Licensed to the public under the Open Software License 3.0.
4    Licensed to International Characters, Inc., under the Academic
5    Free License 3.0.
6
7    The XML Model Processor gathers information that guides
8    interpretation of an XML document as it is processed.
9    This information arises from a variety of sources,
10    including:
11      (a) the document prolog, including
12          (a1) the encoding signature,
13          (a2) the XML declaration (or text declaration for
14               external entities), and
15          (a3) the Document Type Definition (internal and
16               external subsets).
17      FUTURE:
18      (b) XML Schema documents (and/or Relax NG, Schematron)
19      (c) XPath sets specifying information to retrieve.
20*/
21
22#ifndef XMLMODEL_H
23#define XMLMODEL_H
24
25
/* The XML version a document is encoded in accord with: XML 1.0, XML 1.1,
   or no declared version at all ("no value" in XML infoset parlance). */
enum XML_version {
    XML_1_0,
    XML_1_1,
    no_XML_version_value
};
30
/* Family of the underlying character set's code units: either
   ASCII-compatible or EBCDIC-compatible.

   ASCII-compatible means that all code units satisfy the following
   properties:
     (1) Any code unit whose numeric value is in the ASCII range (0 to 0x7F)
         is a complete character sequence (a single code unit sequence)
         representing that ASCII character.
     (2) Any code unit above the ASCII range is a non-ASCII code unit.
         No code unit or code unit sequence containing a non-ASCII code unit
         may represent an ASCII character.  (This property ensures that
         non-ASCII code units may be ignored in making ASCII-based parsing
         decisions.)

   EBCDIC-compatible, for the purposes of XML, means that the following
   property applies:
     (*) A code unit may form all or part of a code unit sequence
         representing a character in the Unicode range 0 to 0x9F if and
         only if that code unit has the same interpretation under the
         basic EBCDIC code page cp037.
*/
enum CodeUnit_Base {
    ASCII,
    EBCDIC
};
48
/* Width of a single code unit; each enumerator's value is the width in bytes.
     SingleByte: 8-bit code units (ASCII, EBCDIC, ISO-8859-X and UTF-8).
     DoubleByte: 16-bit code units (the UTF-16 and UCS-2 families).
     QuadByte:   32-bit code units (the UTF-32/UCS-4 family). */
enum CodeUnit_Size {
    SingleByte = 1,
    DoubleByte = 2,
    QuadByte = 4
};
53
/* Byte order of 16-bit or 32-bit code units.  The possibilities are:
     BigEndian:    UTF-16BE, UCS-2BE, UTF-16 or UCS-2 with a BigEndian byte
                   order mark, UTF-16 without a byte order mark,
                   UTF-32BE/UCS-4BE, or UTF-32/UCS-4 with a BigEndian byte
                   order mark.
     LittleEndian: UTF-16LE, UCS-2LE, UTF-16 or UCS-2 with a LittleEndian
                   byte order mark, UTF-32LE/UCS-4LE, or UTF-32/UCS-4 with
                   a LittleEndian byte order mark.
     Unusual_3412: Unusual octet order of UTF-32/UCS-4 with byte order mark
                   FE FF 00 00.
     Unusual_2143: Unusual octet order of UTF-32/UCS-4 with byte order mark
                   00 00 FF FE.
*/
enum CodeUnit_ByteOrder {
    BigEndian,
    LittleEndian,
    Unusual_3412,
    Unusual_2143
};
64
/* Possible values arising from the optional standalone component of an
   XML declaration. */
enum XML_standalone {
    Standalone_yes,
    Standalone_no,
    Standalone_no_value
};
68
69
70class Model_Info {
71       
72public: 
73        ~Model_Info();
74
75        /*  Information computed by analyzing the 4-byte initial signature
76            of an XML document. */
77        int BOM_units; /* no of initial code units for a Byte Order Mark */
78
79        CodeUnit_Base code_unit_base;
80        CodeUnit_Size code_unit_size;
81        CodeUnit_ByteOrder byte_order; 
82
83        void AnalyzeSignature(unsigned char * signature);
84
85        /* Information computed from the XML or text declaration. */
86        XML_version version;
87        bool has_encoding_decl;
88        unsigned char * encoding;
89        XML_standalone standalone;
90
91private:
92        void set_charset_family(CodeUnit_Base C, CodeUnit_Size S, CodeUnit_ByteOrder O, int B);
93};
94
95#endif /*XMLMODEL_H*/
Note: See TracBrowser for help on using the repository browser.