source: trunk/src/xmlmodel.h @ 100

Last change on this file since 100 was 100, checked in by lindanl, 11 years ago

Parsers for internal and external entities.

File size: 6.3 KB
Line 
1/*  xmlmodel.h - XML Model Processor
2    Copyright (c) 2007, 2008 Robert D. Cameron
3    Licensed to the public under the Open Software License 3.0.
4    Licensed to International Characters, Inc., under the Academic
5    Free License 3.0.
6
7    The XML Model Processor gathers information that guides
8    interpretation of an XML document as it is processed.
9    This information arises from a variety of sources,
10    including:
11      (a) the document prolog, including
12          (a1) the encoding signature,
13          (a2) the XML declaration (or text declaration for
14               external entities), and
15          (a3) the Document Type Definition (internal and
16               external subsets).
17      FUTURE:
18      (b) XML Schema documents (and/or Relax NG, Schematron)
19      (c) XPath sets specifying information to retrieve.
20*/
21
22#ifndef XMLMODEL_H
23#define XMLMODEL_H
#include <cstring>
#include <iostream>
#include <string>
#include <vector>
#include <ext/hash_map>
28
29using namespace __gnu_cxx;
30using namespace std;
31
/* XML version of a document: it may be encoded in accord with XML 1.0 or
   XML 1.1, or no XML version may be declared at all ("no value" in the
   XML infoset parlance). */
enum XML_version {
  XML_1_0,
  XML_1_1,
  no_XML_version_value
};
36
/* Family of the underlying character set: its code units are either
   ASCII-compatible or EBCDIC-compatible.

   ASCII-compatible means that all code units satisfy these properties:
     (1) Any code unit whose numeric value is in the ASCII range (0 to 0x7F)
         is a complete character sequence (a single code unit sequence)
         representing that ASCII character.
     (2) Any code unit above the ASCII range is a non-ASCII code unit.
         No code unit or code unit sequence containing a non-ASCII code
         unit may represent an ASCII character.  (This ensures that
         non-ASCII code units may be ignored when making ASCII-based
         parsing decisions.)

   EBCDIC-compatible, for the purposes of XML, means that this property
   applies:
     (*) A code unit may form all or part of a code unit sequence
         representing a character in the Unicode range 0 to 0x9F if and
         only if that code unit has the same interpretation under the
         basic EBCDIC code page cp037.
*/
enum CodeUnit_Base {
  ASCII,
  EBCDIC
};
54
/* Width of one code unit; each enumerator's value is the width in bytes.
     SingleByte: 8-bit code units  (ASCII, EBCDIC, ISO-8859-X and UTF-8).
     DoubleByte: 16-bit code units (the UTF-16 and UCS-2 families).
     QuadByte:   32-bit code units (the UTF-32/UCS-4 family). */
enum CodeUnit_Size {
  SingleByte = 1,
  DoubleByte = 2,
  QuadByte = 4
};
59
/* Byte order of 16-bit or 32-bit code units.  The possibilities are:
   BigEndian:    UTF-16BE, UCS-2BE, UTF-16 or UCS-2 with a BigEndian byte
                 order mark, UTF-16 without a byte order mark,
                 UTF-32BE/UCS-4BE, or UTF-32/UCS-4 with a BigEndian byte
                 order mark.
   LittleEndian: UTF-16LE, UCS-2LE, UTF-16 or UCS-2 with a LittleEndian
                 byte order mark, UTF-32LE/UCS-4LE, or UTF-32/UCS-4 with a
                 LittleEndian byte order mark.
   Unusual_3412: unusual octet order of UTF-32/UCS-4 with byte order mark
                 FE FF 00 00.
   Unusual_2143: unusual octet order of UTF-32/UCS-4 with byte order mark
                 00 00 FF FE.
*/
enum CodeUnit_ByteOrder {
  BigEndian,
  LittleEndian,
  Unusual_3412,
  Unusual_2143
};
70
/* Value of the optional standalone component of an XML declaration:
   yes, no, or not present ("no value"). */
enum XML_standalone {
  Standalone_yes,
  Standalone_no,
  Standalone_no_value
};
74
75
76/* Attribute Modeling */
77
/* Attribute types that may be specified in an ATTLIST declaration. */
enum ATT_type {
  CDATA_att,
  ID_att,
  IDREF_att,
  IDREFS_att,
  ENTITY_att,
  ENTITIES_att,
  NMTOKEN_att,
  NMTOKENS_att,
  NOTATION_att,
  enumeration_att
};
81
/* Kinds of attribute default that may appear in an ATTLIST declaration. */
enum ATT_default_kind {
  REQUIRED_att,
  IMPLIED_att,
  FIXED_att,
  DEFAULT_att
};
84
/* Equality predicate for NUL-terminated C strings.  It compares the
   characters pointed to, not the pointer values, so two distinct buffers
   holding the same text compare equal.  It is used in this file as the
   key-equality functor of the hash_map tables keyed by const char *.

   Both arguments must be non-null pointers to NUL-terminated strings, as
   strcmp requires. */
struct eqstr
{
  bool operator()(const char* s1, const char* s2) const
  {
    /* strcmp is declared by <cstring>; call it qualified so the header
       does not depend on a transitive include or on a using-directive. */
    return std::strcmp(s1, s2) == 0;
  }
};
92
93class ATT_info {
94public:
95        int globalATT_id;
96        ATT_type attType;
97        hash_map<const char *, int, hash<const char *>, eqstr > enumValues; /* For NOTATION_att or enumeration_att.*/
98        ATT_default_kind defaultKind;
99        unsigned char * defaultValue;
100        int defaultValueLgth;
101};
102
103
/* Record describing one general entity.
   NOTE(review): the field meanings below are read off the XML entity
   declaration vocabulary; ownership of the char buffers is not established
   here -- confirm against the code that fills this record. */
class GEntity_info {
public:
  /* NOTE(review): presumably the id assigned in the global general-entity
     table -- confirm. */
  int globalGEntity_id;

  /* True for an external entity, false for an internal one. */
  bool is_external;

  /* Replacement text of the entity. */
  char *ReplacementText;

  /* System literal and public-identifier literal of the declaration. */
  char *systemLiteral;
  char *pubidLiteral;

  /* NOTE(review): presumably the notation name from an NDATA declaration
     -- confirm. */
  char *NDataName;

  /* NOTE(review): the meaning of "simple" is not established here --
     confirm. */
  bool is_simple;
};
114
/* Record describing one parameter entity.
   NOTE(review): the field meanings below are read off the XML entity
   declaration vocabulary; ownership of the char buffers is not established
   here -- confirm against the code that fills this record. */
class PEntity_info {
public:
  /* NOTE(review): presumably the id assigned in the global
     parameter-entity table -- confirm. */
  int globalPEntity_id;

  /* True for an external entity, false for an internal one. */
  bool is_external;

  /* Replacement text of the entity. */
  char *ReplacementText;

  /* System literal and public-identifier literal of the declaration. */
  char *systemLiteral;
  char *pubidLiteral;
};
123
124/* The complete Attribute model for a given element is a vector of ATT_info
125   specifications for particular attribute names. */
126//typedef vector<ATT_info> ElementAttributeModel;
127
128
129
130class Entity_Info {
131       
132public: 
133        Entity_Info();
134        ~Entity_Info();
135
136        /*  Information computed by analyzing the 4-byte initial signature
137            of an XML document. */
138        int BOM_units; /* no of initial code units for a Byte Order Mark */
139
140        CodeUnit_Base code_unit_base;
141        CodeUnit_Size code_unit_size;
142        CodeUnit_ByteOrder byte_order; 
143
144        void AnalyzeSignature(unsigned char * signature);
145
146        /* Information computed from the XML or text declaration. */
147        XML_version version;
148        bool has_encoding_decl;
149        unsigned char * encoding;
150        XML_standalone standalone;
151       
152private:
153        void set_charset_family(CodeUnit_Base C, CodeUnit_Size S, CodeUnit_ByteOrder O, int B);
154};
155
156class Model_Info {
157       
158public: 
159        Model_Info();
160        ~Model_Info();
161   
162        /* Information computed from ATTLIST, ELEMENT, NOTATION and ENTITY declarations. */
163        hash_map<const char *, int, hash<const char *>, eqstr > GlobalAttributeTable;
164        hash_map<const char *, int, hash<const char *>, eqstr > GlobalElementTable;
165        hash_map<const char *, int, hash<const char *>, eqstr > GlobalNotationTable;
166        hash_map<const char *, int, hash<const char *>, eqstr > GlobalGEntityTable;
167        hash_map<const char *, int, hash<const char *>, eqstr > GlobalPEntityTable;
168        int globalElementCount;
169        int globalAttributeCount;
170        int globalNotationCount;
171        int globalGEntityCount;
172        int globalPEntityCount;
173       
174    /* For each element, we have an ElementAttributeModel */
175        vector<vector<ATT_info *> > ElementAttributeData;
176        int getOrInsertGlobalElement(unsigned char * elem_name, int lgth);
177        int getOrInsertGlobalAttName(unsigned char * att_name, int lgth);
178       
179        vector<GEntity_info *> GEntityData;
180        vector<PEntity_info *> PEntityData;
181
182};
183
184#endif /*XMLMODEL_H*/
Note: See TracBrowser for help on using the repository browser.