source: trunk/src/xmlmodel.h @ 115

Last change on this file since 115 was 115, checked in by lindanl, 11 years ago

various error checks in parsing

File size: 6.8 KB
Line 
1/*  xmlmodel.h - XML Model Processor
2    Copyright (c) 2007, 2008 Robert D. Cameron
3    Licensed to the public under the Open Software License 3.0.
4    Licensed to International Characters, Inc., under the Academic
5    Free License 3.0.
6
7    The XML Model Processor gathers information that guides
8    interpretation of an XML document as it is processed.
9    This information arises from a variety of sources,
10    including:
11      (a) the document prolog, including
12          (a1) the encoding signature,
13          (a2) the XML declaration (or text declaration for
14               external entities), and
15          (a3) the Document Type Definition (internal and
16               external subsets).
17      FUTURE:
18      (b) XML Schema documents (and/or Relax NG, Schematron)
19      (c) XPath sets specifying information to retrieve.
20*/
21
22#ifndef XMLMODEL_H
23#define XMLMODEL_H
#include <cstring>
#include <iostream>
#include <string>
#include <vector>
#include <ext/hash_map>
28
29using namespace __gnu_cxx;
30using namespace std;
31
32#include "contentmodel.h"
/* The XML version a document is encoded in accord with: XML 1.0,
   XML 1.1, or no declared version ("no value" in the XML infoset
   parlance). */
enum XML_version {
        XML_1_0,
        XML_1_1,
        no_XML_version_value
};
37
/* The family of the underlying character set: its code units are either
   ASCII-compatible or EBCDIC-compatible.

   ASCII-compatible means that code units satisfy the following properties.
     (1) Any code unit whose numeric value is in the ASCII range (0 to 0x7F)
         is a complete character sequence (a single code unit sequence)
         representing that ASCII character.
     (2) Any code unit above the ASCII range is a non-ASCII code unit.
         No code unit or code unit sequence containing a non-ASCII code unit
         may represent an ASCII character.  (This property ensures that
         non-ASCII code units may be ignored in making ASCII-based parsing
         decisions.)

   EBCDIC-compatible, for the purposes of XML, means that the following
   property applies.
     (*) A code unit may form all or part of a code unit sequence
         representing a character in the Unicode range 0 to 0x9F if and only
         if that code unit has the same interpretation under the basic
         EBCDIC code page cp037.
*/
enum CodeUnit_Base {
        ASCII,
        EBCDIC
};
55
/* Width of one code unit; each enumerator's value is that width in bytes.
   SingleByte: ASCII, EBCDIC, ISO-8859-X and UTF-8 (8-bit code units).
   DoubleByte: the UTF-16 and UCS-2 families (16-bit code units).
   QuadByte:   the UTF-32/UCS-4 family (32-bit code units). */
enum CodeUnit_Size {
        SingleByte = 1,
        DoubleByte = 2,
        QuadByte   = 4
};
60
/* The byte order of 16-bit or 32-bit code units.  The possibilities are:
   BigEndian:    UTF-16BE, UCS-2BE, UTF-16 or UCS-2 with a BigEndian byte
                 order mark, UTF-16 without a byte order mark,
                 UTF-32BE/UCS-4BE, or UTF-32/UCS-4 with a BigEndian byte
                 order mark.
   LittleEndian: UTF-16LE, UCS-2LE, UTF-16 or UCS-2 with a LittleEndian byte
                 order mark, UTF-32LE/UCS-4LE, or UTF-32/UCS-4 with a
                 LittleEndian byte order mark.
   Unusual_3412: Unusual octet order of UTF-32/UCS-4 with byte order mark
                 FE FF 00 00.
   Unusual_2143: Unusual octet order of UTF-32/UCS-4 with byte order mark
                 00 00 FF FE.
*/
enum CodeUnit_ByteOrder {
        BigEndian,
        LittleEndian,
        Unusual_3412,
        Unusual_2143
};
71
/* Possible values of the optional standalone component of an XML
   declaration: declared "yes", declared "no", or not present. */
enum XML_standalone {
        Standalone_yes,
        Standalone_no,
        Standalone_no_value
};
75
76
77/* Attribute Modeling */
78
/* Possible attribute types as specified in ATTLIST declarations. */
enum ATT_type {
        CDATA_att,
        ID_att,
        IDREF_att,
        IDREFS_att,
        ENTITY_att,
        ENTITIES_att,
        NMTOKEN_att,
        NMTOKENS_att,
        NOTATION_att,
        enumeration_att
};
82
/* Possible kinds of attribute default in ATTLIST declarations. */
enum ATT_default_kind {
        REQUIRED_att,
        IMPLIED_att,
        FIXED_att,
        DEFAULT_att
};
85
/* Equality predicate for NUL-terminated C strings.  It is supplied as the
   key-equality functor of the hash_map tables in this header so that
   const char * keys are compared by content rather than by pointer value.
   Both arguments must be non-null, NUL-terminated strings.

   std::strcmp is declared in <cstring>, which this header includes
   directly rather than relying on another header to pull it in. */
struct eqstr
{
  bool operator()(const char* s1, const char* s2) const
  {
    return std::strcmp(s1, s2) == 0;
  }
};
93
94class ATT_info {
95public:
96        int globalATT_id;
97        ATT_type attType;
98        hash_map<const char *, int, hash<const char *>, eqstr > enumValues; /* For NOTATION_att or enumeration_att.*/
99        ATT_default_kind defaultKind;
100        unsigned char * defaultValue;
101        int defaultValueLgth;
102};
103
104
/* Record describing one general entity.  It is a plain data holder: no
   constructor or destructor is declared, so members are left uninitialized
   by default-initialization and the pointers are not released by this
   class. */
class GEntity_info {
public:
        /* NOTE(review): presumably the id registered in
           Model_Info::GlobalGEntityTable -- confirm. */
        int globalGEntity_id;

        /* True for an external entity. */
        bool is_external;

        /* NOTE(review): presumably the replacement text, and the system and
           public identifier literals and NDATA notation name from the
           declaration; ownership is not visible here -- confirm. */
        char *ReplacementText;
        char *systemLiteral;
        char *pubidLiteral;
        char *NDataName;

        /* NOTE(review): meaning of "simple" is not defined in this header;
           see Model_Info::SimpleEntity -- confirm. */
        bool is_simple;
};
116
/* Record describing one parameter entity.  It is a plain data holder: no
   constructor or destructor is declared, so members are left uninitialized
   by default-initialization and the pointers are not released by this
   class. */
class PEntity_info {
public:
        /* NOTE(review): presumably the id registered in
           Model_Info::GlobalPEntityTable -- confirm. */
        int globalPEntity_id;

        /* True for an external entity. */
        bool is_external;

        /* NOTE(review): presumably the replacement text and the system and
           public identifier literals from the declaration; ownership is
           not visible here -- confirm. */
        char *ReplacementText;
        char *systemLiteral;
        char *pubidLiteral;
};
125
126/* The complete Attribute model for a given element is a vector of ATT_info
127   specifications for particular attribute names. */
128//typedef vector<ATT_info> ElementAttributeModel;
129
130
/* Record describing one notation: its system and public identifier
   literals.  It is a plain data holder with no constructor or destructor;
   NOTE(review): ownership of the pointed-to strings is not visible here --
   confirm. */
class Notation_info {
public:
        char *systemLiteral;
        char *pubidLiteral;
};
136
137
138class Entity_Info {
139       
140public: 
141        Entity_Info();
142        ~Entity_Info();
143
144        /*  Information computed by analyzing the 4-byte initial signature
145            of an XML document. */
146        int BOM_units; /* no of initial code units for a Byte Order Mark */
147
148        CodeUnit_Base code_unit_base;
149        CodeUnit_Size code_unit_size;
150        CodeUnit_ByteOrder byte_order; 
151
152        void AnalyzeSignature(unsigned char * signature);
153
154        /* Information computed from the XML or text declaration. */
155        XML_version version;
156        bool has_encoding_decl;
157        unsigned char * encoding;
158        XML_standalone standalone;
159       
160private:
161        void set_charset_family(CodeUnit_Base C, CodeUnit_Size S, CodeUnit_ByteOrder O, int B);
162};
163
164class Model_Info {
165       
166public: 
167        Model_Info();
168        ~Model_Info();
169        bool has_external_DTD;
170        char * external_DTD_systemLiteral;
171        char * external_DTD_pubidLiteral;       
172
173   
174        /* Information computed from ATTLIST, ELEMENT, NOTATION and ENTITY declarations. */
175        hash_map<const char *, int, hash<const char *>, eqstr > GlobalAttributeTable;
176        hash_map<const char *, int, hash<const char *>, eqstr > GlobalElementTable;
177        hash_map<const char *, int, hash<const char *>, eqstr > GlobalNotationTable;
178        hash_map<const char *, int, hash<const char *>, eqstr > GlobalGEntityTable;
179        hash_map<const char *, int, hash<const char *>, eqstr > GlobalPEntityTable;
180        int globalElementCount;
181        int globalAttributeCount;
182        int globalNotationCount;
183        int globalGEntityCount;
184        int globalPEntityCount;
185       
186    /* For each element, we have an ElementAttributeModel */
187        vector<vector<ATT_info *> > ElementAttributeData;
188        int getOrInsertGlobalElement(unsigned char * elem_name, int lgth);
189        int getOrInsertGlobalAttName(unsigned char * att_name, int lgth);
190        // rootModel is a content model for the document root, consisting
191        // of a single occurrence of the element named in the DOCTYPE declaration.
192        CM_RegExp * rootModel;
193        vector<ContentModel *> ContentModelData;
194       
195       
196        vector<GEntity_info *> GEntityData;
197        vector<PEntity_info *> PEntityData;
198        vector<Notation_info *> NotationData;
199       
200        void SimpleEntity(char * entity_Name, char * replText);
201};
202
203#endif /*XMLMODEL_H*/
Note: See TracBrowser for help on using the repository browser.