source: trunk/src/xmlmodel.h @ 137

Last change on this file since 137 was 137, checked in by lindanl, 11 years ago

ATT_info: emumValues using nameID.

File size: 6.5 KB
Line 
1/*  xmlmodel.h - XML Model Processor
2    Copyright (c) 2007, 2008 Robert D. Cameron
3    Licensed to the public under the Open Software License 3.0.
4    Licensed to International Characters, Inc., under the Academic
5    Free License 3.0.
6
7    The XML Model Processor gathers information that guides
8    interpretation of an XML document as it is processed.
9    This information arises from a variety of sources,
10    including:
11      (a) the document prolog, including
12          (a1) the encoding signature,
13          (a2) the XML declaration (or text declaration for
14               external entities), and
15          (a3) the Document Type Definition (internal and
16               external subsets).
17      FUTURE:
18      (b) XML Schema documents (and/or Relax NG, Schematron)
19      (c) XPath sets specifying information to retrieve.
20*/
21
22#ifndef XMLMODEL_H
23#define XMLMODEL_H
24#include <vector>
25#include <iostream>
26#include <string>
27#include <ext/hash_map>
28
29using namespace __gnu_cxx;
30using namespace std;
31
32#include "contentmodel.h"
33#include "symtab.h"
/* XML version governing a document.  Documents may be encoded in accord
   with either XML 1.0 or XML 1.1, or there may be no XML version declared
   ("no value" in the XML infoset parlance). */
enum XML_version {
        XML_1_0,                /* encoded in accord with XML 1.0 */
        XML_1_1,                /* encoded in accord with XML 1.1 */
        no_XML_version_value    /* no XML version declared */
};
38
/* Compatibility class of the code units of the underlying character set:
   either ASCII-compatible or EBCDIC-compatible.

   ASCII-compatibility means that the code units satisfy the following
   properties.
     (1) Any code unit whose numeric value is in the ASCII range
         (0 to 0x7F) is a complete character sequence (single code unit
         sequence) representing that ASCII character.
     (2) Any code units above the ASCII range are non-ASCII code units.
         No code units or code unit sequences containing a non-ASCII code
         unit may represent an ASCII character.  (This property ensures
         that non-ASCII code units may be ignored in making ASCII-based
         parsing decisions.)

   EBCDIC-compatible, for the purposes of XML, means that the following
   property applies.
     (*) Code units may form all or part of a code unit sequence
         representing a character in the Unicode range 0 to 0x9F if and
         only if that code unit has the same interpretation under the
         basic EBCDIC code page cp037.
*/
enum CodeUnit_Base {
        ASCII,
        EBCDIC
};
56
/* Width of one code unit.  Each enumerator's numeric value is 1, 2 or 4,
   matching the 8-bit, 16-bit and 32-bit code units described below. */
enum CodeUnit_Size {
        SingleByte = 1,   /* 8-bit code units: ASCII, EBCDIC, ISO-8859-X and UTF-8 */
        DoubleByte = 2,   /* 16-bit code units: the UTF-16 and UCS-2 families */
        QuadByte = 4      /* 32-bit code units: the UTF-32/UCS-4 family */
};
61
/* The byte order of 16-bit or 32-bit code units. */
enum CodeUnit_ByteOrder {
        /* UTF-16BE, UCS-2BE, UTF-16 or UCS-2 with a BigEndian byte order
           mark, UTF-16 without a byte order mark, UTF-32BE/UCS-4BE, or
           UTF-32/UCS-4 with a BigEndian byte order mark. */
        BigEndian,

        /* UTF-16LE, UCS-2LE, UTF-16 or UCS-2 with a LittleEndian byte
           order mark, UTF-32LE/UCS-4LE, or UTF-32/UCS-4 with a
           LittleEndian byte order mark. */
        LittleEndian,

        /* Unusual octet order of UTF-32/UCS-4 with byte order mark
           FE FF 00 00. */
        Unusual_3412,

        /* Unusual octet order of UTF-32/UCS-4 with byte order mark
           00 00 FF FE. */
        Unusual_2143
};
72
/* Possible values depending on the optional standalone component of an
   XML declaration. */
enum XML_standalone {
        Standalone_yes,
        Standalone_no,
        Standalone_no_value
};
76
77
78/* Attribute Modeling */
79
/* Possible attribute types as specified in ATTLIST declarations. */
enum ATT_type {
        CDATA_att,
        ID_att,
        IDREF_att,
        IDREFS_att,
        ENTITY_att,
        ENTITIES_att,
        NMTOKEN_att,
        NMTOKENS_att,
        NOTATION_att,
        enumeration_att
};
83
/* Possible kinds of attribute default in ATTLIST declarations. */
enum ATT_default_kind {
        REQUIRED_att,
        IMPLIED_att,
        FIXED_att,
        DEFAULT_att
};
86
87
88class ATT_info {
89public:
90        int globalATT_id;
91        ATT_type attType;
92        hash_map<int, int > enumValues; /* For NOTATION_att or enumeration_att.*/
93        ATT_default_kind defaultKind;
94        unsigned char * defaultValue;
95        int defaultValueLgth;
96};
97
98
/* Description of one general entity. */
class GEntity_info {
public:
        /* Put a freshly created record into a well-defined, all-zero state,
           so that `new GEntity_info` behaves like `new GEntity_info()`.
           Previously every member, including the four raw pointers, was
           indeterminate until the caller assigned it; an entity that lacks,
           say, an NDATA name would otherwise carry a garbage pointer. */
        GEntity_info()
                : globalGEntity_id(0),
                  is_external(false),
                  ReplacementText(0),
                  systemLiteral(0),
                  pubidLiteral(0),
                  NDataName(0),
                  is_simple(false) {}

        int globalGEntity_id;
        bool is_external;
        /* The string members below are null until set.  Ownership is not
           expressed by this type -- confirm against the code that fills
           the record in. */
        char * ReplacementText;
        char * systemLiteral;
        char * pubidLiteral;
        char * NDataName;
        /* NOTE(review): the exact criterion for a "simple" entity is not
           visible in this header. */
        bool is_simple;
};
110
/* Description of one parameter entity. */
class PEntity_info {
public:
        /* Put a freshly created record into a well-defined, all-zero state,
           so that `new PEntity_info` behaves like `new PEntity_info()`.
           Previously every member, including the three raw pointers, was
           indeterminate until the caller assigned it. */
        PEntity_info()
                : globalPEntity_id(0),
                  is_external(false),
                  ReplacementText(0),
                  systemLiteral(0),
                  pubidLiteral(0) {}

        int globalPEntity_id;
        bool is_external;
        /* The string members below are null until set.  Ownership is not
           expressed by this type -- confirm against the code that fills
           the record in. */
        char * ReplacementText;
        char * systemLiteral;
        char * pubidLiteral;
};
119
120/* The complete Attribute model for a given element is a vector of ATT_info
121   specifications for particular attribute names. */
122//typedef vector<ATT_info> ElementAttributeModel;
123
124
/* Description of one notation: its system and public identifier literals. */
class Notation_info {
public:
        /* Start with both literals null, so that `new Notation_info`
           behaves like `new Notation_info()`.  Previously both pointers
           were indeterminate until assigned, so a notation lacking one of
           the two literals would carry a garbage pointer. */
        Notation_info()
                : systemLiteral(0),
                  pubidLiteral(0) {}

        /* Null until set.  Ownership is not expressed by this type --
           confirm against the code that fills the record in. */
        char * systemLiteral;
        char * pubidLiteral;
};
130
131
132class Entity_Info {
133       
134public: 
135        Entity_Info();
136        ~Entity_Info();
137
138        /*  Information computed by analyzing the 4-byte initial signature
139            of an XML document. */
140        int BOM_units; /* no of initial code units for a Byte Order Mark */
141
142        CodeUnit_Base code_unit_base;
143        CodeUnit_Size code_unit_size;
144        CodeUnit_ByteOrder byte_order; 
145
146        void AnalyzeSignature(unsigned char * signature);
147
148        /* Information computed from the XML or text declaration. */
149        XML_version version;
150        bool has_encoding_decl;
151        unsigned char * encoding;
152        XML_standalone standalone;
153       
154private:
155        void set_charset_family(CodeUnit_Base C, CodeUnit_Size S, CodeUnit_ByteOrder O, int B);
156};
157
158class Model_Info {
159       
160public: 
161        Model_Info();
162        ~Model_Info();
163        bool has_external_DTD;
164        char * external_DTD_systemLiteral;
165        char * external_DTD_pubidLiteral;       
166        Symbol_Table * symbol_table;
167   
168        /* Information computed from ATTLIST, ELEMENT, NOTATION and ENTITY declarations. */
169
170        hash_map<int, int > GlobalAttributeTable;
171        hash_map<int, int > GlobalElementTable;
172        hash_map<int, int > GlobalNotationTable;
173        hash_map<int, int > GlobalGEntityTable;
174        hash_map<int, int > GlobalPEntityTable;
175       
176       
177        int globalElementCount;
178        int globalAttributeCount;
179        int globalNotationCount;
180        int globalGEntityCount;
181        int globalPEntityCount;
182    /* For each element, we have an ElementAttributeModel */
183        vector<vector<ATT_info *> > ElementAttributeData;
184        int getOrInsertGlobalElement(int elem_nameID);
185        int getOrInsertGlobalAttName(int att_nameID);
186        // rootModel is a content model for the document root, consisting
187        // of a single occurrence of the element named in the DOCTYPE declaration.
188        CM_RegExp * rootModel;
189//      vector<ContentModel *> ContentModelData;
190        hash_map<int, ContentModel * > ContentModelData;
191       
192       
193        vector<GEntity_info *> GEntityData;
194        vector<PEntity_info *> PEntityData;
195        vector<Notation_info *> NotationData;
196       
197        void SimpleEntity(char * entity_Name, char * replText);
198};
199
200#endif /*XMLMODEL_H*/
Note: See TracBrowser for help on using the repository browser.