source: trunk/src/xmlmodel.h @ 102

Last change on this file since 102 was 102, checked in by cameron, 11 years ago

Initialize predefined entities lt, gt, amp, apos, quot.

File size: 6.3 KB
Line 
1/*  xmlmodel.h - XML Model Processor
2    Copyright (c) 2007, 2008 Robert D. Cameron
3    Licensed to the public under the Open Software License 3.0.
4    Licensed to International Characters, Inc., under the Academic
5    Free License 3.0.
6
7    The XML Model Processor gathers information that guides
8    interpretation of an XML document as it is processed.
9    This information arises from a variety of sources,
10    including:
11      (a) the document prolog, including
12          (a1) the encoding signature,
13          (a2) the XML declaration (or text declaration for
14               external entities), and
15          (a3) the Document Type Definition (internal and
16               external subsets).
17      FUTURE:
18      (b) XML Schema documents (and/or Relax NG, Schematron)
19      (c) XPath sets specifying information to retrieve.
20*/
21
22#ifndef XMLMODEL_H
23#define XMLMODEL_H
#include <cstring>
#include <iostream>
#include <string>
#include <vector>
#include <ext/hash_map>
28
29using namespace __gnu_cxx;
30using namespace std;
31
enum XML_version {
    XML_1_0,
    XML_1_1,
    no_XML_version_value
};
/* The XML version a document is encoded against: XML 1.0 or XML 1.1.
   no_XML_version_value covers the case where no version is declared
   ("no value" in XML infoset parlance). */
36
enum CodeUnit_Base {
    ASCII,
    EBCDIC
};
/* The code units of the underlying character set are either
   ASCII-compatible or EBCDIC-compatible.

   ASCII-compatible means that every code unit satisfies both properties below.
     (1) A code unit whose numeric value lies in the ASCII range (0 to 0x7F)
         is a complete character sequence (a single code unit sequence)
         representing that ASCII character.
     (2) A code unit above the ASCII range is a non-ASCII code unit.
         No code unit or code unit sequence containing a non-ASCII code unit
         may represent an ASCII character.  (This property ensures that
         non-ASCII code units may be ignored when making ASCII-based
         parsing decisions.)

   EBCDIC-compatible, for the purposes of XML, means that the following
   property applies.
     (*) A code unit may form all or part of a code unit sequence
         representing a character in the Unicode range 0 to 0x9F if and only
         if that code unit has the same interpretation under the basic
         EBCDIC code page cp037.
*/
54
enum CodeUnit_Size {
    SingleByte = 1,
    DoubleByte = 2,
    QuadByte   = 4
};
/* Width of a code unit; each enumerator's value is that width in bytes.
   SingleByte: ASCII, EBCDIC, ISO-8859-X and UTF-8 (8-bit code units).
   DoubleByte: the UTF-16 and UCS-2 families (16-bit code units).
   QuadByte:   the UTF-32/UCS-4 family (32-bit code units). */
59
enum CodeUnit_ByteOrder {
    BigEndian,
    LittleEndian,
    Unusual_3412,
    Unusual_2143
};
/* Byte order of 16-bit or 32-bit code units.
   BigEndian:    UTF-16BE, UCS-2BE, UTF-16 or UCS-2 with a big-endian byte
                 order mark, UTF-16 without a byte order mark,
                 UTF-32BE/UCS-4BE, or UTF-32/UCS-4 with a big-endian byte
                 order mark.
   LittleEndian: UTF-16LE, UCS-2LE, UTF-16 or UCS-2 with a little-endian
                 byte order mark, UTF-32LE/UCS-4LE, or UTF-32/UCS-4 with a
                 little-endian byte order mark.
   Unusual_3412: unusual octet order of UTF-32/UCS-4, signalled by the byte
                 order mark FE FF 00 00.
   Unusual_2143: unusual octet order of UTF-32/UCS-4, signalled by the byte
                 order mark 00 00 FF FE.
*/
70
enum XML_standalone {
    Standalone_yes,
    Standalone_no,
    Standalone_no_value
};
/* Outcome of the optional standalone component of an XML declaration:
   declared "yes", declared "no", or not present at all. */
74
75
76/* Attribute Modeling */
77
enum ATT_type {
    CDATA_att,
    ID_att,
    IDREF_att,
    IDREFS_att,
    ENTITY_att,
    ENTITIES_att,
    NMTOKEN_att,
    NMTOKENS_att,
    NOTATION_att,
    enumeration_att
};
/* The attribute types that an ATTLIST declaration can specify. */
81
enum ATT_default_kind {
    REQUIRED_att,
    IMPLIED_att,
    FIXED_att,
    DEFAULT_att
};
/* The kinds of attribute default that an ATTLIST declaration can specify. */
84
/* Equality predicate for C-string keys: two keys are equal when their
   NUL-terminated contents match, not merely when the pointers match.
   Used as the key-equality argument of the hash_map tables in this header. */
struct eqstr
{
  /* Returns true iff s1 and s2 denote the same string.
     - Identical pointers (including two null pointers) are equal; this also
       avoids scanning the string when a key is compared with itself.
     - A null pointer never equals a non-null string; strcmp must not be
       handed a null pointer.
     - Otherwise the contents are compared with std::strcmp. */
  bool operator()(const char* s1, const char* s2) const
  {
    if (s1 == s2) return true;
    if (s1 == 0 || s2 == 0) return false;
    return std::strcmp(s1, s2) == 0;
  }
};
92
93class ATT_info {
94public:
95        int globalATT_id;
96        ATT_type attType;
97        hash_map<const char *, int, hash<const char *>, eqstr > enumValues; /* For NOTATION_att or enumeration_att.*/
98        ATT_default_kind defaultKind;
99        unsigned char * defaultValue;
100        int defaultValueLgth;
101};
102
103
/* Record for one general entity.  The meaning of the individual fields is
   inferred from their names; the code that fills them in is not part of
   this header. */
class GEntity_info {
public:
        /* Start every record in a defined state: id 0, both flags false and
           all four string pointers null.  Without this constructor
           `new GEntity_info` leaves every member indeterminate, so a field
           the caller forgets to set would be read as garbage. */
        GEntity_info()
          : globalGEntity_id(0),
            is_external(false),
            ReplacementText(0),
            systemLiteral(0),
            pubidLiteral(0),
            NDataName(0),
            is_simple(false) {}

        int globalGEntity_id;     /* presumably the id recorded in the global general-entity table -- confirm */
        bool is_external;
        char * ReplacementText;   /* null until set */
        char * systemLiteral;     /* null until set */
        char * pubidLiteral;      /* null until set */
        char * NDataName;         /* null until set */
        bool is_simple;           /* NOTE(review): presumably set via Model_Info::SimpleEntity -- confirm */
};
115
/* Record for one parameter entity.  The meaning of the individual fields is
   inferred from their names; the code that fills them in is not part of
   this header. */
class PEntity_info {
public:
        /* Start every record in a defined state: id 0, not external, and all
           three string pointers null.  Without this constructor
           `new PEntity_info` leaves every member indeterminate. */
        PEntity_info()
          : globalPEntity_id(0),
            is_external(false),
            ReplacementText(0),
            systemLiteral(0),
            pubidLiteral(0) {}

        int globalPEntity_id;     /* presumably the id recorded in the global parameter-entity table -- confirm */
        bool is_external;
        char * ReplacementText;   /* null until set */
        char * systemLiteral;     /* null until set */
        char * pubidLiteral;      /* null until set */
};
124
125/* The complete Attribute model for a given element is a vector of ATT_info
126   specifications for particular attribute names. */
127//typedef vector<ATT_info> ElementAttributeModel;
128
129
130
131class Entity_Info {
132       
133public: 
134        Entity_Info();
135        ~Entity_Info();
136
137        /*  Information computed by analyzing the 4-byte initial signature
138            of an XML document. */
139        int BOM_units; /* no of initial code units for a Byte Order Mark */
140
141        CodeUnit_Base code_unit_base;
142        CodeUnit_Size code_unit_size;
143        CodeUnit_ByteOrder byte_order; 
144
145        void AnalyzeSignature(unsigned char * signature);
146
147        /* Information computed from the XML or text declaration. */
148        XML_version version;
149        bool has_encoding_decl;
150        unsigned char * encoding;
151        XML_standalone standalone;
152       
153private:
154        void set_charset_family(CodeUnit_Base C, CodeUnit_Size S, CodeUnit_ByteOrder O, int B);
155};
156
157class Model_Info {
158       
159public: 
160        Model_Info();
161        ~Model_Info();
162   
163        /* Information computed from ATTLIST, ELEMENT, NOTATION and ENTITY declarations. */
164        hash_map<const char *, int, hash<const char *>, eqstr > GlobalAttributeTable;
165        hash_map<const char *, int, hash<const char *>, eqstr > GlobalElementTable;
166        hash_map<const char *, int, hash<const char *>, eqstr > GlobalNotationTable;
167        hash_map<const char *, int, hash<const char *>, eqstr > GlobalGEntityTable;
168        hash_map<const char *, int, hash<const char *>, eqstr > GlobalPEntityTable;
169        int globalElementCount;
170        int globalAttributeCount;
171        int globalNotationCount;
172        int globalGEntityCount;
173        int globalPEntityCount;
174       
175    /* For each element, we have an ElementAttributeModel */
176        vector<vector<ATT_info *> > ElementAttributeData;
177        int getOrInsertGlobalElement(unsigned char * elem_name, int lgth);
178        int getOrInsertGlobalAttName(unsigned char * att_name, int lgth);
179       
180        vector<GEntity_info *> GEntityData;
181        vector<PEntity_info *> PEntityData;
182        void SimpleEntity(char * entity_Name, char * replText);
183};
184
185#endif /*XMLMODEL_H*/
Note: See TracBrowser for help on using the repository browser.