source: trunk/src/xmlmodel.h @ 106

Last change on this file since 106 was 106, checked in by lindanl, 11 years ago

Content Models.

File size: 6.4 KB
Line 
1/*  xmlmodel.h - XML Model Processor
2    Copyright (c) 2007, 2008 Robert D. Cameron
3    Licensed to the public under the Open Software License 3.0.
4    Licensed to International Characters, Inc., under the Academic
5    Free License 3.0.
6
7    The XML Model Processor gathers information that guides
8    interpretation of an XML document as it is processed.
9    This information arises from a variety of sources,
10    including:
11      (a) the document prolog, including
12          (a1) the encoding signature,
13          (a2) the XML declaration (or text declaration for
14               external entities), and
15          (a3) the Document Type Definition (internal and
16               external subsets).
17      FUTURE:
18      (b) XML Schema documents (and/or Relax NG, Schematron)
19      (c) XPath sets specifying information to retrieve.
20*/
21
22#ifndef XMLMODEL_H
23#define XMLMODEL_H
#include <cstring>
#include <iostream>
#include <string>
#include <vector>
#include <ext/hash_map>
28
29using namespace __gnu_cxx;
30using namespace std;
31
32#include "contentmodel.h"
/* The XML version a document is encoded in accord with: XML 1.0,
   XML 1.1, or no declared version at all ("no value" in the XML
   infoset parlance). */
enum XML_version {
        XML_1_0,
        XML_1_1,
        no_XML_version_value
};
37
/* Family of the underlying character set's code units: either
   ASCII-compatible or EBCDIC-compatible.

   ASCII-compatible means the code units satisfy both properties below.
     (1) Any code unit whose numeric value is in the ASCII range
         (0 to 0x7F) is a complete character sequence (a single code
         unit sequence) representing that ASCII character.
     (2) Any code unit above the ASCII range is a non-ASCII code unit.
         No code unit or code unit sequence containing a non-ASCII code
         unit may represent an ASCII character.  (This ensures that
         non-ASCII code units may be ignored when making ASCII-based
         parsing decisions.)

   EBCDIC-compatible, for the purposes of XML, means the property below.
     (*) A code unit may form all or part of a code unit sequence
         representing a character in the Unicode range 0 to 0x9F if and
         only if that code unit has the same interpretation under the
         basic EBCDIC code page cp037. */
enum CodeUnit_Base {
        ASCII,
        EBCDIC
};
55
/* Width of one code unit; each enumerator's value is the unit size in bytes.
     SingleByte: ASCII, EBCDIC, ISO-8859-X and UTF-8 (8-bit code units).
     DoubleByte: the UTF-16 and UCS-2 families (16-bit code units).
     QuadByte:   the UTF-32/UCS-4 family (32-bit code units). */
enum CodeUnit_Size {
        SingleByte = 1,
        DoubleByte = 2,
        QuadByte = 4
};
60
/* Byte order of 16-bit or 32-bit code units.
     BigEndian:    UTF-16BE, UCS-2BE, UTF-16 or UCS-2 with a BigEndian
                   byte order mark, UTF-16 without a byte order mark,
                   UTF-32BE/UCS-4BE, or UTF-32/UCS-4 with a BigEndian
                   byte order mark.
     LittleEndian: UTF-16LE, UCS-2LE, UTF-16 or UCS-2 with a LittleEndian
                   byte order mark, UTF-32LE/UCS-4LE, or UTF-32/UCS-4 with
                   a LittleEndian byte order mark.
     Unusual_3412: unusual octet order of UTF-32/UCS-4 with byte order
                   mark FE FF 00 00.
     Unusual_2143: unusual octet order of UTF-32/UCS-4 with byte order
                   mark 00 00 FF FE. */
enum CodeUnit_ByteOrder {
        BigEndian,
        LittleEndian,
        Unusual_3412,
        Unusual_2143
};
71
/* Outcome of the optional standalone component of an XML declaration:
   declared "yes", declared "no", or not present (no value). */
enum XML_standalone {
        Standalone_yes,
        Standalone_no,
        Standalone_no_value
};
75
76
77/* Attribute Modeling */
78
/* Attribute types that may be specified in ATTLIST declarations. */
enum ATT_type {
        CDATA_att,
        ID_att,
        IDREF_att,
        IDREFS_att,
        ENTITY_att,
        ENTITIES_att,
        NMTOKEN_att,
        NMTOKENS_att,
        NOTATION_att,
        enumeration_att
};
82
/* Kinds of attribute default that may appear in ATTLIST declarations. */
enum ATT_default_kind {
        REQUIRED_att,
        IMPLIED_att,
        FIXED_att,
        DEFAULT_att
};
85
/* Equality predicate over NUL-terminated C strings.  It is the key-equality
   functor of the hash_map name tables declared in this file, so that two
   keys compare equal when their characters match rather than only when
   the pointers are identical.  Both arguments must be non-null. */
struct eqstr
{
  bool operator()(const char* s1, const char* s2) const
  {
    /* strcmp is declared by <cstring>; call it qualified so the lookup does
       not depend on some other header happening to declare it globally. */
    return std::strcmp(s1, s2) == 0;
  }
};
93
94class ATT_info {
95public:
96        int globalATT_id;
97        ATT_type attType;
98        hash_map<const char *, int, hash<const char *>, eqstr > enumValues; /* For NOTATION_att or enumeration_att.*/
99        ATT_default_kind defaultKind;
100        unsigned char * defaultValue;
101        int defaultValueLgth;
102};
103
104
/* Record describing one general entity.
   NOTE(review): the meanings given below are inferred from the member names;
   confirm against the code that populates these records.  No constructor is
   declared, so members are not initialised by the class itself. */
class GEntity_info {
public:
        /* Identifier of this general entity. */
        int globalGEntity_id;

        /* Whether the entity is external. */
        bool is_external;

        /* Replacement text of the entity. */
        char* ReplacementText;

        /* System and public identifier literals. */
        char* systemLiteral;
        char* pubidLiteral;

        /* Presumably the notation name of an NDATA declaration - confirm. */
        char* NDataName;

        /* Flag marking a "simple" entity; the criterion is not defined in
           this header. */
        bool is_simple;
};
116
/* Record describing one parameter entity.
   NOTE(review): the meanings given below are inferred from the member names;
   confirm against the code that populates these records.  No constructor is
   declared, so members are not initialised by the class itself. */
class PEntity_info {
public:
        /* Identifier of this parameter entity. */
        int globalPEntity_id;

        /* Whether the entity is external. */
        bool is_external;

        /* Replacement text of the entity. */
        char* ReplacementText;

        /* System and public identifier literals. */
        char* systemLiteral;
        char* pubidLiteral;
};
125
126/* The complete Attribute model for a given element is a vector of ATT_info
127   specifications for particular attribute names. */
128//typedef vector<ATT_info> ElementAttributeModel;
129
130
131
132class Entity_Info {
133       
134public: 
135        Entity_Info();
136        ~Entity_Info();
137
138        /*  Information computed by analyzing the 4-byte initial signature
139            of an XML document. */
140        int BOM_units; /* no of initial code units for a Byte Order Mark */
141
142        CodeUnit_Base code_unit_base;
143        CodeUnit_Size code_unit_size;
144        CodeUnit_ByteOrder byte_order; 
145
146        void AnalyzeSignature(unsigned char * signature);
147
148        /* Information computed from the XML or text declaration. */
149        XML_version version;
150        bool has_encoding_decl;
151        unsigned char * encoding;
152        XML_standalone standalone;
153       
154private:
155        void set_charset_family(CodeUnit_Base C, CodeUnit_Size S, CodeUnit_ByteOrder O, int B);
156};
157
158class Model_Info {
159       
160public: 
161        Model_Info();
162        ~Model_Info();
163   
164        /* Information computed from ATTLIST, ELEMENT, NOTATION and ENTITY declarations. */
165        hash_map<const char *, int, hash<const char *>, eqstr > GlobalAttributeTable;
166        hash_map<const char *, int, hash<const char *>, eqstr > GlobalElementTable;
167        hash_map<const char *, int, hash<const char *>, eqstr > GlobalNotationTable;
168        hash_map<const char *, int, hash<const char *>, eqstr > GlobalGEntityTable;
169        hash_map<const char *, int, hash<const char *>, eqstr > GlobalPEntityTable;
170        int globalElementCount;
171        int globalAttributeCount;
172        int globalNotationCount;
173        int globalGEntityCount;
174        int globalPEntityCount;
175       
176    /* For each element, we have an ElementAttributeModel */
177        vector<vector<ATT_info *> > ElementAttributeData;
178        int getOrInsertGlobalElement(unsigned char * elem_name, int lgth);
179        int getOrInsertGlobalAttName(unsigned char * att_name, int lgth);
180        vector<ContentModel *> ContentModelData;
181       
182       
183        vector<GEntity_info *> GEntityData;
184        vector<PEntity_info *> PEntityData;
185        void SimpleEntity(char * entity_Name, char * replText);
186};
187
188#endif /*XMLMODEL_H*/
Note: See TracBrowser for help on using the repository browser.