source: trunk/src/xmlmodel.h @ 91

Last change on this file since 91 was 91, checked in by cameron, 11 years ago

ATTLIST semantics

File size: 5.4 KB
Line 
1/*  xmlmodel.h - XML Model Processor
2    Copyright (c) 2007, 2008 Robert D. Cameron
3    Licensed to the public under the Open Software License 3.0.
4    Licensed to International Characters, Inc., under the Academic
5    Free License 3.0.
6
7    The XML Model Processor gathers information that guides
8    interpretation of an XML document as it is processed.
9    This information arises from a variety of sources,
10    including:
11      (a) the document prolog, including
12          (a1) the encoding signature,
13          (a2) the XML declaration (or text declaration for
14               external entities), and
15          (a3) the Document Type Definition (internal and
16               external subsets).
17      FUTURE:
18      (b) XML Schema documents (and/or Relax NG, Schematron)
19      (c) XPath sets specifying information to retrieve.
20*/
21
22#ifndef XMLMODEL_H
23#define XMLMODEL_H
#include <cstring>
#include <iostream>
#include <string>
#include <vector>
#include <ext/hash_map>
28
29using namespace __gnu_cxx;
30using namespace std;
31
enum XML_version {
    XML_1_0,              /* encoded in accord with XML 1.0 */
    XML_1_1,              /* encoded in accord with XML 1.1 */
    no_XML_version_value  /* no XML version declared */
};
/* The XML version governing a document: either 1.0 or 1.1, or no
   version declared at all ("no value" in XML infoset parlance). */
36
enum CodeUnit_Base {
    ASCII,   /* code units are ASCII-compatible */
    EBCDIC   /* code units are EBCDIC-compatible */
};
/* The family of the underlying character set, judged by its code units.

   ASCII-compatible means that both of the following properties hold.
     (1) A code unit whose numeric value lies in the ASCII range
         (0 to 0x7F) is by itself a complete, single-unit character
         sequence representing that ASCII character.
     (2) Every code unit above the ASCII range is a non-ASCII code unit,
         and no code unit or code unit sequence containing a non-ASCII
         code unit may represent an ASCII character.  (Consequently
         non-ASCII code units may be ignored when making ASCII-based
         parsing decisions.)

   EBCDIC-compatible, for the purposes of XML, means that the following
   property holds.
     (*) A code unit may form all or part of a code unit sequence
         representing a character in the Unicode range 0 to 0x9F if and
         only if that code unit has the same interpretation under the
         basic EBCDIC code page cp037.
*/
54
enum CodeUnit_Size {
    SingleByte = 1,  /* 8-bit code units  */
    DoubleByte = 2,  /* 16-bit code units */
    QuadByte   = 4   /* 32-bit code units */
};
/* Width of one code unit; each enumerator's value is the width in bytes.
   SingleByte: ASCII, EBCDIC, ISO-8859-X and UTF-8.
   DoubleByte: the UTF-16 and UCS-2 families.
   QuadByte:   the UTF-32/UCS-4 family. */
59
enum CodeUnit_ByteOrder {
    BigEndian,
    LittleEndian,
    Unusual_3412,
    Unusual_2143
};
/* Octet ordering of 16-bit or 32-bit code units.
   BigEndian:    UTF-16BE, UCS-2BE, UTF-16 or UCS-2 with a big-endian byte
                 order mark, UTF-16 without a byte order mark,
                 UTF-32BE/UCS-4BE, or UTF-32/UCS-4 with a big-endian byte
                 order mark.
   LittleEndian: UTF-16LE, UCS-2LE, UTF-16 or UCS-2 with a little-endian
                 byte order mark, UTF-32LE/UCS-4LE, or UTF-32/UCS-4 with a
                 little-endian byte order mark.
   Unusual_3412: unusual octet order of UTF-32/UCS-4, signalled by the
                 byte order mark FE FF 00 00.
   Unusual_2143: unusual octet order of UTF-32/UCS-4, signalled by the
                 byte order mark 00 00 FF FE.
*/
70
enum XML_standalone {
    Standalone_yes,
    Standalone_no,
    Standalone_no_value
};
/* The state of the optional standalone component of an XML declaration;
   one enumerator for each possible value. */
74
75
76/* Attribute Modeling */
77
enum ATT_type {
    CDATA_att,
    ID_att,
    IDREF_att,
    IDREFS_att,
    ENTITY_att,
    ENTITIES_att,
    NMTOKEN_att,
    NMTOKENS_att,
    NOTATION_att,
    enumeration_att
};
/* The attribute types that an ATTLIST declaration may specify. */
81
enum ATT_default_kind {
    REQUIRED_att,
    IMPLIED_att,
    FIXED_att,
    DEFAULT_att
};
/* The kinds of attribute default that an ATTLIST declaration may specify. */
84
85class ATT_info {
86public:
87        int globalATT_id;
88        ATT_type attType;
89        vector<string> enumValues; /* For NOTATION_att or enumeration_att.*/
90        ATT_default_kind defaultKind;
91        unsigned char * defaultValue;
92        int defaultValueLgth;
93};
94
95/* The complete Attribute model for a given element is a vector of ATT_info
96   specifications for particular attribute names. */
97//typedef vector<ATT_info> ElementAttributeModel;
98
/* Equality predicate for NUL-terminated C strings.  It is the key-equality
   functor of the hash tables keyed by const char * in Model_Info, so that
   keys are compared by content rather than by pointer identity.
   Both arguments must be non-null, NUL-terminated strings. */
struct eqstr
{
  /* Returns true when s1 and s2 hold the same character sequence. */
  bool operator()(const char* s1, const char* s2) const
  {
    /* Qualified call: requires <cstring>, which this header now includes
       explicitly instead of relying on a transitive include. */
    return std::strcmp(s1, s2) == 0;
  }
};
106
107class Model_Info {
108       
109public: 
110        Model_Info();
111        ~Model_Info();
112
113        /*  Information computed by analyzing the 4-byte initial signature
114            of an XML document. */
115        int BOM_units; /* no of initial code units for a Byte Order Mark */
116
117        CodeUnit_Base code_unit_base;
118        CodeUnit_Size code_unit_size;
119        CodeUnit_ByteOrder byte_order; 
120
121        void AnalyzeSignature(unsigned char * signature);
122
123        /* Information computed from the XML or text declaration. */
124        XML_version version;
125        bool has_encoding_decl;
126        unsigned char * encoding;
127        XML_standalone standalone;
128       
129        /* Information computed from ATTLIST declarations. */
130        hash_map<const char *, int, hash<const char *>, eqstr > GlobalAttributeTable;
131        hash_map<const char *, int, hash<const char *>, eqstr > GlobalElementTable;
132        int globalElementCount;
133        int globalAttributeCount;
134       
135    /* For each element, we have an ElementAttributeModel */
136        vector<vector<ATT_info *> > ElementAttributeData;
137        int getOrInsertGlobalElement(unsigned char * elem_name, int lgth);
138        int getOrInsertGlobalAttName(unsigned char * att_name, int lgth);
139private:
140        void set_charset_family(CodeUnit_Base C, CodeUnit_Size S, CodeUnit_ByteOrder O, int B);
141};
142
143#endif /*XMLMODEL_H*/
Note: See TracBrowser for help on using the repository browser.