source: trunk/src/bitlex.h @ 163

Last change on this file since 163 was 163, checked in by cameron, 11 years ago

Restructuring: Document/External Entity Info into xmldecl.h

File size: 4.0 KB
Line 
1/*  bitlex.h - Lexical Item Stream Module.
2    Copyright (c) 2007, 2008, Robert D. Cameron.
3    Licensed to the public under the Open Software License 3.0.
4    Licensed to International Characters, Inc., under the Academic
5    Free License 3.0.
6
7*/
8#ifndef BITLEX_H
9#define BITLEX_H
10
11#include "xmldecl.h"
12#include "byteplex.h"
13#include "bitplex.h"
14
15/* Lexical items are particular characters, character classes
16   or character sequences significant for XML parsing.  */
17
/* Compile-time switch: while DIGIT_AND_HEX_ITEMS is defined, the
   NonDigit and NonHex enumerators are part of lexical_item. */
#define DIGIT_AND_HEX_ITEMS

/* Identifiers for the lexical items, one enumerator per item.
   Values run consecutively upward from minLexicalItem (0);
   minLexicalItem and maxLexicalItem alias the first and last items. */
enum lexical_item {
        minLexicalItem = 0,
        NonWS = minLexicalItem,
        MarkupStart,
        CD_End_check,
        Hyphen,
        QMark,
#ifdef DIGIT_AND_HEX_ITEMS
        NonDigit,
        NonHex,
#endif
        Quote,
        NameFollow,
        maxLexicalItem = NameFollow
};

/* Total number of lexical items (maxLexicalItem is the last index). */
const int LexicalItemCount = maxLexicalItem + 1;
31
32
33
34
35/* The principal role of the lexical analyzer is to prepare
36   a set of parallel data streams for the parsing engine:
37   (a) an XML byte stream and (b) a set of parallel lexical
38   item streams.
39   The XML byte stream consists of one byte for each character
40   code unit in the input stream (typically the input bytes
41   themselves for most 8-bit character sets, or a pseudo-ASCII
42   byte for 16-bit or 32-bit sets such as UTF-16 or UTF-32).
43   The lexical item streams are bit streams that mark with a
44   1 bit the positions of occurrences of each of the lexical
45   items.
46
47*/
48
49
50/* A BitStreamBuffer is a bit stream of BUFFER_BLOCKS consecutive
51   blocks, followed by a sentinel block to terminate bit scans. */
52
53const int SENTINEL_BLOCKS = 1;
54typedef BitBlock BitStreamBuffer[BUFFER_BLOCKS+SENTINEL_BLOCKS];
55
56struct LexicalStreamSet {
57        BitStreamBuffer item_stream[LexicalItemCount];
58};
59
60
61class Lexer_Interface {
62public:
63        Lexer_Interface(Entity_Info * e, LexicalStreamSet *l);
64        ~Lexer_Interface();
65        void AnalyzeBuffer(BitBlockBasis * x8basis, int base_pos, int start_pos, int buffer_limit_pos);
66
67protected:
68        Entity_Info * entity_Info;
69        virtual void Do_XML_10_WS_Control() = 0;
70        virtual void Do_MarkupStreams() = 0;
71        virtual void Do_XML_11_WS_Control() = 0;
72        virtual void Do_CharsetValidation() = 0;
73        int lexer_base_pos;
74        BitBlockBasis * x8basis;
75        BitBlock * validation_stream;
76        LexicalStreamSet * parsing_engine_data;
77        int buffer_units;
78        int buffer_blocks;
79};
80
81template <CodeUnit_Base C>
82class Lexer : public Lexer_Interface {
83public:
84        static Lexer_Interface * LexerFactory(Entity_Info * e,LexicalStreamSet *l);
85
86protected:
87        Lexer(Entity_Info * e,LexicalStreamSet *l);
88        void Do_XML_10_WS_Control();
89        void Do_MarkupStreams();
90        virtual void Do_XML_11_WS_Control() = 0;
91        virtual void Do_CharsetValidation() = 0;
92};
93
94class UTF_8_Lexer : public Lexer<ASCII> {
95public:
96        UTF_8_Lexer(Entity_Info * e,LexicalStreamSet *l);
97        void Do_XML_11_WS_Control();
98        void Do_CharsetValidation();
99};
100
101class ASCII_7_Lexer : public Lexer<ASCII> {
102public:
103        ASCII_7_Lexer(Entity_Info * e,LexicalStreamSet *l);
104        void Do_XML_11_WS_Control();
105        void Do_CharsetValidation();
106};
107
108class EASCII_8_Lexer : public Lexer<ASCII> {
109public:
110        EASCII_8_Lexer(Entity_Info * e,LexicalStreamSet *l);
111        void Do_XML_11_WS_Control();
112        void Do_CharsetValidation();
113};
114
115/* 16-bit ASCII-based character sets: UTF-16 and UCS-2 families.
116   Whitespace and control processing is common to these families,
117   but character set validation differs for codepoints D800-DFFF,
118   used for surrogate pairs in UTF-16 and prohibitied in UCS-2. */
119class U16_Lexer : public Lexer<ASCII> {
120public:
121        U16_Lexer(Entity_Info * e,LexicalStreamSet *l);
122        void Do_XML_11_WS_Control();
123        virtual void Do_CharsetValidation() = 0;
124};
125
126class UTF_16_Lexer : public U16_Lexer {
127public:
128        UTF_16_Lexer(Entity_Info * e,LexicalStreamSet *l);
129        void Do_CharsetValidation();
130};
131
132class UCS_2_Lexer : public U16_Lexer {
133public:
134        UCS_2_Lexer(Entity_Info * e,LexicalStreamSet *l);
135        void Do_CharsetValidation();
136};
137
138class UTF_32_Lexer : public Lexer<ASCII> {
139public:
140        UTF_32_Lexer(Entity_Info * e,LexicalStreamSet *l);
141        void Do_XML_11_WS_Control();
142        void Do_CharsetValidation();
143};
144
145class EBCDIC_Lexer: public Lexer<EBCDIC> {
146public:
147        EBCDIC_Lexer(Entity_Info * e,LexicalStreamSet *l);
148        void Do_XML_11_WS_Control();
149        void Do_CharsetValidation();
150};
151
152
153#endif
Note: See TracBrowser for help on using the repository browser.