source: trunk/src/bitlex.h @ 87

Last change on this file since 87 was 72, checked in by cameron, 11 years ago

Refactored Lexer and friends

File size: 4.0 KB
Line 
1/*  bitlex.h - Lexical Item Stream Module.
2    Copyright (c) 2007, 2008, Robert D. Cameron.
3    Licensed to the public under the Open Software License 3.0.
4    Licensed to International Characters, Inc., under the Academic
5    Free License 3.0.
6
7*/
8#ifndef BITLEX_H
9#define BITLEX_H
10
11#include "xmlmodel.h"
12#include "byteplex.h"
13#include "bitplex.h"
14#include "xmldecl.h"
15
16/* Lexical items are particular characters, character classes
17   or character sequences significant for XML parsing.  */
18
/* Identifiers for the lexical item streams.  Enumerators are numbered
   consecutively from minLexicalItem up to maxLexicalItem; NonDigit and
   NonHex exist only when DIGIT_AND_HEX_ITEMS is defined, which shifts
   the values of the enumerators that follow them. */
enum lexical_item {
        minLexicalItem = 0,
        NonWS = minLexicalItem,
        MarkupStart,
        CD_End_check,
        Hyphen,
        QMark,
#ifdef DIGIT_AND_HEX_ITEMS
        NonDigit,
        NonHex,
#endif
        Quote,
        NameFollow,
        /* Alias for the last real item; keep it in step when adding items. */
        maxLexicalItem = NameFollow
};
28
29const int LexicalItemCount = maxLexicalItem + 1;
30
31
32
33
34/* The principal role of the lexical analyzer is to prepare
35   a set of parallel data streams for the parsing engine:
36   (a) an XML byte stream and (b) a set of parallel lexical
37   item streams.
38   The XML byte stream consists of one byte for each character
39   code unit in the input stream (typically the input bytes
40   themselves for most 8-bit character sets, or a pseudo-ASCII
41   byte for 16-bit or 32-bit sets such as UTF-16, or UTF-32).
42   The lexical item streams are bit streams that mark with a
43   1 bit the positions of occurrences of each of the lexical
44   items.
45
46*/
47
48
49/* A BitStreamBuffer is a bit stream of BUFFER_BLOCKS consecutive
50   blocks, followed by a sentinel block to terminate bit scans. */
51
52const int SENTINEL_BLOCKS = 1;
53typedef BitBlock BitStreamBuffer[BUFFER_BLOCKS+SENTINEL_BLOCKS];
54
55struct LexicalStreamSet {
56        BitStreamBuffer item_stream[LexicalItemCount];
57};
58
59
60class Lexer_Interface {
61public:
62        Lexer_Interface(Model_Info * m, LexicalStreamSet *l);
63        ~Lexer_Interface();
64        void AnalyzeBuffer(BitBlockBasis * x8basis, int buffer_limit_pos);
65
66protected:
67        Model_Info * model_info;
68        virtual void Do_XML_10_WS_Control() = 0;
69        virtual void Do_MarkupStreams() = 0;
70        virtual void Do_XML_11_WS_Control() = 0;
71        virtual void Do_CharsetValidation() = 0;
72        int lexer_base_pos;
73        BitBlockBasis * x8basis;
74        BitBlock * validation_stream;
75        LexicalStreamSet * parsing_engine_data;
76        int buffer_units;
77        int buffer_blocks;
78};
79
80template <CodeUnit_Base C>
81class Lexer : public Lexer_Interface {
82public:
83        static Lexer_Interface * LexerFactory(Model_Info * m,LexicalStreamSet *l);
84
85protected:
86        Lexer(Model_Info * m,LexicalStreamSet *l);
87        void Do_XML_10_WS_Control();
88        void Do_MarkupStreams();
89        virtual void Do_XML_11_WS_Control() = 0;
90        virtual void Do_CharsetValidation() = 0;
91};
92
93class UTF_8_Lexer : public Lexer<ASCII> {
94public:
95        UTF_8_Lexer(Model_Info * m,LexicalStreamSet *l);
96        void Do_XML_11_WS_Control();
97        void Do_CharsetValidation();
98};
99
100class ASCII_7_Lexer : public Lexer<ASCII> {
101public:
102        ASCII_7_Lexer(Model_Info * m,LexicalStreamSet *l);
103        void Do_XML_11_WS_Control();
104        void Do_CharsetValidation();
105};
106
107class EASCII_8_Lexer : public Lexer<ASCII> {
108public:
109        EASCII_8_Lexer(Model_Info * m,LexicalStreamSet *l);
110        void Do_XML_11_WS_Control();
111        void Do_CharsetValidation();
112};
113
114/* 16-bit ASCII-based character sets: UTF-16 and UCS-2 families.
115   Whitespace and control processing is common to these families,
116   but character set validation differs for codepoints D800-DFFF,
117   used for surrogate pairs in UTF-16 and prohibited in UCS-2. */
118class U16_Lexer : public Lexer<ASCII> {
119public:
120        U16_Lexer(Model_Info * m,LexicalStreamSet *l);
121        void Do_XML_11_WS_Control();
122        virtual void Do_CharsetValidation() = 0;
123};
124
125class UTF_16_Lexer : public U16_Lexer {
126public:
127        UTF_16_Lexer(Model_Info * m,LexicalStreamSet *l);
128        void Do_CharsetValidation();
129};
130
131class UCS_2_Lexer : public U16_Lexer {
132public:
133        UCS_2_Lexer(Model_Info * m,LexicalStreamSet *l);
134        void Do_CharsetValidation();
135};
136
137class UTF_32_Lexer : public Lexer<ASCII> {
138public:
139        UTF_32_Lexer(Model_Info * m,LexicalStreamSet *l);
140        void Do_XML_11_WS_Control();
141        void Do_CharsetValidation();
142};
143
144class EBCDIC_Lexer: public Lexer<EBCDIC> {
145public:
146        EBCDIC_Lexer(Model_Info * m,LexicalStreamSet *l);
147        void Do_XML_11_WS_Control();
148        void Do_CharsetValidation();
149};
150
151
152#endif
Note: See TracBrowser for help on using the repository browser.