source: trunk/src/bitlex.h @ 128

Last change on this file since 128 was 128, checked in by cameron, 11 years ago

Set lexer_base_pos for error reporting.

File size: 4.0 KB
Line 
1/*  bitlex.h - Lexical Item Stream Module.
2    Copyright (c) 2007, 2008, Robert D. Cameron.
3    Licensed to the public under the Open Software License 3.0.
4    Licensed to International Characters, Inc., under the Academic
5    Free License 3.0.
6
7*/
8#ifndef BITLEX_H
9#define BITLEX_H
10
11#include "xmlmodel.h"
12#include "byteplex.h"
13#include "bitplex.h"
14#include "xmldecl.h"
15
/* Lexical items are particular characters, character classes
   or character sequences significant for XML parsing.  */

/* When defined, the NonDigit and NonHex items are included in the
   lexical_item enumeration below. */
#define DIGIT_AND_HEX_ITEMS

/* Identifiers of the lexical items.  The enumerators take consecutive
   values from minLexicalItem (0) up to maxLexicalItem, so the range
   [minLexicalItem, maxLexicalItem] can be iterated or used as an
   array index. */
enum lexical_item {
        minLexicalItem = 0,
        NonWS = minLexicalItem,
        MarkupStart, CD_End_check, Hyphen, QMark,
#ifdef DIGIT_AND_HEX_ITEMS
        NonDigit, NonHex,
#endif
        Quote, NameFollow,
        maxLexicalItem = NameFollow};

/* Total number of lexical items (one past the largest enumerator). */
const int LexicalItemCount = maxLexicalItem + 1;
32
33
34
35
36/* The principal role of the lexical analyzer is to prepare
37   a set of parallel data streams for the parsing engine:
38   (a) an XML byte stream and (b) a set of parallel lexical
39   item streams.
40   The XML byte stream consists of one byte for each character
41   code unit in the input stream (typically the input bytes
42   themselves for most 8-bit character sets, or a pseudo-ASCII
43   byte for 16-bit or 32-bit sets such as UTF-16, or UTF-32).
44   The lexical item streams are bit streams that mark with a
45   1 bit the positions of occurrences of each of the lexical
46   items.
47
48*/
49
50
51/* A BitStreamBuffer is a bit stream of BUFFER_BLOCKS consecutive
52   blocks, followed by a sentinel block to terminate bit scans. */
53
54const int SENTINEL_BLOCKS = 1;
55typedef BitBlock BitStreamBuffer[BUFFER_BLOCKS+SENTINEL_BLOCKS];
56
57struct LexicalStreamSet {
58        BitStreamBuffer item_stream[LexicalItemCount];
59};
60
61
62class Lexer_Interface {
63public:
64        Lexer_Interface(Entity_Info * e, LexicalStreamSet *l);
65        ~Lexer_Interface();
66        void AnalyzeBuffer(BitBlockBasis * x8basis, int base_pos, int buffer_limit_pos);
67
68protected:
69        Entity_Info * entity_Info;
70        virtual void Do_XML_10_WS_Control() = 0;
71        virtual void Do_MarkupStreams() = 0;
72        virtual void Do_XML_11_WS_Control() = 0;
73        virtual void Do_CharsetValidation() = 0;
74        int lexer_base_pos;
75        BitBlockBasis * x8basis;
76        BitBlock * validation_stream;
77        LexicalStreamSet * parsing_engine_data;
78        int buffer_units;
79        int buffer_blocks;
80};
81
82template <CodeUnit_Base C>
83class Lexer : public Lexer_Interface {
84public:
85        static Lexer_Interface * LexerFactory(Entity_Info * e,LexicalStreamSet *l);
86
87protected:
88        Lexer(Entity_Info * e,LexicalStreamSet *l);
89        void Do_XML_10_WS_Control();
90        void Do_MarkupStreams();
91        virtual void Do_XML_11_WS_Control() = 0;
92        virtual void Do_CharsetValidation() = 0;
93};
94
95class UTF_8_Lexer : public Lexer<ASCII> {
96public:
97        UTF_8_Lexer(Entity_Info * e,LexicalStreamSet *l);
98        void Do_XML_11_WS_Control();
99        void Do_CharsetValidation();
100};
101
102class ASCII_7_Lexer : public Lexer<ASCII> {
103public:
104        ASCII_7_Lexer(Entity_Info * e,LexicalStreamSet *l);
105        void Do_XML_11_WS_Control();
106        void Do_CharsetValidation();
107};
108
109class EASCII_8_Lexer : public Lexer<ASCII> {
110public:
111        EASCII_8_Lexer(Entity_Info * e,LexicalStreamSet *l);
112        void Do_XML_11_WS_Control();
113        void Do_CharsetValidation();
114};
115
116/* 16-bit ASCII-based character sets: UTF-16 and UCS-2 families.
117   Whitespace and control processing is common to these families,
118   but character set validation differs for codepoints D800-DFFF,
119   used for surrogate pairs in UTF-16 and prohibitied in UCS-2. */
120class U16_Lexer : public Lexer<ASCII> {
121public:
122        U16_Lexer(Entity_Info * e,LexicalStreamSet *l);
123        void Do_XML_11_WS_Control();
124        virtual void Do_CharsetValidation() = 0;
125};
126
127class UTF_16_Lexer : public U16_Lexer {
128public:
129        UTF_16_Lexer(Entity_Info * e,LexicalStreamSet *l);
130        void Do_CharsetValidation();
131};
132
133class UCS_2_Lexer : public U16_Lexer {
134public:
135        UCS_2_Lexer(Entity_Info * e,LexicalStreamSet *l);
136        void Do_CharsetValidation();
137};
138
139class UTF_32_Lexer : public Lexer<ASCII> {
140public:
141        UTF_32_Lexer(Entity_Info * e,LexicalStreamSet *l);
142        void Do_XML_11_WS_Control();
143        void Do_CharsetValidation();
144};
145
146class EBCDIC_Lexer: public Lexer<EBCDIC> {
147public:
148        EBCDIC_Lexer(Entity_Info * e,LexicalStreamSet *l);
149        void Do_XML_11_WS_Control();
150        void Do_CharsetValidation();
151};
152
153
154#endif
Note: See TracBrowser for help on using the repository browser.