source: trunk/src/bitlex.h @ 1463

Last change on this file since 1463 was 186, checked in by cameron, 11 years ago

The [&#/] stream for MarkupSort?.

File size: 4.1 KB
Line 
1/*  bitlex.h - Lexical Item Stream Module.
2    Copyright (c) 2007, 2008, Robert D. Cameron.
3    Licensed to the public under the Open Software License 3.0.
4    Licensed to International Characters, Inc., under the Academic
5    Free License 3.0.
6
7*/
8#ifndef BITLEX_H
9#define BITLEX_H
10
11#include "xmldecl.h"
12#include "byteplex.h"
13#include "bitplex.h"
14
/* A lexical item is a particular character, character class or
   character sequence that is significant for XML parsing.  Each
   enumerator below names one such item; enumerators are numbered
   consecutively starting at minLexicalItem. */

/* Enables the NonDigit and NonHex items in the enumeration below. */
#define DIGIT_AND_HEX_ITEMS

enum lexical_item {
    minLexicalItem = 0,
    NonWS = minLexicalItem,
    MarkupStart,
    CD_End_check,
    Hyphen,
    QMark,
#ifdef MARKUP_SORTING
    AmpHashSlash,   /* The [&#/] stream. */
#endif
#ifdef DIGIT_AND_HEX_ITEMS
    NonDigit,
    NonHex,
#endif
    Quote,
    NameFollow,
    maxLexicalItem = NameFollow
};

/* Number of lexical items: one more than the largest enumerator
   value, because numbering starts at zero. */
const int LexicalItemCount = maxLexicalItem + 1;
34
35
36
37
38/* The principal role of the lexical analyzer is to prepare
39   a set of parallel data streams for the parsing engine:
40   (a) an XML byte stream and (b) a set of parallel lexical
41   item streams.
42   The XML byte stream consists of one byte for each character
43   code unit in the input stream (typically the input bytes
44   themselves for most 8-bit character sets, or a pseudo-ASCII
45   byte for 16-bit or 32-bit sets such as UTF-16, or UTF-32).
46   The lexical item streams are bit streams that mark with a
47   1 bit the positions of occurrences of each of the lexical
48   items.
49
50*/
51
52
53/* A BitStreamBuffer is a bit stream of BUFFER_BLOCKS consecutive
54   blocks, followed by a sentinel block to terminate bit scans. */
55
56const int SENTINEL_BLOCKS = 1;
57typedef BitBlock BitStreamBuffer[BUFFER_BLOCKS+SENTINEL_BLOCKS];
58
59struct LexicalStreamSet {
60        BitStreamBuffer item_stream[LexicalItemCount];
61};
62
63
64class Lexer_Interface {
65public:
66        Lexer_Interface(Entity_Info * e, LexicalStreamSet *l);
67        ~Lexer_Interface();
68        void AnalyzeBuffer(BitBlockBasis * x8basis, int base_pos, int start_pos, int buffer_limit_pos);
69
70protected:
71        Entity_Info * entity_Info;
72        virtual void Do_XML_10_WS_Control() = 0;
73        virtual void Do_MarkupStreams() = 0;
74        virtual void Do_XML_11_WS_Control() = 0;
75        virtual void Do_CharsetValidation() = 0;
76        int lexer_base_pos;
77        BitBlockBasis * x8basis;
78        BitBlock * validation_stream;
79        LexicalStreamSet * parsing_engine_data;
80        int buffer_units;
81        int buffer_blocks;
82};
83
84template <CodeUnit_Base C>
85class Lexer : public Lexer_Interface {
86public:
87        static Lexer_Interface * LexerFactory(Entity_Info * e,LexicalStreamSet *l);
88
89protected:
90        Lexer(Entity_Info * e,LexicalStreamSet *l);
91        void Do_XML_10_WS_Control();
92        void Do_MarkupStreams();
93        virtual void Do_XML_11_WS_Control() = 0;
94        virtual void Do_CharsetValidation() = 0;
95};
96
97class UTF_8_Lexer : public Lexer<ASCII> {
98public:
99        UTF_8_Lexer(Entity_Info * e,LexicalStreamSet *l);
100        void Do_XML_11_WS_Control();
101        void Do_CharsetValidation();
102};
103
104class ASCII_7_Lexer : public Lexer<ASCII> {
105public:
106        ASCII_7_Lexer(Entity_Info * e,LexicalStreamSet *l);
107        void Do_XML_11_WS_Control();
108        void Do_CharsetValidation();
109};
110
111class EASCII_8_Lexer : public Lexer<ASCII> {
112public:
113        EASCII_8_Lexer(Entity_Info * e,LexicalStreamSet *l);
114        void Do_XML_11_WS_Control();
115        void Do_CharsetValidation();
116};
117
118/* 16-bit ASCII-based character sets: UTF-16 and UCS-2 families.
119   Whitespace and control processing is common to these families,
120   but character set validation differs for codepoints D800-DFFF,
121   used for surrogate pairs in UTF-16 and prohibitied in UCS-2. */
122class U16_Lexer : public Lexer<ASCII> {
123public:
124        U16_Lexer(Entity_Info * e,LexicalStreamSet *l);
125        void Do_XML_11_WS_Control();
126        virtual void Do_CharsetValidation() = 0;
127};
128
129class UTF_16_Lexer : public U16_Lexer {
130public:
131        UTF_16_Lexer(Entity_Info * e,LexicalStreamSet *l);
132        void Do_CharsetValidation();
133};
134
135class UCS_2_Lexer : public U16_Lexer {
136public:
137        UCS_2_Lexer(Entity_Info * e,LexicalStreamSet *l);
138        void Do_CharsetValidation();
139};
140
141class UTF_32_Lexer : public Lexer<ASCII> {
142public:
143        UTF_32_Lexer(Entity_Info * e,LexicalStreamSet *l);
144        void Do_XML_11_WS_Control();
145        void Do_CharsetValidation();
146};
147
148class EBCDIC_Lexer: public Lexer<EBCDIC> {
149public:
150        EBCDIC_Lexer(Entity_Info * e,LexicalStreamSet *l);
151        void Do_XML_11_WS_Control();
152        void Do_CharsetValidation();
153};
154
155
156#endif
Note: See TracBrowser for help on using the repository browser.