source: trunk/src/bitlex.h @ 52

Last change on this file since 52 was 52, checked in by cameron, 11 years ago

Destructors

File size: 4.3 KB
Line 
1/*  bitlex.h - parabix lexical analysis (bit streams)
2    Copyright (c) 2007, 2008, Robert D. Cameron.
3    Licensed to the public under the Open Software License 3.0.
4    Licensed to International Characters, Inc., under the Academic
5    Free License 3.0.
6
7*/
8#ifndef BITLEX_H
9#define BITLEX_H
10
11#include "xmlparam.h"
12#include "xmlbuffer.h"
13
14/* Lexical items are particular characters, character classes
15   or character sequences significant for XML parsing.  */
16
/* Enumerators are numbered consecutively from minLexicalItem (0)
   up to maxLexicalItem; NonWS aliases the first value and
   maxLexicalItem aliases the last (NameFollow). */
enum lexical_item {
        minLexicalItem = 0,
        NonWS = minLexicalItem,
        MarkupStart,
        CD_End_check,
        Hyphen,
        QMark,
        Quote,
        NameFollow,
        maxLexicalItem = NameFollow
};

/* Total number of distinct lexical items (one more than the
   largest enumerator value). */
const int LexicalItemCount = maxLexicalItem + 1;
25
26
27
28
29/* The principal role of the lexical analyzer is to prepare
30   a set of parallel data streams for the parsing engine:
31   (a) an XML byte stream and (b) a set of parallel lexical
32   item streams.
33   The XML byte stream consists of one byte for each character
34   code unit in the input stream (typically the input bytes
35   themselves for most 8-bit character sets, or a pseudo-ASCII
36   byte for 16-bit or 32-bit sets such as UTF-16, or UTF-32).
37   The lexical item streams are bit streams that mark with a
38   1 bit the positions of occurrences of each of the lexical
39   items.
40
   A BitBlockBasis is a set of 8 parallel bit blocks that
   represent a block of 8-bit code units in bit-parallel
   form. */
44
45struct BitBlockBasis {
46        BitBlock bit[8];
47};
48
49/* A BitStreamBuffer is a bit stream of BUFFER_BLOCKS consecutive
50   blocks, followed by a sentinel block to terminate bit scans. */
51
52const int SENTINEL_BLOCKS = 1;
53typedef BitBlock BitStreamBuffer[BUFFER_BLOCKS+SENTINEL_BLOCKS];
54
55struct LexicalStreamSet {
56        BitStreamBuffer item_stream[LexicalItemCount];
57};
58
59
60class Lexer_Interface {
61public:
62        Lexer_Interface(XML_Buffer_Interface *b, LexicalStreamSet *l);
63        ~Lexer_Interface();
64        void AdvanceBuffer(int& base_pos, int& rel_pos, int& limit_pos);
65
66protected:
67        XML_Buffer_Interface *xml_buf;
68        void TransposeToBitStreams();
69        virtual void Do_XML_10_WS_Control() = 0;
70        virtual void Do_MarkupStreams() = 0;
71        virtual void Do_XML_11_WS_Control() = 0;
72        virtual void Do_CharsetValidation() = 0;
73        int lexer_base_pos;
74        BitBlockBasis * x8basis;
75        LexicalStreamSet * parsing_engine_data;
76        int buffer_units;
77        int buffer_blocks;
78};
79
80template <CodeUnit_Base C>
81class Lexer : public Lexer_Interface {
82public:
83        static Lexer_Interface * LexerFactory(XML_Buffer_Interface *b, LexicalStreamSet *l);
84
85protected:
86        Lexer(XML_Buffer_Interface *b, LexicalStreamSet *l);
87        void Do_XML_10_WS_Control();
88        void Do_MarkupStreams();
89        virtual void Do_XML_11_WS_Control() = 0;
90        virtual void Do_CharsetValidation() = 0;
91};
92
93class UTF_8_Lexer : public Lexer<ASCII> {
94public:
95        UTF_8_Lexer(XML_Buffer_Interface *b, LexicalStreamSet *l);
96        void Do_XML_11_WS_Control();
97        void Do_CharsetValidation();
98};
99
100class ASCII_7_Lexer : public Lexer<ASCII> {
101public:
102        ASCII_7_Lexer(XML_Buffer_Interface *b, LexicalStreamSet *l);
103        void Do_XML_11_WS_Control();
104        void Do_CharsetValidation();
105};
106
107class EASCII_8_Lexer : public Lexer<ASCII> {
108public:
109        EASCII_8_Lexer(XML_Buffer_Interface *b, LexicalStreamSet *l);
110        void Do_XML_11_WS_Control();
111        void Do_CharsetValidation();
112};
113
114/* 16-bit ASCII-based character sets: UTF-16 and UCS-2 families.
115   Whitespace and control processing is common to these families,
116   but character set validation differs for codepoints D800-DFFF,
   used for surrogate pairs in UTF-16 and prohibited in UCS-2. */
118class U16_Lexer : public Lexer<ASCII> {
119public:
120        U16_Lexer(XML_Buffer_Interface *b, LexicalStreamSet *l);
121        void Do_XML_11_WS_Control();
122        virtual void Do_CharsetValidation() = 0;
123};
124
125class UTF_16_Lexer : public U16_Lexer {
126public:
127        UTF_16_Lexer(XML_Buffer_Interface *b, LexicalStreamSet *l);
128        void Do_CharsetValidation();
129};
130
131class UCS_2_Lexer : public U16_Lexer {
132public:
133        UCS_2_Lexer(XML_Buffer_Interface *b, LexicalStreamSet *l);
134        void Do_CharsetValidation();
135};
136
137class UTF_32_Lexer : public Lexer<ASCII> {
138public:
139        UTF_32_Lexer(XML_Buffer_Interface *b, LexicalStreamSet *l);
140        void Do_XML_11_WS_Control();
141        void Do_CharsetValidation();
142};
143
144class EBCDIC_Lexer: public Lexer<EBCDIC> {
145public:
146        EBCDIC_Lexer(XML_Buffer_Interface *b, LexicalStreamSet *l);
147        void Do_XML_11_WS_Control();
148        void Do_CharsetValidation();
149};
150
151
/* Profiling support, compiled only when BUFFER_PROFILING is defined:
   includes the BOM profiler source and defines three BOM_Table
   pointers.
   NOTE(review): these are non-static definitions in a header, and a
   .c file is included textually.  If this header is ever included by
   more than one translation unit in a profiling build, the link would
   see duplicate definitions -- confirm it is only included once. */
#ifdef BUFFER_PROFILING
#include "../Profiling/BOM_Profiler.c"
BOM_Table * bitstream_timer;
BOM_Table * lextranspose_timer;
BOM_Table * scanner_timer;
#endif
158
159
160#endif
Note: See TracBrowser for help on using the repository browser.