source: trunk/src/bitlex.h @ 36

Last change on this file since 36 was 36, checked in by cameron, 12 years ago

Charset Architecture: Lexer Factory/bit streams

File size: 4.3 KB
RevLine 
/*  bitlex.h - parabix lexical analysis (bit streams)
    Copyright (c) 2007, 2008, Robert D. Cameron.
    Licensed to the public under the Open Software License 3.0.
    Licensed to International Characters, Inc., under the Academic
    Free License 3.0.

*/
8#ifndef BITLEX_H
9#define BITLEX_H
10
[36]11#include "xmlparam.h"
12#include "xmlbuffer.h"
[4]13
/* Lexical items are particular characters, character classes
   or character sequences significant for XML parsing.
   The enumerators are numbered consecutively from minLexicalItem (0)
   through maxLexicalItem, so a lexical_item value can be used
   directly as an array index. */

enum lexical_item {
        minLexicalItem = 0,
        NonWS = minLexicalItem,
        MarkupStart, CD_End_check, Hyphen, QMark,
        Quote, NameFollow,
        maxLexicalItem = NameFollow};

/* Number of distinct lexical items, i.e. the count of values in the
   inclusive range minLexicalItem..maxLexicalItem. */
const int LexicalItemCount = maxLexicalItem + 1;
[4]25
[36]26
27
28
/* The principal role of the lexical analyzer is to prepare
   a set of parallel data streams for the parsing engine:
   (a) an XML byte stream and (b) a set of parallel lexical
   item streams.
   The XML byte stream consists of one byte for each character
   code unit in the input stream (typically the input bytes
   themselves for most 8-bit character sets, or a pseudo-ASCII
   byte for 16-bit or 32-bit sets such as UTF-16 or UTF-32).
   The lexical item streams are bit streams that mark with a
   1 bit the positions of occurrences of each of the lexical
   items.

   A BitBlockBasis is a set of 8 parallel bit blocks that
   represent a block of 8-bit code units in bit-parallel
   form. */
44
45struct BitBlockBasis {
46        BitBlock bit[8];
47};
48
49/* A BitStreamBuffer is a bit stream of BUFFER_BLOCKS consecutive
50   blocks, followed by a sentinel block to terminate bit scans. */
51
[4]52const int SENTINEL_BLOCKS = 1;
[36]53typedef BitBlock BitStreamBuffer[BUFFER_BLOCKS+SENTINEL_BLOCKS];
[4]54
[36]55struct LexicalStreamSet {
56        BitStreamBuffer item_stream[LexicalItemCount];
[4]57};
58
[36]59
60class Lexer_Interface {
61public:
62        Lexer_Interface(XML_Buffer_Interface *b, LexicalStreamSet *l);
63        void AdvanceBuffer(int& base_pos, int& rel_pos, int& limit_pos);
64
65protected:
66        XML_Buffer_Interface *xml_buf;
67        void TransposeToBitStreams();
68        virtual void Do_XML_10_WS_Control() = 0;
69        virtual void Do_MarkupStreams() = 0;
70        virtual void Do_XML_11_WS_Control() = 0;
71        virtual void Do_CharsetValidation() = 0;
72        int lexer_base_pos;
73        BitBlockBasis * x8basis;
74        LexicalStreamSet * parsing_engine_data;
75        int code_units;
76        int data_blocks;
[15]77};
78
[36]79template <CodeUnit_Base C>
80class Lexer : public Lexer_Interface {
[4]81public:
[36]82        static Lexer_Interface * LexerFactory(XML_Buffer_Interface *b, LexicalStreamSet *l);
[4]83
84protected:
[36]85        Lexer(XML_Buffer_Interface *b, LexicalStreamSet *l);
86        void Do_XML_10_WS_Control();
87        void Do_MarkupStreams();
88        virtual void Do_XML_11_WS_Control() = 0;
89        virtual void Do_CharsetValidation() = 0;
[4]90};
91
[36]92class UTF_8_Lexer : public Lexer<ASCII> {
93public:
94        UTF_8_Lexer(XML_Buffer_Interface *b, LexicalStreamSet *l);
95        void Do_XML_11_WS_Control();
96        void Do_CharsetValidation();
97};
98
99class ASCII_7_Lexer : public Lexer<ASCII> {
100public:
101        ASCII_7_Lexer(XML_Buffer_Interface *b, LexicalStreamSet *l);
102        void Do_XML_11_WS_Control();
103        void Do_CharsetValidation();
104};
105
106class EASCII_8_Lexer : public Lexer<ASCII> {
107public:
108        EASCII_8_Lexer(XML_Buffer_Interface *b, LexicalStreamSet *l);
109        void Do_XML_11_WS_Control();
110        void Do_CharsetValidation();
111};
112
113/* 16-bit ASCII-based character sets: UTF-16 and UCS-2 families.
114   Whitespace and control processing is common to these families,
115   but character set validation differs for codepoints D800-DFFF,
116   used for surrogate pairs in UTF-16 and prohibitied in UCS-2. */
117class U16_Lexer : public Lexer<ASCII> {
118public:
119        U16_Lexer(XML_Buffer_Interface *b, LexicalStreamSet *l);
120        void Do_XML_11_WS_Control();
121        virtual void Do_CharsetValidation() = 0;
122};
123
124class UTF_16_Lexer : public U16_Lexer {
125public:
126        UTF_16_Lexer(XML_Buffer_Interface *b, LexicalStreamSet *l);
127        void Do_CharsetValidation();
128};
129
130class UCS_2_Lexer : public U16_Lexer {
131public:
132        UCS_2_Lexer(XML_Buffer_Interface *b, LexicalStreamSet *l);
133        void Do_CharsetValidation();
134};
135
136class UTF_32_Lexer : public Lexer<ASCII> {
137public:
138        UTF_32_Lexer(XML_Buffer_Interface *b, LexicalStreamSet *l);
139        void Do_XML_11_WS_Control();
140        void Do_CharsetValidation();
141};
142
143class EBCDIC_Lexer: public Lexer<EBCDIC> {
144public:
145        EBCDIC_Lexer(XML_Buffer_Interface *b, LexicalStreamSet *l);
146        void Do_XML_11_WS_Control();
147        void Do_CharsetValidation();
148};
149
150
#ifdef BUFFER_PROFILING
/* Optional profiling support, compiled in only when BUFFER_PROFILING
   is defined.
   NOTE(review): the three pointers below are definitions, not extern
   declarations, and a .c file is textually included.  If this header
   is included from more than one translation unit in a profiling
   build, the pointers will be multiply defined at link time - confirm
   that it is included only once in that configuration. */
#include "../Profiling/BOM_Profiler.c"
BOM_Table * bitstream_timer;
BOM_Table * lextranspose_timer;
BOM_Table * scanner_timer;
#endif
157
158
159#endif
Note: See TracBrowser for help on using the repository browser.