source: trunk/src/bitlex.h @ 59

Last change on this file since 59 was 59, checked in by cameron, 11 years ago

DIGIT_AND_HEX_ITEMS option; ilax.h superseded in engine.h

File size: 4.4 KB
RevLine 
[36]1/*  bitlex.h - parabix lexical analysis (bit streams)
2    Copyright (c) 2007, 2008, Robert D. Cameron.
[4]3    Licensed to the public under the Open Software License 3.0.
4    Licensed to International Characters, Inc., under the Academic
5    Free License 3.0.
6
7*/
8#ifndef BITLEX_H
9#define BITLEX_H
10
[36]11#include "xmlparam.h"
12#include "xmlbuffer.h"
[4]13
/* lexical_item enumerates the lexical items: the particular characters,
   character classes or character sequences that are significant for
   XML parsing.  Enumerator values are consecutive starting at 0.
   The enumerator names suggest what each item marks; the precise
   definitions live with the code that computes the item streams. */
enum lexical_item {
        minLexicalItem = 0,             /* lowest item value */
        NonWS = minLexicalItem,         /* shares the value of minLexicalItem */
        MarkupStart,
        CD_End_check,
        Hyphen,
        QMark,
#ifdef DIGIT_AND_HEX_ITEMS
        /* Optional items, present only when DIGIT_AND_HEX_ITEMS is defined;
           they shift the values of the enumerators that follow. */
        NonDigit,
        NonHex,
#endif
        Quote,
        NameFollow,
        maxLexicalItem = NameFollow     /* highest item value */
};
[4]26
[15]27const int LexicalItemCount = maxLexicalItem + 1;
[4]28
[36]29
30
31
[4]32/* The principal role of the lexical analyzer is to prepare
33   a set of parallel data streams for the parsing engine:
34   (a) an XML byte stream and (b) a set of parallel lexical
35   item streams.
36   The XML byte stream consists of one byte for each character
37   code unit in the input stream (typically the input bytes
38   themselves for most 8-bit character sets, or a pseudo-ASCII
39   byte for 16-bit or 32-bit sets such as UTF-16, or UTF-32).
40   The lexical item streams are bit streams that mark with a
41   1 bit the positions of occurrences of each of the lexical
42   items.
43
   A BitBlockBasis is a set of 8 parallel bit blocks that
   represent a block of 8-bit code units in bit-parallel
   form. */
47
48struct BitBlockBasis {
49        BitBlock bit[8];
50};
51
52/* A BitStreamBuffer is a bit stream of BUFFER_BLOCKS consecutive
53   blocks, followed by a sentinel block to terminate bit scans. */
54
[4]55const int SENTINEL_BLOCKS = 1;
[36]56typedef BitBlock BitStreamBuffer[BUFFER_BLOCKS+SENTINEL_BLOCKS];
[4]57
[36]58struct LexicalStreamSet {
59        BitStreamBuffer item_stream[LexicalItemCount];
[4]60};
61
[36]62
63class Lexer_Interface {
64public:
65        Lexer_Interface(XML_Buffer_Interface *b, LexicalStreamSet *l);
[52]66        ~Lexer_Interface();
[36]67        void AdvanceBuffer(int& base_pos, int& rel_pos, int& limit_pos);
68
69protected:
70        XML_Buffer_Interface *xml_buf;
71        void TransposeToBitStreams();
72        virtual void Do_XML_10_WS_Control() = 0;
73        virtual void Do_MarkupStreams() = 0;
74        virtual void Do_XML_11_WS_Control() = 0;
75        virtual void Do_CharsetValidation() = 0;
76        int lexer_base_pos;
77        BitBlockBasis * x8basis;
78        LexicalStreamSet * parsing_engine_data;
[40]79        int buffer_units;
80        int buffer_blocks;
[15]81};
82
[36]83template <CodeUnit_Base C>
84class Lexer : public Lexer_Interface {
[4]85public:
[36]86        static Lexer_Interface * LexerFactory(XML_Buffer_Interface *b, LexicalStreamSet *l);
[4]87
88protected:
[36]89        Lexer(XML_Buffer_Interface *b, LexicalStreamSet *l);
90        void Do_XML_10_WS_Control();
91        void Do_MarkupStreams();
92        virtual void Do_XML_11_WS_Control() = 0;
93        virtual void Do_CharsetValidation() = 0;
[4]94};
95
[36]96class UTF_8_Lexer : public Lexer<ASCII> {
97public:
98        UTF_8_Lexer(XML_Buffer_Interface *b, LexicalStreamSet *l);
99        void Do_XML_11_WS_Control();
100        void Do_CharsetValidation();
101};
102
103class ASCII_7_Lexer : public Lexer<ASCII> {
104public:
105        ASCII_7_Lexer(XML_Buffer_Interface *b, LexicalStreamSet *l);
106        void Do_XML_11_WS_Control();
107        void Do_CharsetValidation();
108};
109
110class EASCII_8_Lexer : public Lexer<ASCII> {
111public:
112        EASCII_8_Lexer(XML_Buffer_Interface *b, LexicalStreamSet *l);
113        void Do_XML_11_WS_Control();
114        void Do_CharsetValidation();
115};
116
117/* 16-bit ASCII-based character sets: UTF-16 and UCS-2 families.
118   Whitespace and control processing is common to these families,
119   but character set validation differs for codepoints D800-DFFF,
120   used for surrogate pairs in UTF-16 and prohibitied in UCS-2. */
121class U16_Lexer : public Lexer<ASCII> {
122public:
123        U16_Lexer(XML_Buffer_Interface *b, LexicalStreamSet *l);
124        void Do_XML_11_WS_Control();
125        virtual void Do_CharsetValidation() = 0;
126};
127
128class UTF_16_Lexer : public U16_Lexer {
129public:
130        UTF_16_Lexer(XML_Buffer_Interface *b, LexicalStreamSet *l);
131        void Do_CharsetValidation();
132};
133
134class UCS_2_Lexer : public U16_Lexer {
135public:
136        UCS_2_Lexer(XML_Buffer_Interface *b, LexicalStreamSet *l);
137        void Do_CharsetValidation();
138};
139
140class UTF_32_Lexer : public Lexer<ASCII> {
141public:
142        UTF_32_Lexer(XML_Buffer_Interface *b, LexicalStreamSet *l);
143        void Do_XML_11_WS_Control();
144        void Do_CharsetValidation();
145};
146
147class EBCDIC_Lexer: public Lexer<EBCDIC> {
148public:
149        EBCDIC_Lexer(XML_Buffer_Interface *b, LexicalStreamSet *l);
150        void Do_XML_11_WS_Control();
151        void Do_CharsetValidation();
152};
153
154
/* Optional profiling support, compiled in only when BUFFER_PROFILING
   is defined: pulls in the BOM profiler and defines three timer
   tables (bit stream, lexer transposition and scanner, per their
   names).
   NOTE(review): this includes a .c file and defines -- not merely
   declares -- globals in a header, so it is only safe when the header
   is included from a single translation unit; confirm. */
#ifdef BUFFER_PROFILING
#include "../Profiling/BOM_Profiler.c"
BOM_Table *bitstream_timer;
BOM_Table *lextranspose_timer;
BOM_Table *scanner_timer;
#endif
161
162
163#endif
Note: See TracBrowser for help on using the repository browser.