source: trunk/src/xmlbuffer.h @ 65

Last change on this file since 65 was 65, checked in by cameron, 11 years ago

Add CODE_CLOCKING instrumentation.

File size: 5.3 KB
Line 
1/*  xmlbuffer.h - Input buffering for XML entities.
2    Copyright (c) 2007, 2008,  Robert D. Cameron.
3    Licensed to the public under the Open Software License 3.0.
4    Licensed to International Characters, Inc., under the Academic
5    Free License 3.0.
6
7*/
8#ifndef XML_BUFFER_H
9#define XML_BUFFER_H
10
11#include "xmlparam.h"
12#include "../lib/lib_simd.h"
13
14/* The BytePack and the BitBlock are the two fundamental
15   types used by the parabix program for data held in
16   SIMD registers, representing, respectively, the byte-oriented
17   and bit-oriented views of character data.*/
18
19typedef SIMD_type BytePack;
20typedef SIMD_type BitBlock;
21const int PACKSIZE = sizeof(SIMD_type);
22const int BLOCKSIZE = sizeof(SIMD_type) * 8;
23
24/* Define the size of buffer used for lexical analysis/parsing. */
25const int BUFFER_BLOCKS = 1024;
26const int BUFFER_SIZE = BUFFER_BLOCKS * BLOCKSIZE;
27
28/* When working near the end of a buffer, a bytespace test may involve
29   a multibyte literal.  The bytespace buffer must always make available
30   a number of lookahead bytes at least equal to the maximum length of any
31   such literal. */
32
33const int LOOKAHEAD_POSITIONS = 16;
34
35
36class XML_Buffer_Interface {
37public:
38        XML_Buffer_Interface ();
39        ~XML_Buffer_Interface ();
40        /* Create and initialize an XML Buffer object for parsing a file,
41           based on autodetection of the character encoding family
42           from the initial 4-byte signature. */
43        static XML_Buffer_Interface * BufferFactory(char * filename);
44        virtual void DoByteplex() = 0;
45        virtual void PreparePseudoASCII_Stream() = 0;
46        virtual void ReadXMLInfo() = 0;
47        virtual void ReadTextDeclaration() = 0;
48        int AvailableUnits(int pos);
49        int ContentStartUnit();
50
51        /* Information returned from the initialization process.
52           See xmlparam.h for types.  */
53        int BOM_units; /* no of initial code units for a Byte Order Mark */
54        XML_version version;
55        bool has_encoding_decl;
56        int encoding_start_pos;
57        int encoding_lgth;
58        CodeUnit_Base code_unit_base;
59        CodeUnit_Size code_unit_size;
60        CodeUnit_ByteOrder byte_order;
61        XML_standalone standalone;
62        int content_start_pos;
63
64        /* Pseudo-ASCII stream. */
65        BytePack * x8data;
66
67        BytePack * ByteBuffer;
68
69protected:
70
71        unsigned int buffer_bytes;
72        unsigned int total_blocks;
73        int current_unit;
74
75};
76
77template <CodeUnit_Base C>
78class XML_Buffer : public XML_Buffer_Interface {
79public:
80        XML_Buffer (BytePack * src_data, int lgth, int byte_order_units);
81        void ReadXMLInfo();
82        void ReadTextDeclaration();
83        virtual void DoByteplex() = 0;
84        virtual void PreparePseudoASCII_Stream() = 0;
85
86protected:
87
88        void Advance(int n);
89        int AbsPos() const;
90        unsigned char * cur() const;
91
92private:
93        /* Bytespace parsing routines for internal use in ReadXMLInfo and
94           ReadTextDeclaration. */
95        void Scan_WS();
96        void ScanToQuote();
97};
98
99
100/*  Various ASCII based character sets using 8-bit code units are processed
101    using the Extended_ASCII_8_Buffer class.   This includes 7-bit ASCII
102    itself (with high-order bit 0), the ISO-8859 character sets and UTF-8.
103*/
104class Extended_ASCII_8_Buffer : public XML_Buffer<ASCII> {
105public:
106        Extended_ASCII_8_Buffer(BytePack * src, int lgth, int BOM);
107        void DoByteplex();
108        void PreparePseudoASCII_Stream();
109};
110
111/*  The family of 8-bit EBCDIC based character sets are processed using
112    the EBCDIC_Buffer class.
113*/
114class EBCDIC_Buffer : public XML_Buffer<EBCDIC> {
115public:
116        EBCDIC_Buffer(BytePack * src, int lgth, int BOM);
117        void DoByteplex();
118        void PreparePseudoASCII_Stream();
119};
120
121
122
123/*  UTF-16 and UCS-2 character set families in BE and LE byte orders.
124    The U16LE and U16BE subclasses each provide a distinct byteplexer to
125    produce 2 parallel byte streams for the high and low bytes of each
126    16-bit code unit.  Once byteplexing is complete, a generic pseudoASCII
127    conversion routine can be applied at the U16_Buffer level. */
128
129class U16_Buffer : public XML_Buffer<ASCII> {
130public:
131        U16_Buffer(BytePack * src, int lgth, int BOM);
132        ~U16_Buffer();
133        virtual void DoByteplex() = 0;
134        void PreparePseudoASCII_Stream();
135protected:
136        BytePack * x16hi;
137        BytePack * x16lo;
138};
139
140class U16LE_Buffer : public U16_Buffer {
141public:
142        U16LE_Buffer(BytePack * src, int lgth, int BOM);
143        void DoByteplex();
144};
145
146class U16BE_Buffer : public U16_Buffer {
147public:
148        U16BE_Buffer(BytePack * src, int lgth, int BOM);
149        void DoByteplex();
150};
151
152
153/*  UTF-32/UCS-4 character sets in BE, LE, 2143 and 3412 byte orders.
154    Each subclass of U32_Buffer provides a distinct byteplexer to
155    produce the 4 parallel byte streams of Unicode data.  Once
156    byteplexing is complete, a generic pseudoASCII routine can
157    be applied. */
158class U32_Buffer : public XML_Buffer<ASCII> {
159public:
160        U32_Buffer(BytePack * src, int lgth, int BOM);
161        ~U32_Buffer();
162        virtual void DoByteplex() = 0;
163        void PreparePseudoASCII_Stream();
164protected:
165        BytePack * x32hh;
166        BytePack * x32hl;
167        BytePack * x32lh;
168        BytePack * x32ll;
169};
170
171class U32LE_Buffer : public U32_Buffer {
172public:
173        U32LE_Buffer(BytePack * src, int lgth, int BOM);
174        void DoByteplex();
175};
176
177class U32BE_Buffer : public U32_Buffer {
178public:
179        U32BE_Buffer(BytePack * src, int lgth, int BOM);
180        void DoByteplex();
181};
182
183class U32_2143_Buffer : public U32_Buffer {
184public:
185        U32_2143_Buffer(BytePack * src, int lgth, int BOM);
186        void DoByteplex();
187};
188
189class U32_3412_Buffer : public U32_Buffer {
190public:
191        U32_3412_Buffer(BytePack * src, int lgth, int BOM);
192        void DoByteplex();
193};
194
195
196#endif
Note: See TracBrowser for help on using the repository browser.