source: trunk/src/xmlbuffer.h @ 35

Last change on this file since 35 was 35, checked in by cameron, 11 years ago

Charset Architecture: Byteplexing Buffer Factory

File size: 5.3 KB
RevLine 
[4]1/*  xmlbuffer.h - Input buffering for XML entities.
[35]2    Copyright (c) 2007, 2008,  Robert D. Cameron.
[4]3    Licensed to the public under the Open Software License 3.0.
4    Licensed to International Characters, Inc., under the Academic
5    Free License 3.0.
6
7*/
8#ifndef XML_BUFFER_H
9#define XML_BUFFER_H
10
[35]11#include "xmlparam.h"
[4]12
[35]13#ifdef __i386
14#include "../lib/sse_simd.h"
15#endif
16#ifdef _ARCH_PPC
17#include "../lib/altivec_simd.h"
18#endif
[4]19
20
[35]21/* The BytePack and the BitBlock are the two fundamental
22   types used by the parabix program for data held in
23   SIMD registers, representing, respectively, the byte-oriented
24   and bit-oriented views of character data.*/
25
26typedef SIMD_type BytePack;
27typedef SIMD_type BitBlock;
28const int PACKSIZE = sizeof(SIMD_type);
29const int BLOCKSIZE = sizeof(SIMD_type) * 8;
30
31/* Define the size of buffer used for lexical analysis/parsing. */
32const int BUFFER_BLOCKS = 1024;
33const int BUFFER_SIZE = BUFFER_BLOCKS * BLOCKSIZE;
34
35/* When working near the end of a buffer, a bytespace test may involve
36   a multibyte literal.  The bytespace buffer must always make available
37   a number of lookahead bytes at least equal to the maximum length of any
38   such literal. */
39
40const int LOOKAHEAD_POSITIONS = 16;
41
42
43class XML_Buffer_Interface {
[4]44public:
[35]45        XML_Buffer_Interface ();
46        /* Create and initialize an XML Buffer object for parsing a file,
47           based on autodetection of the character encoding family
48           from the initial 4-byte signature. */
49        static XML_Buffer_Interface * BufferFactory(char * filename);
50        virtual void DoByteplex() = 0;
51        virtual void PreparePseudoASCII_Stream() = 0;
52        virtual void ReadXMLInfo() = 0;
53        virtual void ReadTextDeclaration() = 0;
54        int AvailableUnits(int pos);
55        int ContentStartUnit();
56
57        /* Information returned from the initialization process.
58           See xmlparam.h for types.  */
59        int BOM_units; /* no of initial code units for a Byte Order Mark */
60        XML_version version;
61        bool has_encoding_decl;
62        int encoding_start_pos;
63        int encoding_lgth;
64        CodeUnit_Base code_unit_base;
65        CodeUnit_Size code_unit_size;
66        CodeUnit_ByteOrder byte_order;
67        XML_standalone standalone;
68        int content_start_pos;
69
70        /* Pseudo-ASCII stream. */
71        BytePack * x8data;
72
73protected:
74
[4]75        unsigned int buffer_bytes;
[35]76        unsigned int total_blocks;
77        BytePack * ByteBuffer;
78        int current_unit;
[4]79
80};
81
[35]82template <CodeUnit_Base C>
83class XML_Buffer : public XML_Buffer_Interface {
84public:
85        XML_Buffer (BytePack * src_data, int lgth, int byte_order_units);
86        void ReadXMLInfo();
87        void ReadTextDeclaration();
88        virtual void DoByteplex() = 0;
89        virtual void PreparePseudoASCII_Stream() = 0;
90
91protected:
92
93        void Advance(int n);
94        int AbsPos() const;
95        unsigned char * cur() const;
96
97private:
98        /* Bytespace parsing routines for internal use in ReadXMLInfo and
99           ReadTextDeclaration. */
100        void Scan_WS();
101        void ScanToQuote();
102};
103
104
105/*  Various ASCII based character sets using 8-bit code units are processed
106    using the Extended_ASCII_8_Buffer class.   This includes 7-bit ASCII
107    itself (with high-order bit 0), the ISO-8859 character sets and UTF-8.
108*/
109class Extended_ASCII_8_Buffer : public XML_Buffer<ASCII> {
110public:
111        Extended_ASCII_8_Buffer(BytePack * src, int lgth, int BOM);
112        void DoByteplex();
113        void PreparePseudoASCII_Stream();
114};
115
116/*  The family of 8-bit EBCDIC based character sets are processed using
117    the EBCDIC_Buffer class.
118*/
119class EBCDIC_Buffer : public XML_Buffer<EBCDIC> {
120public:
121        EBCDIC_Buffer(BytePack * src, int lgth, int BOM);
122        void DoByteplex();
123        void PreparePseudoASCII_Stream();
124};
125
126
127
128/*  UTF-16 and UCS-4 character set families in BE and LE byte orders.
129    The U16LE and U16BE subclasses each provide a distinct byteplexer to
130    produce 2 parallel byte streams for the high and low bytes of each
131    16-bit code unit.  Once byteplexing is complete, a generic pseudoASCII
132    conversion routine can be applied at the U16_Buffer level. */
133
134class U16_Buffer : public XML_Buffer<ASCII> {
135public:
136        U16_Buffer(BytePack * src, int lgth, int BOM);
137        virtual void DoByteplex() = 0;
138        void PreparePseudoASCII_Stream();
139protected:
140        BytePack * x16hi;
141        BytePack * x16lo;
142};
143
144class U16LE_Buffer : public U16_Buffer {
145public:
146        U16LE_Buffer(BytePack * src, int lgth, int BOM);
147        void DoByteplex();
148};
149
150class U16BE_Buffer : public U16_Buffer {
151public:
152        U16BE_Buffer(BytePack * src, int lgth, int BOM);
153        void DoByteplex();
154};
155
156
157/*  UTF-32/UCS-4 character sets in BE, LE, 2143 and 3412 byte orders.
158    Each subclass of U32_Buffer provide a distinct byteplexer to
159    produce the 4 parallel byte streams of Unicode data.  Once
160    byteplexing is complete, a generic pseudoASCII routine can
161    be applied. */
162class U32_Buffer : public XML_Buffer<ASCII> {
163public:
164        U32_Buffer(BytePack * src, int lgth, int BOM);
165        virtual void DoByteplex() = 0;
166        void PreparePseudoASCII_Stream();
167protected:
168        BytePack * x32hh;
169        BytePack * x32hl;
170        BytePack * x32lh;
171        BytePack * x32ll;
172};
173
174class U32LE_Buffer : public U32_Buffer {
175public:
176        U32LE_Buffer(BytePack * src, int lgth, int BOM);
177        void DoByteplex();
178};
179
180class U32BE_Buffer : public U32_Buffer {
181public:
182        U32BE_Buffer(BytePack * src, int lgth, int BOM);
183        void DoByteplex();
184};
185
186class U32_2143_Buffer : public U32_Buffer {
187public:
188        U32_2143_Buffer(BytePack * src, int lgth, int BOM);
189        void DoByteplex();
190};
191
192class U32_3412_Buffer : public U32_Buffer {
193public:
194        U32_3412_Buffer(BytePack * src, int lgth, int BOM);
195        void DoByteplex();
196};
197
198
[4]199#endif
Note: See TracBrowser for help on using the repository browser.