source: trunk/src/xmlbuffer.h @ 42

Last change on this file since 42 was 42, checked in by cameron, 11 years ago

lib_simd: refactored allocation, bitstream_scan

File size: 5.2 KB
Line 
1/*  xmlbuffer.h - Input buffering for XML entities.
2    Copyright (c) 2007, 2008,  Robert D. Cameron.
3    Licensed to the public under the Open Software License 3.0.
4    Licensed to International Characters, Inc., under the Academic
5    Free License 3.0.
6
7*/
8#ifndef XML_BUFFER_H
9#define XML_BUFFER_H
10
11#include "xmlparam.h"
12#include "../lib/lib_simd.h"
13
14/* The BytePack and the BitBlock are the two fundamental
15   types used by the parabix program for data held in
16   SIMD registers, representing, respectively, the byte-oriented
17   and bit-oriented views of character data.*/
18
19typedef SIMD_type BytePack;
20typedef SIMD_type BitBlock;
21const int PACKSIZE = sizeof(SIMD_type);
22const int BLOCKSIZE = sizeof(SIMD_type) * 8;
23
24/* Define the size of buffer used for lexical analysis/parsing. */
25const int BUFFER_BLOCKS = 1024;
26const int BUFFER_SIZE = BUFFER_BLOCKS * BLOCKSIZE;
27
/* A bytespace test performed near the end of a buffer may involve a
   multibyte literal.  The bytespace buffer must therefore always make
   available at least as many lookahead bytes as the longest such
   literal; this constant is that lookahead allowance. */
static const int LOOKAHEAD_POSITIONS = 16;
34
35
36class XML_Buffer_Interface {
37public:
38        XML_Buffer_Interface ();
39        /* Create and initialize an XML Buffer object for parsing a file,
40           based on autodetection of the character encoding family
41           from the initial 4-byte signature. */
42        static XML_Buffer_Interface * BufferFactory(char * filename);
43        virtual void DoByteplex() = 0;
44        virtual void PreparePseudoASCII_Stream() = 0;
45        virtual void ReadXMLInfo() = 0;
46        virtual void ReadTextDeclaration() = 0;
47        int AvailableUnits(int pos);
48        int ContentStartUnit();
49
50        /* Information returned from the initialization process.
51           See xmlparam.h for types.  */
52        int BOM_units; /* no of initial code units for a Byte Order Mark */
53        XML_version version;
54        bool has_encoding_decl;
55        int encoding_start_pos;
56        int encoding_lgth;
57        CodeUnit_Base code_unit_base;
58        CodeUnit_Size code_unit_size;
59        CodeUnit_ByteOrder byte_order;
60        XML_standalone standalone;
61        int content_start_pos;
62
63        /* Pseudo-ASCII stream. */
64        BytePack * x8data;
65
66protected:
67
68        unsigned int buffer_bytes;
69        unsigned int total_blocks;
70        BytePack * ByteBuffer;
71        int current_unit;
72
73};
74
75template <CodeUnit_Base C>
76class XML_Buffer : public XML_Buffer_Interface {
77public:
78        XML_Buffer (BytePack * src_data, int lgth, int byte_order_units);
79        void ReadXMLInfo();
80        void ReadTextDeclaration();
81        virtual void DoByteplex() = 0;
82        virtual void PreparePseudoASCII_Stream() = 0;
83
84protected:
85
86        void Advance(int n);
87        int AbsPos() const;
88        unsigned char * cur() const;
89
90private:
91        /* Bytespace parsing routines for internal use in ReadXMLInfo and
92           ReadTextDeclaration. */
93        void Scan_WS();
94        void ScanToQuote();
95};
96
97
98/*  Various ASCII based character sets using 8-bit code units are processed
99    using the Extended_ASCII_8_Buffer class.   This includes 7-bit ASCII
100    itself (with high-order bit 0), the ISO-8859 character sets and UTF-8.
101*/
102class Extended_ASCII_8_Buffer : public XML_Buffer<ASCII> {
103public:
104        Extended_ASCII_8_Buffer(BytePack * src, int lgth, int BOM);
105        void DoByteplex();
106        void PreparePseudoASCII_Stream();
107};
108
109/*  The family of 8-bit EBCDIC based character sets are processed using
110    the EBCDIC_Buffer class.
111*/
112class EBCDIC_Buffer : public XML_Buffer<EBCDIC> {
113public:
114        EBCDIC_Buffer(BytePack * src, int lgth, int BOM);
115        void DoByteplex();
116        void PreparePseudoASCII_Stream();
117};
118
119
120
121/*  UTF-16 and UCS-2 character set families in BE and LE byte orders.
122    The U16LE and U16BE subclasses each provide a distinct byteplexer to
123    produce 2 parallel byte streams for the high and low bytes of each
124    16-bit code unit.  Once byteplexing is complete, a generic pseudoASCII
125    conversion routine can be applied at the U16_Buffer level. */
126
127class U16_Buffer : public XML_Buffer<ASCII> {
128public:
129        U16_Buffer(BytePack * src, int lgth, int BOM);
130        virtual void DoByteplex() = 0;
131        void PreparePseudoASCII_Stream();
132protected:
133        BytePack * x16hi;
134        BytePack * x16lo;
135};
136
137class U16LE_Buffer : public U16_Buffer {
138public:
139        U16LE_Buffer(BytePack * src, int lgth, int BOM);
140        void DoByteplex();
141};
142
143class U16BE_Buffer : public U16_Buffer {
144public:
145        U16BE_Buffer(BytePack * src, int lgth, int BOM);
146        void DoByteplex();
147};
148
149
150/*  UTF-32/UCS-4 character sets in BE, LE, 2143 and 3412 byte orders.
151    Each subclass of U32_Buffer provides a distinct byteplexer to
152    produce the 4 parallel byte streams of Unicode data.  Once
153    byteplexing is complete, a generic pseudoASCII routine can
154    be applied. */
155class U32_Buffer : public XML_Buffer<ASCII> {
156public:
157        U32_Buffer(BytePack * src, int lgth, int BOM);
158        virtual void DoByteplex() = 0;
159        void PreparePseudoASCII_Stream();
160protected:
161        BytePack * x32hh;
162        BytePack * x32hl;
163        BytePack * x32lh;
164        BytePack * x32ll;
165};
166
167class U32LE_Buffer : public U32_Buffer {
168public:
169        U32LE_Buffer(BytePack * src, int lgth, int BOM);
170        void DoByteplex();
171};
172
173class U32BE_Buffer : public U32_Buffer {
174public:
175        U32BE_Buffer(BytePack * src, int lgth, int BOM);
176        void DoByteplex();
177};
178
179class U32_2143_Buffer : public U32_Buffer {
180public:
181        U32_2143_Buffer(BytePack * src, int lgth, int BOM);
182        void DoByteplex();
183};
184
185class U32_3412_Buffer : public U32_Buffer {
186public:
187        U32_3412_Buffer(BytePack * src, int lgth, int BOM);
188        void DoByteplex();
189};
190
191
192#endif
Note: See TracBrowser for help on using the repository browser.