source: trunk/src/byteplex.h @ 4277

Last change on this file since 4277 was 1476, checked in by ksherdy, 8 years ago

Centralized typedefs.

File size: 7.6 KB
Line 
1/*  byteplex.h - Parallel byte stream module.
2    Copyright (c) 2008,  Robert D. Cameron.
3    Licensed to the public under the Open Software License 3.0.
4    Licensed to International Characters, Inc., under the Academic
5    Free License 3.0.
6
7    This module has as its goal the buffering of XML byte data and
8    transformation of 16-bit and 32-bit code unit data so that the
9    parsing engine is provided a uniform representation based on
10    the concept of an 8-bit pseudo-ASCII representation (x8data).
11
12    A Byteplex object provides buffers for one to six parallel data
13    streams for an XML input entity, depending on the size of
14    character code units. 
15       1.  In the case of 8-bit code units, a single byte stream
16           consisting of unmodified input data is maintained.
17           x8data = src_buffer
18       2.  In the case of 16-bit code units (UTF-16 and UCS-2 families),
19             (a) the original code unit stream is maintained unmodified,
20             (b) the x16hi byte stream is established for the high byte
21                 of each code unit,
22             (c) the x16lo byte stream is established for the low byte
23                 of each code unit, and
24             (d) x8data is established as the pseudo-ASCII byte stream,
25                 with ASCII code units having their proper 8-bit values,
26                 and all others having bit 0 set to 1.
27       3.  In the case of 32-bit code units (UTF-32 family),
28             (a) the original code unit stream is maintained unmodified,
29             (b) the x32hh byte stream has high bytes of each code unit
30             (c) the x32hl byte stream has second bytes of each code unit
31             (d) the x32lh byte stream has third bytes of each code unit
32             (e) the x32ll byte stream has low bytes of each code unit, and
33             (f) x8data is established as the pseudo-ASCII byte stream,
34                 with ASCII code units having their proper 8-bit values,
35                 and all others having bit 0 set to 1.
36
37    The pseudo-ASCII representation is defined for both ASCII-based
38    and EBCDIC-based character sets such that all characters in
39    the ASCII repertoire (i.e., having Unicode code points from 0x00
40    to 0x7F), are represented as themselves and no non-ASCII character
41    is represented as a character in the ASCII repertoire.
42
43*/
44
45#ifndef BYTEPLEX_H
46#define BYTEPLEX_H
47
48#include "xmldecl.h"
49#include "../lib/lib_simd.h"
50
51/* The BytePack and the BitBlock are the two fundamental
52   types used by the parabix program for data held in
53   SIMD registers, representing, respectively, the byte-oriented
54   and bit-oriented views of character data.*/
55
56const int PACKSIZE = sizeof(SIMD_type);
57const int BLOCKSIZE = sizeof(SIMD_type) * 8;
58
59/* Define the size of buffer used for lexical analysis/parsing. */
60const int BUFFER_BLOCKS = 781;
61const int BUFFER_SIZE = BUFFER_BLOCKS * BLOCKSIZE;
62
63/* When working near the end of a buffer, a bytespace test may involve
64   a multibyte literal.  The bytespace buffer must always make available
65   a number of lookahead bytes at least equal to the maximum length of any
66   such literal. */
67
68const int LOOKAHEAD_POSITIONS = 16;
69const int BYTEPLEX_SIZE = BUFFER_SIZE + LOOKAHEAD_POSITIONS;
70
71class Byteplex {
72public:
73        virtual ~Byteplex();
74        static Byteplex * ByteplexFactory(Entity_Info * e);
75        static Byteplex * ByteplexFactory(Entity_Info * e, FILE * inputfile);
76        static Byteplex * ByteplexFactory(Entity_Info * e, unsigned char * buffer_bytes, int buffer_size);
77        virtual void DoByteplex() = 0;
78        virtual void PreparePseudoASCII_Stream() = 0;
79        virtual void InitializeBuffer(unsigned char * src, int lgth) = 0;
80        virtual void AdvanceInputBuffer(int advance_amt) = 0;
81        virtual int UTF8_Length(int name_pos, int lgth)=0;
82        virtual void to_UTF8(int name_pos, int lgth, char * u8_ptr)=0;
83        /* Source code unit buffer. */
84        BytePack * src_buffer;
85        int units_in_buffer;
86
87        /* Pseudo-ASCII stream. */
88        BytePack * x8data;
89
90protected:
91        FILE * infile;
92        int packs_in_buffer;
93        int CopyAndFill(unsigned char * bytes_to_copy, int lgth, int bytes_to_read);
94        void Set_limits(int units_in_buffer);
95
96};
97
98
99/*  The X8_Buffer template class is used for either ASCII- or EBCDIC-
100    based 8-bit code units.
101    The X8_Buffer<ASCII> class includes 7-bit ASCII
102    (with high-order bit 0), the ISO-8859 character sets and UTF-8.
103
104    The family of 8-bit EBCDIC based character sets are processed using
105    the X8_Buffer<EBCDIC> class.
106*/
107
108template <CodeUnit_Base C>
109class X8_Buffer : public Byteplex {
110public:
111        static const CodeUnit_Base Base = C;
112        static const CodeUnit_Size Size = SingleByte;
113        X8_Buffer();
114        virtual ~X8_Buffer();
115
116        void DoByteplex();
117        void PreparePseudoASCII_Stream();
118        void AdvanceInputBuffer(int advance_amt);
119        void InitializeBuffer(unsigned char * src, int lgth);
120        int UTF8_Length(int name_pos, int lgth);
121        void to_UTF8(int name_pos, int lgth, char * u8_ptr);
122};
123
124class UTF8_Buffer : public Byteplex {
125public:
126        static const CodeUnit_Base Base = ASCII;
127        static const CodeUnit_Size Size = SingleByte;
128        UTF8_Buffer();
129        virtual ~UTF8_Buffer();
130
131        void DoByteplex();
132        void PreparePseudoASCII_Stream();
133        void AdvanceInputBuffer(int advance_amt);
134        void InitializeBuffer(unsigned char * src, int lgth);
135        int UTF8_Length(int name_pos, int lgth);
136        void to_UTF8(int name_pos, int lgth, char * u8_ptr);
137};
138
139
140/*  UTF-16 and UCS-2 character set families in BE and LE byte orders.
141    The U16LE and U16BE subclasses each provide a distinct byteplexer to
142    produce 2 parallel byte streams for the high and low bytes of each
143    16-bit code unit.  Once byteplexing is complete, a generic pseudoASCII
144    conversion routine can be applied at the U16_Buffer level. */
145
146class U16_Buffer : public Byteplex {
147public:
148        static const CodeUnit_Base Base = ASCII;
149        static const CodeUnit_Size Size = DoubleByte;
150        U16_Buffer();
151        virtual ~U16_Buffer();
152        virtual void DoByteplex() = 0;
153        void PreparePseudoASCII_Stream();
154        void AdvanceInputBuffer(int advance_amt);
155        void Validate_UTF16();
156        void Validate_UCS2();
157        void InitializeBuffer(unsigned char * src, int lgth);
158        int UTF8_Length(int name_pos, int lgth);
159        void to_UTF8(int name_pos, int lgth, char * u8_ptr);
160protected:
161        BytePack * x16hi;
162        BytePack * x16lo;
163};
164
165class U16LE_Buffer : public U16_Buffer {
166public:
167        U16LE_Buffer();
168        void DoByteplex();
169};
170
171class U16BE_Buffer : public U16_Buffer {
172public:
173        U16BE_Buffer();
174        void DoByteplex();
175};
176
177
178/*  UTF-32/UCS-4 character sets in BE, LE, 2143 and 3412 byte orders.
179    Each subclass of U32_Buffer provides a distinct byteplexer to
180    produce the 4 parallel byte streams of Unicode data.  Once
181    byteplexing is complete, a generic pseudoASCII routine can
182    be applied. */
183class U32_Buffer : public Byteplex {
184public:
185        static const CodeUnit_Base Base = ASCII;
186        static const CodeUnit_Size Size = QuadByte;
187        U32_Buffer();
188        virtual ~U32_Buffer();
189        virtual void DoByteplex() = 0;
190        void PreparePseudoASCII_Stream();
191        void AdvanceInputBuffer(int advance_amt);
192        void Validate_UTF32();
193        void InitializeBuffer(unsigned char * src, int lgth);
194        int UTF8_Length(int name_pos, int lgth);
195        void to_UTF8(int name_pos, int lgth, char * u8_ptr);
196protected:
197        BytePack * x32hh;
198        BytePack * x32hl;
199        BytePack * x32lh;
200        BytePack * x32ll;
201};
202
203class U32LE_Buffer : public U32_Buffer {
204public:
205        U32LE_Buffer();
206        void DoByteplex();
207};
208
209class U32BE_Buffer : public U32_Buffer {
210public:
211        U32BE_Buffer();
212        void DoByteplex();
213};
214
215class U32_2143_Buffer : public U32_Buffer {
216public:
217        U32_2143_Buffer();
218        void DoByteplex();
219};
220
221class U32_3412_Buffer : public U32_Buffer {
222public:
223        U32_3412_Buffer();
224        void DoByteplex();
225};
226
227
/* Returns a freshly allocated, NUL-terminated copy of the first lgth
   bytes starting at s.  The copy is created with new[]; the caller
   owns it and has to release it with delete [].

   s is a pointer to const so that read-only data (string literals,
   const buffers) can be copied as well; existing callers passing a
   char * are unaffected.  A negative lgth is treated as zero instead
   of reaching memcpy as a huge unsigned count and indexing before the
   start of the allocation. */
inline char * copy_name (const char * s, int lgth){
        const int n = lgth > 0 ? lgth : 0;
        char * d = new char[n + 1];
        /* Skip the copy for an empty name so that s is never touched
           when there is nothing to read from it. */
        if (n > 0) memcpy(d, s, n);
        d[n] = '\0';
        return d;
}
234
235#endif
236
237
Note: See TracBrowser for help on using the repository browser.