source: trunk/src/byteplex.h @ 82

Last change on this file since 82 was 82, checked in by cameron, 11 years ago

Byteplex documentation

File size: 6.9 KB
Line 
1/*  byteplex.h - Parallel byte stream module.
2    Copyright (c) 2008,  Robert D. Cameron.
3    Licensed to the public under the Open Software License 3.0.
4    Licensed to International Characters, Inc., under the Academic
5    Free License 3.0.
6
7    This module has as its goal the buffering of XML byte data and
8    transformation of 16-bit and 32-bit code unit data so that the
9    parsing engine is provided a uniform representation based on
10    the concept of an 8-bit pseudo-ASCII representation (x8data).
11
12    A Byteplex object provides buffers for one to six parallel data
13    streams for an XML input entity, depending on the size of
14    character code units. 
15       1.  In the case of 8-bit code units, a single byte stream
16           consisting of unmodified input data is maintained.
17           x8data = src_buffer
18       2.  In the case of 16-bit code units (UTF-16 and UCS-2 families),
19             (a) the original code unit stream is maintained unmodified,
20             (b) the x16hi byte stream is established for the high byte
21                 of each code unit,
22             (c) the x16lo byte stream is established for the low byte
23                 of each code unit, and
24             (d) x8data is established as the pseudo-ASCII byte stream,
25                 with ASCII code units having their proper 8-bit values,
26                 and all others having bit 0 set to 1.
27       3.  In the case of 32-bit code units (UTF-32 family),
28             (a) the original code unit stream is maintained unmodified,
29             (b) the x32hh byte stream has high bytes of each code unit
30             (c) the x32hl byte stream has second bytes of each code unit
31             (d) the x32lh byte stream has third bytes of each code unit
32             (e) the x32ll byte stream has low bytes of each code unit, and
33             (f) x8data is established as the pseudo-ASCII byte stream,
34                 with ASCII code units having their proper 8-bit values,
35                 and all others having bit 0 set to 1.
36
37    The pseudo-ASCII representation is defined for both ASCII-based
38    and EBCDIC-based character sets such that all characters in
39    the ASCII repertoire (i.e., having Unicode code points from 0x00
40    to 0x7F), are represented as themselves and no non-ASCII character
41    is represented as a character in the ASCII repertoire.
42
43*/
44
45
46#ifndef BYTEPLEX_H
47#define BYTEPLEX_H
48
49#include "xmlmodel.h"
50#include "../lib/lib_simd.h"
51
52/* The BytePack and the BitBlock are the two fundamental
53   types used by the parabix program for data held in
54   SIMD registers, representing, respectively, the byte-oriented
55   and bit-oriented views of character data.*/
56
57typedef SIMD_type BytePack;
58typedef SIMD_type BitBlock;
59const int PACKSIZE = sizeof(SIMD_type);
60const int BLOCKSIZE = sizeof(SIMD_type) * 8;
61
62/* Define the size of buffer used for lexical analysis/parsing. */
63const int BUFFER_BLOCKS = 1024;
64const int BUFFER_SIZE = BUFFER_BLOCKS * BLOCKSIZE;
65
66/* When working near the end of a buffer, a bytespace test may involve
67   a multibyte literal.  The bytespace buffer must always make available
68   a number of lookahead bytes at least equal to the maximum length of any
69   such literal. */
70
71const int LOOKAHEAD_POSITIONS = 16;
72const int BYTEPLEX_SIZE = BUFFER_SIZE + LOOKAHEAD_POSITIONS;
73
74class Byteplex {
75public:
76        static Byteplex * ByteplexFactory(Model_Info * m, FILE * inputfile);
77        virtual void DoByteplex() = 0;
78        virtual void PreparePseudoASCII_Stream() = 0;
79        virtual void InitializeBuffer(unsigned char * src, int lgth) = 0;
80        virtual void AdvanceInputBuffer(int advance_amt) = 0;
81
82        FILE * infile;
83
84        /* Source code unit buffer. */
85        BytePack * src_buffer;
86        int units_in_buffer;
87
88        /* Pseudo-ASCII stream. */
89        BytePack * x8data;
90
91protected:
92
93        unsigned int packs_in_buffer;
94        void Set_limits(int units_in_buffer);
95
96};
97
98template <CodeUnit_Base C>
99class XML_Buffer : public Byteplex {
100public:
101        XML_Buffer (FILE * infile);
102        ~XML_Buffer();
103       
104        virtual void DoByteplex() = 0;
105        virtual void PreparePseudoASCII_Stream() = 0;
106        virtual void AdvanceInputBuffer(int advance_amt) = 0;
107        virtual void InitializeBuffer(unsigned char * src, int lgth) = 0;
108protected:
109        int CopyAndFill(unsigned char * bytes_to_copy, int lgth, int bytes_to_read);
110};
111
112
113/*  Various ASCII based character sets using 8-bit code units are processed
114    using the Extended_ASCII_8_Buffer class.   This includes 7-bit ASCII
115    itself (with high-order bit 0), the ISO-8859 character sets and UTF-8.
116*/
117class Extended_ASCII_8_Buffer : public XML_Buffer<ASCII> {
118public:
119        Extended_ASCII_8_Buffer(FILE * infile);
120        void DoByteplex();
121        void PreparePseudoASCII_Stream();
122        void AdvanceInputBuffer(int advance_amt);
123        void InitializeBuffer(unsigned char * src, int lgth);
124};
125
126/*  The family of 8-bit EBCDIC based character sets is processed using
127    the EBCDIC_Buffer class.
128*/
129class EBCDIC_Buffer : public XML_Buffer<EBCDIC> {
130public:
131        EBCDIC_Buffer(FILE * infile);
132        void DoByteplex();
133        void PreparePseudoASCII_Stream();
134        void AdvanceInputBuffer(int advance_amt);
135        void InitializeBuffer(unsigned char * src, int lgth);
136};
137
138
139
140/*  UTF-16 and UCS-2 character set families in BE and LE byte orders.
141    The U16LE and U16BE subclasses each provide a distinct byteplexer to
142    produce 2 parallel byte streams for the high and low bytes of each
143    16-bit code unit.  Once byteplexing is complete, a generic pseudoASCII
144    conversion routine can be applied at the U16_Buffer level. */
145
146class U16_Buffer : public XML_Buffer<ASCII> {
147public:
148        U16_Buffer(FILE * infile);
149        ~U16_Buffer();
150        virtual void DoByteplex() = 0;
151        void PreparePseudoASCII_Stream();
152        void AdvanceInputBuffer(int advance_amt);
153        void InitializeBuffer(unsigned char * src, int lgth);
154protected:
155        BytePack * x16hi;
156        BytePack * x16lo;
157};
158
159class U16LE_Buffer : public U16_Buffer {
160public:
161        U16LE_Buffer(FILE * infile);
162        void DoByteplex();
163};
164
165class U16BE_Buffer : public U16_Buffer {
166public:
167        U16BE_Buffer(FILE * infile);
168        void DoByteplex();
169};
170
171
172/*  UTF-32/UCS-4 character sets in BE, LE, 2143 and 3412 byte orders.
173    Each subclass of U32_Buffer provides a distinct byteplexer to
174    produce the 4 parallel byte streams of Unicode data.  Once
175    byteplexing is complete, a generic pseudoASCII routine can
176    be applied. */
177class U32_Buffer : public XML_Buffer<ASCII> {
178public:
179        U32_Buffer(FILE * infile);
180        ~U32_Buffer();
181        virtual void DoByteplex() = 0;
182        void PreparePseudoASCII_Stream();
183        void AdvanceInputBuffer(int advance_amt);
184        void InitializeBuffer(unsigned char * src, int lgth);
185protected:
186        BytePack * x32hh;
187        BytePack * x32hl;
188        BytePack * x32lh;
189        BytePack * x32ll;
190};
191
192class U32LE_Buffer : public U32_Buffer {
193public:
194        U32LE_Buffer(FILE * infile);
195        void DoByteplex();
196};
197
198class U32BE_Buffer : public U32_Buffer {
199public:
200        U32BE_Buffer(FILE * infile);
201        void DoByteplex();
202};
203
204class U32_2143_Buffer : public U32_Buffer {
205public:
206        U32_2143_Buffer(FILE * infile);
207        void DoByteplex();
208};
209
210class U32_3412_Buffer : public U32_Buffer {
211public:
212        U32_3412_Buffer(FILE * infile);
213        void DoByteplex();
214};
215
216#endif
Note: See TracBrowser for help on using the repository browser.