source: trunk/src/byteplex.h @ 83

Last change on this file since 83 was 83, checked in by cameron, 11 years ago

Byteplex.h excess */ deletion

File size: 6.9 KB
Line 
1/*  byteplex.h - Parallel byte stream module.
2    Copyright (c) 2008,  Robert D. Cameron.
3    Licensed to the public under the Open Software License 3.0.
4    Licensed to International Characters, Inc., under the Academic
5    Free License 3.0.
6
7    This module has as its goal the buffering of XML byte data and
8    transformation of 16-bit and 32-bit code unit data so that the
9    parsing engine is provided a uniform representation based on
10    the concept of an 8-bit pseudo-ASCII representation (x8data).
11
12    A Byteplex object provides buffers for one to six parallel data
13    streams for an XML input entity, depending on the size of
14    character code units. 
15       1.  In the case of 8-bit code units, a single byte stream
16           consisting of unmodified input data is maintained.
17           x8data = src_buffer
18       2.  In the case of 16-bit code units (UTF-16 and UCS-2 families),
19             (a) the original code unit stream is maintained unmodified,
20             (b) the x16hi byte stream is established for the high byte
21                 of each code unit,
22             (c) the x16lo byte stream is established for the low byte
23                 of each code unit, and
24             (d) x8data is established as the pseudo-ASCII byte stream,
25                 with ASCII code units having their proper 8-bit values,
26                 and all others having bit 0 set to 1.
27       3.  In the case of 32-bit code units (UTF-32 family),
28             (a) the original code unit stream is maintained unmodified,
29             (b) the x32hh byte stream has high bytes of each code unit
30             (c) the x32hl byte stream has second bytes of each code unit
31             (d) the x32lh byte stream has third bytes of each code unit
32             (e) the x32ll byte stream has low bytes of each code unit, and
33             (f) x8data is established as the pseudo-ASCII byte stream,
34                 with ASCII code units having their proper 8-bit values,
35                 and all others having bit 0 set to 1.
36
37    The pseudo-ASCII representation is defined for both ASCII-based
38    and EBCDIC-based character sets such that all characters in
39    the ASCII repertoire (i.e., having Unicode code points from 0x00
40    to 0x7F), are represented as themselves and no non-ASCII character
41    is represented as a character in the ASCII repertoire.
42
43*/
44
45#ifndef BYTEPLEX_H
46#define BYTEPLEX_H
47
48#include "xmlmodel.h"
49#include "../lib/lib_simd.h"
50
51/* The BytePack and the BitBlock are the two fundamental
52   types used by the parabix program for data held in
53   SIMD registers, representing, respectively, the byte-oriented
54   and bit-oriented views of character data.*/
55
56typedef SIMD_type BytePack;
57typedef SIMD_type BitBlock;
58const int PACKSIZE = sizeof(SIMD_type);
59const int BLOCKSIZE = sizeof(SIMD_type) * 8;
60
61/* Define the size of buffer used for lexical analysis/parsing. */
62const int BUFFER_BLOCKS = 1024;
63const int BUFFER_SIZE = BUFFER_BLOCKS * BLOCKSIZE;
64
65/* When working near the end of a buffer, a bytespace test may involve
66   a multibyte literal.  The bytespace buffer must always make available
67   a number of lookahead bytes at least equal to the maximum length of any
68   such literal. */
69
70const int LOOKAHEAD_POSITIONS = 16;
71const int BYTEPLEX_SIZE = BUFFER_SIZE + LOOKAHEAD_POSITIONS;
72
73class Byteplex {
74public:
75        static Byteplex * ByteplexFactory(Model_Info * m, FILE * inputfile);
76        virtual void DoByteplex() = 0;
77        virtual void PreparePseudoASCII_Stream() = 0;
78        virtual void InitializeBuffer(unsigned char * src, int lgth) = 0;
79        virtual void AdvanceInputBuffer(int advance_amt) = 0;
80
81        FILE * infile;
82
83        /* Source code unit buffer. */
84        BytePack * src_buffer;
85        int units_in_buffer;
86
87        /* Pseudo-ASCII stream. */
88        BytePack * x8data;
89
90protected:
91
92        unsigned int packs_in_buffer;
93        void Set_limits(int units_in_buffer);
94
95};
96
97template <CodeUnit_Base C>
98class XML_Buffer : public Byteplex {
99public:
100        XML_Buffer (FILE * infile);
101        ~XML_Buffer();
102       
103        virtual void DoByteplex() = 0;
104        virtual void PreparePseudoASCII_Stream() = 0;
105        virtual void AdvanceInputBuffer(int advance_amt) = 0;
106        virtual void InitializeBuffer(unsigned char * src, int lgth) = 0;
107protected:
108        int CopyAndFill(unsigned char * bytes_to_copy, int lgth, int bytes_to_read);
109};
110
111
112/*  Various ASCII based character sets using 8-bit code units are processed
113    using the Extended_ASCII_8_Buffer class.   This includes 7-bit ASCII
114    itself (with high-order bit 0), the ISO-8859 character sets and UTF-8.
115*/
116class Extended_ASCII_8_Buffer : public XML_Buffer<ASCII> {
117public:
118        Extended_ASCII_8_Buffer(FILE * infile);
119        void DoByteplex();
120        void PreparePseudoASCII_Stream();
121        void AdvanceInputBuffer(int advance_amt);
122        void InitializeBuffer(unsigned char * src, int lgth);
123};
124
125/*  The family of 8-bit EBCDIC based character sets are processed using
126    the EBCDIC_Buffer class.
127*/
128class EBCDIC_Buffer : public XML_Buffer<EBCDIC> {
129public:
130        EBCDIC_Buffer(FILE * infile);
131        void DoByteplex();
132        void PreparePseudoASCII_Stream();
133        void AdvanceInputBuffer(int advance_amt);
134        void InitializeBuffer(unsigned char * src, int lgth);
135};
136
137
138
139/*  UTF-16 and UCS-2 character set families in BE and LE byte orders.
140    The U16LE and U16BE subclasses each provide a distinct byteplexer to
141    produce 2 parallel byte streams for the high and low bytes of each
142    16-bit code unit.  Once byteplexing is complete, a generic pseudoASCII
143    conversion routine can be applied at the U16_Buffer level. */
144
145class U16_Buffer : public XML_Buffer<ASCII> {
146public:
147        U16_Buffer(FILE * infile);
148        ~U16_Buffer();
149        virtual void DoByteplex() = 0;
150        void PreparePseudoASCII_Stream();
151        void AdvanceInputBuffer(int advance_amt);
152        void InitializeBuffer(unsigned char * src, int lgth);
153protected:
154        BytePack * x16hi;
155        BytePack * x16lo;
156};
157
158class U16LE_Buffer : public U16_Buffer {
159public:
160        U16LE_Buffer(FILE * infile);
161        void DoByteplex();
162};
163
164class U16BE_Buffer : public U16_Buffer {
165public:
166        U16BE_Buffer(FILE * infile);
167        void DoByteplex();
168};
169
170
171/*  UTF-32/UCS-4 character sets in BE, LE, 2143 and 3412 byte orders.
172    Each subclass of U32_Buffer provides a distinct byteplexer to
173    produce the 4 parallel byte streams of Unicode data.  Once
174    byteplexing is complete, a generic pseudoASCII routine can
175    be applied. */
176class U32_Buffer : public XML_Buffer<ASCII> {
177public:
178        U32_Buffer(FILE * infile);
179        ~U32_Buffer();
180        virtual void DoByteplex() = 0;
181        void PreparePseudoASCII_Stream();
182        void AdvanceInputBuffer(int advance_amt);
183        void InitializeBuffer(unsigned char * src, int lgth);
184protected:
185        BytePack * x32hh;
186        BytePack * x32hl;
187        BytePack * x32lh;
188        BytePack * x32ll;
189};
190
191class U32LE_Buffer : public U32_Buffer {
192public:
193        U32LE_Buffer(FILE * infile);
194        void DoByteplex();
195};
196
197class U32BE_Buffer : public U32_Buffer {
198public:
199        U32BE_Buffer(FILE * infile);
200        void DoByteplex();
201};
202
203class U32_2143_Buffer : public U32_Buffer {
204public:
205        U32_2143_Buffer(FILE * infile);
206        void DoByteplex();
207};
208
209class U32_3412_Buffer : public U32_Buffer {
210public:
211        U32_3412_Buffer(FILE * infile);
212        void DoByteplex();
213};
214
215#endif
Note: See TracBrowser for help on using the repository browser.