source: trunk/src/byteplex.h @ 78

Last change on this file since 78 was 68, checked in by cameron, 11 years ago

Parallel Byte Stream Module refactoring

File size: 4.8 KB
Line 
1/*  byteplex.h - Parallel byte stream module.
2    Copyright (c) 2008,  Robert D. Cameron.
3    Licensed to the public under the Open Software License 3.0.
4    Licensed to International Characters, Inc., under the Academic
5    Free License 3.0.
6
7*/
8#ifndef BYTEPLEX_H
9#define BYTEPLEX_H
10
11#include "xmlmodel.h"
12#include "../lib/lib_simd.h"
13
14/* The BytePack and the BitBlock are the two fundamental
15   types used by the parabix program for data held in
16   SIMD registers, representing, respectively, the byte-oriented
17   and bit-oriented views of character data.*/
18
19typedef SIMD_type BytePack;
20typedef SIMD_type BitBlock;
21const int PACKSIZE = sizeof(SIMD_type);
22const int BLOCKSIZE = sizeof(SIMD_type) * 8;
23
24/* Define the size of buffer used for lexical analysis/parsing. */
25const int BUFFER_BLOCKS = 1024;
26const int BUFFER_SIZE = BUFFER_BLOCKS * BLOCKSIZE;
27
28/* When working near the end of a buffer, a bytespace test may involve
29   a multibyte literal.  The bytespace buffer must always make available
30   a number of lookahead bytes at least equal to the maximum length of any
31   such literal. */
32
33const int LOOKAHEAD_POSITIONS = 16;
34const int BYTEPLEX_SIZE = BUFFER_SIZE + LOOKAHEAD_POSITIONS;
35
36class Byteplex {
37public:
38        static Byteplex * ByteplexFactory(Model_Info * m, FILE * inputfile);
39        virtual void DoByteplex() = 0;
40        virtual void PreparePseudoASCII_Stream() = 0;
41        virtual void InitializeBuffer(unsigned char * src, int lgth) = 0;
42        virtual void AdvanceInputBuffer(int advance_amt) = 0;
43
44        FILE * infile;
45
46        /* Source code unit buffer. */
47        BytePack * src_buffer;
48        int units_in_buffer;
49
50        /* Pseudo-ASCII stream. */
51        BytePack * x8data;
52
53protected:
54
55        unsigned int packs_in_buffer;
56        void Set_limits(int units_in_buffer);
57
58};
59
60template <CodeUnit_Base C>
61class XML_Buffer : public Byteplex {
62public:
63        XML_Buffer (FILE * infile);
64        ~XML_Buffer();
65       
66        virtual void DoByteplex() = 0;
67        virtual void PreparePseudoASCII_Stream() = 0;
68        virtual void AdvanceInputBuffer(int advance_amt) = 0;
69        virtual void InitializeBuffer(unsigned char * src, int lgth) = 0;
70protected:
71        int CopyAndFill(unsigned char * bytes_to_copy, int lgth, int bytes_to_read);
72};
73
74
75/*  Various ASCII based character sets using 8-bit code units are processed
76    using the Extended_ASCII_8_Buffer class.   This includes 7-bit ASCII
77    itself (with high-order bit 0), the ISO-8859 character sets and UTF-8.
78*/
79class Extended_ASCII_8_Buffer : public XML_Buffer<ASCII> {
80public:
81        Extended_ASCII_8_Buffer(FILE * infile);
82        void DoByteplex();
83        void PreparePseudoASCII_Stream();
84        void AdvanceInputBuffer(int advance_amt);
85        void InitializeBuffer(unsigned char * src, int lgth);
86};
87
88/*  The family of 8-bit EBCDIC based character sets are processed using
89    the EBCDIC_Buffer class.
90*/
91class EBCDIC_Buffer : public XML_Buffer<EBCDIC> {
92public:
93        EBCDIC_Buffer(FILE * infile);
94        void DoByteplex();
95        void PreparePseudoASCII_Stream();
96        void AdvanceInputBuffer(int advance_amt);
97        void InitializeBuffer(unsigned char * src, int lgth);
98};
99
100
101
102/*  UTF-16 and UCS-4 character set families in BE and LE byte orders.
103    The U16LE and U16BE subclasses each provide a distinct byteplexer to
104    produce 2 parallel byte streams for the high and low bytes of each
105    16-bit code unit.  Once byteplexing is complete, a generic pseudoASCII
106    conversion routine can be applied at the U16_Buffer level. */
107
108class U16_Buffer : public XML_Buffer<ASCII> {
109public:
110        U16_Buffer(FILE * infile);
111        ~U16_Buffer();
112        virtual void DoByteplex() = 0;
113        void PreparePseudoASCII_Stream();
114        void AdvanceInputBuffer(int advance_amt);
115        void InitializeBuffer(unsigned char * src, int lgth);
116protected:
117        BytePack * x16hi;
118        BytePack * x16lo;
119};
120
121class U16LE_Buffer : public U16_Buffer {
122public:
123        U16LE_Buffer(FILE * infile);
124        void DoByteplex();
125};
126
127class U16BE_Buffer : public U16_Buffer {
128public:
129        U16BE_Buffer(FILE * infile);
130        void DoByteplex();
131};
132
133
134/*  UTF-32/UCS-4 character sets in BE, LE, 2143 and 3412 byte orders.
135    Each subclass of U32_Buffer provide a distinct byteplexer to
136    produce the 4 parallel byte streams of Unicode data.  Once
137    byteplexing is complete, a generic pseudoASCII routine can
138    be applied. */
139class U32_Buffer : public XML_Buffer<ASCII> {
140public:
141        U32_Buffer(FILE * infile);
142        ~U32_Buffer();
143        virtual void DoByteplex() = 0;
144        void PreparePseudoASCII_Stream();
145        void AdvanceInputBuffer(int advance_amt);
146        void InitializeBuffer(unsigned char * src, int lgth);
147protected:
148        BytePack * x32hh;
149        BytePack * x32hl;
150        BytePack * x32lh;
151        BytePack * x32ll;
152};
153
154class U32LE_Buffer : public U32_Buffer {
155public:
156        U32LE_Buffer(FILE * infile);
157        void DoByteplex();
158};
159
160class U32BE_Buffer : public U32_Buffer {
161public:
162        U32BE_Buffer(FILE * infile);
163        void DoByteplex();
164};
165
166class U32_2143_Buffer : public U32_Buffer {
167public:
168        U32_2143_Buffer(FILE * infile);
169        void DoByteplex();
170};
171
172class U32_3412_Buffer : public U32_Buffer {
173public:
174        U32_3412_Buffer(FILE * infile);
175        void DoByteplex();
176};
177
178#endif
Note: See TracBrowser for help on using the repository browser.