Changeset 35 for trunk/src/xmlbuffer.h


Ignore:
Timestamp:
Feb 10, 2008, 6:08:48 AM (11 years ago)
Author:
cameron
Message:

Charset Architecture: Byteplexing Buffer Factory

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/src/xmlbuffer.h

    r15 r35  
    11/*  xmlbuffer.h - Input buffering for XML entities.
    2     Copyright (c) 2007, Robert D. Cameron.
     2    Copyright (c) 2007, 2008,  Robert D. Cameron.
    33    Licensed to the public under the Open Software License 3.0.
    44    Licensed to International Characters, Inc., under the Academic
    55    Free License 3.0.
    66
    7 
    8     The goal of the XML_Buffer class is to handle I/O issues for
    9     the XML parsing engine.   The present interface and implementation
    10     are a quick hack.
    117*/
    128#ifndef XML_BUFFER_H
    139#define XML_BUFFER_H
    1410
     11#include "xmlparam.h"
     12
     13#ifdef __i386
     14#include "../lib/sse_simd.h"
     15#endif
     16#ifdef _ARCH_PPC
     17#include "../lib/altivec_simd.h"
     18#endif
    1519
    1620
    17 class XML_Buffer {
     21/* The BytePack and the BitBlock are the two fundamental
     22   types used by the parabix program for data held in
     23   SIMD registers, representing, respectively, the byte-oriented
     24   and bit-oriented views of character data.*/
    1825
     26typedef SIMD_type BytePack;
     27typedef SIMD_type BitBlock;
     28const int PACKSIZE = sizeof(SIMD_type);
     29const int BLOCKSIZE = sizeof(SIMD_type) * 8;
     30
     31/* Define the size of buffer used for lexical analysis/parsing. */
     32const int BUFFER_BLOCKS = 1024;
     33const int BUFFER_SIZE = BUFFER_BLOCKS * BLOCKSIZE;
     34
     35/* When working near the end of a buffer, a bytespace test may involve
     36   a multibyte literal.  The bytespace buffer must always make available
     37   a number of lookahead bytes at least equal to the maximum length of any
     38   such literal. */
     39
     40const int LOOKAHEAD_POSITIONS = 16;
     41
     42
     43class XML_Buffer_Interface {
    1944public:
    20         XML_Buffer(char* filename, int pad_bytes);
    21         void InstallPadding(const unsigned char* pad_string);
    22         int PrepareBytes(int position, int bytes_to_prepare);
    23         unsigned char * BytePtr(int pos);
    24 private:
    25         unsigned char * ByteBuffer;
     45        XML_Buffer_Interface ();
     46        /* Create and initialize an XML Buffer object for parsing a file,
     47           based on autodetection of the character encoding family
     48           from the initial 4-byte signature. */
     49        static XML_Buffer_Interface * BufferFactory(char * filename);
     50        virtual void DoByteplex() = 0;
     51        virtual void PreparePseudoASCII_Stream() = 0;
     52        virtual void ReadXMLInfo() = 0;
     53        virtual void ReadTextDeclaration() = 0;
     54        int AvailableUnits(int pos);
     55        int ContentStartUnit();
     56
     57        /* Information returned from the initialization process.
     58           See xmlparam.h for types.  */
     59        int BOM_units; /* number of initial code units for a Byte Order Mark */
     60        XML_version version;
     61        bool has_encoding_decl;
     62        int encoding_start_pos;
     63        int encoding_lgth;
     64        CodeUnit_Base code_unit_base;
     65        CodeUnit_Size code_unit_size;
     66        CodeUnit_ByteOrder byte_order;
     67        XML_standalone standalone;
     68        int content_start_pos;
     69
     70        /* Pseudo-ASCII stream. */
     71        BytePack * x8data;
     72
     73protected:
     74
    2675        unsigned int buffer_bytes;
    27         unsigned int alloc_size; /* size of allocated ByteBuffer */
    28         int current_pos;
     76        unsigned int total_blocks;
     77        BytePack * ByteBuffer;
     78        int current_unit;
    2979
    3080};
    3181
     82template <CodeUnit_Base C>
     83class XML_Buffer : public XML_Buffer_Interface {
     84public:
     85        XML_Buffer (BytePack * src_data, int lgth, int byte_order_units);
     86        void ReadXMLInfo();
     87        void ReadTextDeclaration();
     88        virtual void DoByteplex() = 0;
     89        virtual void PreparePseudoASCII_Stream() = 0;
     90
     91protected:
     92
     93        void Advance(int n);
     94        int AbsPos() const;
     95        unsigned char * cur() const;
     96
     97private:
     98        /* Bytespace parsing routines for internal use in ReadXMLInfo and
     99           ReadTextDeclaration. */
     100        void Scan_WS();
     101        void ScanToQuote();
     102};
     103
     104
     105/*  Various ASCII-based character sets using 8-bit code units are processed
     106    using the Extended_ASCII_8_Buffer class.   This includes 7-bit ASCII
     107    itself (with high-order bit 0), the ISO-8859 character sets and UTF-8.
     108*/
     109class Extended_ASCII_8_Buffer : public XML_Buffer<ASCII> {
     110public:
     111        Extended_ASCII_8_Buffer(BytePack * src, int lgth, int BOM);
     112        void DoByteplex();
     113        void PreparePseudoASCII_Stream();
     114};
     115
     116/*  The family of 8-bit EBCDIC-based character sets is processed using
     117    the EBCDIC_Buffer class.
     118*/
     119class EBCDIC_Buffer : public XML_Buffer<EBCDIC> {
     120public:
     121        EBCDIC_Buffer(BytePack * src, int lgth, int BOM);
     122        void DoByteplex();
     123        void PreparePseudoASCII_Stream();
     124};
     125
     126
     127
     128/*  UTF-16 and UCS-2 character set families in BE and LE byte orders.
     129    The U16LE and U16BE subclasses each provide a distinct byteplexer to
     130    produce 2 parallel byte streams for the high and low bytes of each
     131    16-bit code unit.  Once byteplexing is complete, a generic pseudoASCII
     132    conversion routine can be applied at the U16_Buffer level. */
     133
     134class U16_Buffer : public XML_Buffer<ASCII> {
     135public:
     136        U16_Buffer(BytePack * src, int lgth, int BOM);
     137        virtual void DoByteplex() = 0;
     138        void PreparePseudoASCII_Stream();
     139protected:
     140        BytePack * x16hi;
     141        BytePack * x16lo;
     142};
     143
     144class U16LE_Buffer : public U16_Buffer {
     145public:
     146        U16LE_Buffer(BytePack * src, int lgth, int BOM);
     147        void DoByteplex();
     148};
     149
     150class U16BE_Buffer : public U16_Buffer {
     151public:
     152        U16BE_Buffer(BytePack * src, int lgth, int BOM);
     153        void DoByteplex();
     154};
     155
     156
     157/*  UTF-32/UCS-4 character sets in BE, LE, 2143 and 3412 byte orders.
     158    Each subclass of U32_Buffer provides a distinct byteplexer to
     159    produce the 4 parallel byte streams of Unicode data.  Once
     160    byteplexing is complete, a generic pseudoASCII routine can
     161    be applied. */
     162class U32_Buffer : public XML_Buffer<ASCII> {
     163public:
     164        U32_Buffer(BytePack * src, int lgth, int BOM);
     165        virtual void DoByteplex() = 0;
     166        void PreparePseudoASCII_Stream();
     167protected:
     168        BytePack * x32hh;
     169        BytePack * x32hl;
     170        BytePack * x32lh;
     171        BytePack * x32ll;
     172};
     173
     174class U32LE_Buffer : public U32_Buffer {
     175public:
     176        U32LE_Buffer(BytePack * src, int lgth, int BOM);
     177        void DoByteplex();
     178};
     179
     180class U32BE_Buffer : public U32_Buffer {
     181public:
     182        U32BE_Buffer(BytePack * src, int lgth, int BOM);
     183        void DoByteplex();
     184};
     185
     186class U32_2143_Buffer : public U32_Buffer {
     187public:
     188        U32_2143_Buffer(BytePack * src, int lgth, int BOM);
     189        void DoByteplex();
     190};
     191
     192class U32_3412_Buffer : public U32_Buffer {
     193public:
     194        U32_3412_Buffer(BytePack * src, int lgth, int BOM);
     195        void DoByteplex();
     196};
     197
     198
    32199#endif
Note: See TracChangeset for help on using the changeset viewer.