Changeset 163


Ignore:
Timestamp:
Jun 22, 2008, 1:45:20 PM (11 years ago)
Author:
cameron
Message:

Restructuring: Document/External Entity Info into xmldecl.h

Location:
trunk
Files:
9 edited

Legend:

Unmodified
Added
Removed
  • trunk/Makefile

    r159 r163  
    2222                src/bitplex.h src/bitplex.c src/xmlmodel.h src/xmlmodel.c\
    2323                src/bytelex.h src/charsets/ASCII_EBCDIC.h
    24         $(CC) -o markup_stats markup_stats.cxx $(AFLAGS) $(PAPI) -DCALC_AVG     
     24        $(CC) -o markup_stats markup_stats.cxx $(AFLAGS) -DCALC_AVG     
    2525
    2626markup_stats_omp:       markup_stats.cxx src/byteplex.h src/multiliteral.h src/engine.h\
  • trunk/src/bitlex.h

    r129 r163  
    99#define BITLEX_H
    1010
    11 #include "xmlmodel.h"
     11#include "xmldecl.h"
    1212#include "byteplex.h"
    1313#include "bitplex.h"
    14 #include "xmldecl.h"
    1514
    1615/* Lexical items are particular characters, character classes
  • trunk/src/bytelex.h

    r162 r163  
    99#define BYTELEX_H
    1010
    11 #include "xmlmodel.h"
     11#include "xmldecl.h"
    1212#include "multiliteral.h"
    1313
  • trunk/src/byteplex.h

    r160 r163  
    4646#define BYTEPLEX_H
    4747
    48 #include "xmlmodel.h"
     48#include "xmldecl.h"
    4949#include "../lib/lib_simd.h"
    5050
  • trunk/src/multiliteral.h

    r91 r163  
    2020#include <assert.h>
    2121#include <stdint.h>
    22 #include "xmlmodel.h"
     22#include "xmldecl.h"
    2323#include "charsets/ASCII_EBCDIC.h"
    2424
  • trunk/src/xmldecl.c

    r162 r163  
    1717#include "bytelex.h"
    1818#include "xmlmodel.h"
     19
     20
     21Entity_Info::Entity_Info() {
     22}
     23Entity_Info::~Entity_Info() {
     24        free(encoding);
     25}
     26
     27/* Signature-based character set family detection in accord with
     28   Appendix F of the XML 1.0 and 1.1 specifications. */
     29
     30/* These definitions use b2int16 to determine appropriate doublebyte
     31   values based on endianness of the underlying architecture. */
     32static const int x0000 = b2int16<0x00, 0x00>::value;
     33static const int xFEFF = b2int16<0xFE, 0xFF>::value;
     34static const int xFFFE = b2int16<0xFF, 0xFE>::value;
     35static const int x003C = b2int16<0x00, 0x3C>::value;
     36static const int x3C00 = b2int16<0x3C, 0x00>::value;
     37static const int x4C6F = b2int16<0x4C, 0x6F>::value;
     38static const int xA794 = b2int16<0xA7, 0x94>::value;
     39static const int xEFBE = b2int16<0xEF, 0xBE>::value;
     40
     41void Entity_Info::AnalyzeSignature(unsigned char * signature) {
     42        uint16_t * XML_dbl_byte = (uint16_t *) signature;
     43        switch (XML_dbl_byte[0]) {
     44                case x0000:
     45                        switch (XML_dbl_byte[1]) {
     46                                case xFEFF: set_charset_family(ASCII, QuadByte, BigEndian, 1);break;
     47                                case xFFFE: set_charset_family(ASCII, QuadByte, Unusual_2143, 1);break;
     48                                case x3C00: set_charset_family(ASCII, QuadByte, Unusual_2143, 0);break;
     49                                default: set_charset_family(ASCII, QuadByte, BigEndian, 0);
     50                        }
     51                        break;
     52                case xFEFF:
     53                        if (XML_dbl_byte[1] == x0000)
     54                                set_charset_family(ASCII, QuadByte, Unusual_3412, 1);
     55                        else set_charset_family(ASCII, DoubleByte, BigEndian, 1);
     56                        break;
     57                case xFFFE:
     58                        if (XML_dbl_byte[1] == x0000)
     59                                set_charset_family(ASCII, QuadByte, LittleEndian, 1);
     60                        else set_charset_family(ASCII, DoubleByte, LittleEndian, 1);
     61                        break;
     62                case x003C:
     63                        if (XML_dbl_byte[1] == x0000)
     64                                set_charset_family(ASCII, QuadByte, Unusual_3412, 0);
     65                        else set_charset_family(ASCII, DoubleByte, BigEndian, 0);
     66                        break;
     67                case x3C00:
     68                        if (XML_dbl_byte[1] == x0000)
     69                                set_charset_family(ASCII, QuadByte, LittleEndian, 0);
     70                        else set_charset_family(ASCII, DoubleByte, LittleEndian, 0);
     71                        break;
     72                case x4C6F:
     73                        if (XML_dbl_byte[1] == xA794)
     74                                set_charset_family(EBCDIC, SingleByte, BigEndian, 0);
     75                        else set_charset_family(ASCII, SingleByte, BigEndian, 0);
     76                        break;
     77                case xEFBE:
     78                        if (signature[2] == 0xBF)
     79                                set_charset_family(ASCII, SingleByte, BigEndian, 3);
     80                        else set_charset_family(ASCII, SingleByte, BigEndian, 0);
     81                        break;
     82                default:
     83                        set_charset_family(ASCII, SingleByte, BigEndian, 0);
     84        }
     85}
     86void Entity_Info::set_charset_family(CodeUnit_Base C, CodeUnit_Size S, CodeUnit_ByteOrder O, int B){
     87                code_unit_base = C;
     88                code_unit_size = S;
     89                byte_order = O;
     90                BOM_units = B;
     91 }
     92
    1993
    2094template <CodeUnit_Base C>
  • trunk/src/xmldecl.h

    r160 r163  
    99#define XML_DECL_H
    1010
     11enum XML_version {XML_1_0, XML_1_1, no_XML_version_value};
     12/* Documents may be encoded in accord with either XML 1.0 or XML 1.1,
     13   or there may be no XML version declared ("no value" in the
     14   XML infoset parlance). */
     15
     16enum CodeUnit_Base {ASCII, EBCDIC};
     17
     18/* Code units of the underlying character set may be either ASCII-compatible
     19   or EBCDIC-compatible.
     20   ASCII-compatibility means that any code units satisfy the following properties.
     21     (1) Any code unit whose numeric value is in the ASCII range (0 to 0x7F)
     22         is a complete character sequence (single code unit sequence) representing
     23         that ASCII character.
     24     (2) Any code units above the ASCII range are non-ASCII code units.
     25         No code units or code unit sequences containing a non-ASCII code unit
     26         may represent an ASCII character.  (This property ensures that
     27         non-ASCII code units may be ignored in making ASCII-based parsing decisions).
     28   EBCDIC-compatible, for the purposes of XML, means that the following property
     29         applies.
     30
     31     (*) Code units may form all or part of a code unit sequence representing
     32         a character in the Unicode range 0 to 0x9F if and only if that code
     33         unit has the same interpretation under the basic EBCDIC code page cp037.
     34*/
     35
     36enum CodeUnit_Size {SingleByte = 1, DoubleByte = 2, QuadByte = 4};
     37/* ASCII, EBCDIC, ISO-8859-X and UTF-8 have 8-bit code units (singlebytes);
     38   The UTF-16 and UCS-2 families have 16-bit code units (doublebyte);
     39   The UTF-32/UCS-4 family has 32-bit code units. */
     40
     41enum CodeUnit_ByteOrder {BigEndian, LittleEndian, Unusual_3412, Unusual_2143};
     42/* The byte order of 16-bit or 32-bit code units.  The possibilities are:
     43   BigEndian:  UTF-16BE, UCS-2BE, UTF-16 or UCS-2 with a BigEndian byte order mark,
     44               UTF-16 without a byte order mark,
     45               UTF-32BE/UCS-4BE, or UTF-32/UCS-4 with a BigEndian byte order mark.
     46   LittleEndian: UTF-16LE, UCS-2LE, UTF-16 or UCS-2 with a LittleEndian byte order mark.
     47                 UTF-32LE/UCS-4LE, or UTF-32/UCS-4 with a LittleEndian byte order mark.
     48   Unusual_3412: Unusual octet order of UTF-32/UCS-4 with byte order mark FE FF 00 00
     49   Unusual_2143: Unusual octet order of UTF-32/UCS-4 with byte order mark 00 00 FF FE.
     50*/
     51
     52enum XML_standalone {Standalone_yes, Standalone_no, Standalone_no_value};
     53/* Possible values depending on the optional standalone component of an
     54   XML declaration. */
     55
     56class Entity_Info {
     57       
     58public:
     59        Entity_Info();
     60        ~Entity_Info();
     61
     62        /*  Information computed by analyzing the 4-byte initial signature
     63            of an XML document. */
     64        int BOM_units; /* no of initial code units for a Byte Order Mark */
     65
     66        CodeUnit_Base code_unit_base;
     67        CodeUnit_Size code_unit_size;
     68        CodeUnit_ByteOrder byte_order; 
     69
     70        void AnalyzeSignature(unsigned char * signature);
     71
     72        /* Information computed from the XML or text declaration. */
     73        XML_version version;
     74        bool has_encoding_decl;
     75        unsigned char * encoding;
     76        XML_standalone standalone;
     77        int content_start;  /* position after BOM and XML/text decl.*/
     78       
     79private:
     80        void set_charset_family(CodeUnit_Base C, CodeUnit_Size S, CodeUnit_ByteOrder O, int B);
     81};
     82
     83
    1184#include "byteplex.h"
    12 #include "xmlmodel.h"
    1385
    1486template <CodeUnit_Base C>
  • trunk/src/xmlmodel.c

    r160 r163  
    4646}
    4747
    48 Entity_Info::Entity_Info() {
    49 }
    50 Entity_Info::~Entity_Info() {
    51         free(encoding);
    52 }
    53 
    54 /* Signature-based character set family detection in accord with
    55    Appendix F of the XML 1.0 and 1.1 specifications. */
    56 
    57 /* These definitions use b2int16 to determine appropriate doublebyte
    58    values based on endianness of the underlying architecture. */
    59 static const int x0000 = b2int16<0x00, 0x00>::value;
    60 static const int xFEFF = b2int16<0xFE, 0xFF>::value;
    61 static const int xFFFE = b2int16<0xFF, 0xFE>::value;
    62 static const int x003C = b2int16<0x00, 0x3C>::value;
    63 static const int x3C00 = b2int16<0x3C, 0x00>::value;
    64 static const int x4C6F = b2int16<0x4C, 0x6F>::value;
    65 static const int xA794 = b2int16<0xA7, 0x94>::value;
    66 static const int xEFBE = b2int16<0xEF, 0xBE>::value;
    67 
    68 void Entity_Info::AnalyzeSignature(unsigned char * signature) {
    69         uint16_t * XML_dbl_byte = (uint16_t *) signature;
    70         switch (XML_dbl_byte[0]) {
    71                 case x0000:
    72                         switch (XML_dbl_byte[1]) {
    73                                 case xFEFF: set_charset_family(ASCII, QuadByte, BigEndian, 1);break;
    74                                 case xFFFE: set_charset_family(ASCII, QuadByte, Unusual_2143, 1);break;
    75                                 case x3C00: set_charset_family(ASCII, QuadByte, Unusual_2143, 0);break;
    76                                 default: set_charset_family(ASCII, QuadByte, BigEndian, 0);
    77                         }
    78                         break;
    79                 case xFEFF:
    80                         if (XML_dbl_byte[1] == x0000)
    81                                 set_charset_family(ASCII, QuadByte, Unusual_3412, 1);
    82                         else set_charset_family(ASCII, DoubleByte, BigEndian, 1);
    83                         break;
    84                 case xFFFE:
    85                         if (XML_dbl_byte[1] == x0000)
    86                                 set_charset_family(ASCII, QuadByte, LittleEndian, 1);
    87                         else set_charset_family(ASCII, DoubleByte, LittleEndian, 1);
    88                         break;
    89                 case x003C:
    90                         if (XML_dbl_byte[1] == x0000)
    91                                 set_charset_family(ASCII, QuadByte, Unusual_3412, 0);
    92                         else set_charset_family(ASCII, DoubleByte, BigEndian, 0);
    93                         break;
    94                 case x3C00:
    95                         if (XML_dbl_byte[1] == x0000)
    96                                 set_charset_family(ASCII, QuadByte, LittleEndian, 0);
    97                         else set_charset_family(ASCII, DoubleByte, LittleEndian, 0);
    98                         break;
    99                 case x4C6F:
    100                         if (XML_dbl_byte[1] == xA794)
    101                                 set_charset_family(EBCDIC, SingleByte, BigEndian, 0);
    102                         else set_charset_family(ASCII, SingleByte, BigEndian, 0);
    103                         break;
    104                 case xEFBE:
    105                         if (signature[2] == 0xBF)
    106                                 set_charset_family(ASCII, SingleByte, BigEndian, 3);
    107                         else set_charset_family(ASCII, SingleByte, BigEndian, 0);
    108                         break;
    109                 default:
    110                         set_charset_family(ASCII, SingleByte, BigEndian, 0);
    111         }
    112 }
    113 void Entity_Info::set_charset_family(CodeUnit_Base C, CodeUnit_Size S, CodeUnit_ByteOrder O, int B){
    114                 code_unit_base = C;
    115                 code_unit_size = S;
    116                 byte_order = O;
    117                 BOM_units = B;
    118  }
    11948int Model_Info::getOrInsertGlobalElement(int elem_nameID) {
    12049        int elemID =  GlobalElementTable[elem_nameID];
  • trunk/src/xmlmodel.h

    r160 r163  
    2222#ifndef XMLMODEL_H
    2323#define XMLMODEL_H
     24
     25//  Encoding signature, XML declaration processing included in xmldecl.h
     26#include "xmldecl.h"
     27
    2428#include <vector>
    2529#include <iostream>
     
    3236#include "contentmodel.h"
    3337#include "symtab.h"
    34 enum XML_version {XML_1_0, XML_1_1, no_XML_version_value};
    35 /* Documents may be encoded in accord with either XML 1.0 or XML 1.1,
    36    or there may be no XML version declared ("no value" in the
    37    XML infoset parlance). */
    38 
    39 enum CodeUnit_Base {ASCII, EBCDIC};
    40 /* Code units of the underlying character set may be either ASCII-compatible
    41    or EBCDIC-compatible.
    42    ASCII-compatibility means that any code units satisfy the following properties.
    43      (1) Any code unit whose numeric value is in the ASCII range (0 to 0x7F)
    44          is a complete character sequence (single code unit sequence) representing
    45          that ASCII character.
    46      (2) Any code units above the ASCII range are non-ASCII code units.
    47          No code units or code unit sequences containing a non-ASCII code unit
    48          may represent an ASCII character.  (This property ensures that
    49          non-ASCII code units may be ignored in making ASCII-based parsing decisions).
    50    EBCDIC-compatible, for the purposes of XML, means that the following property
    51          applies.
    52      (*) Code units may form all or part of a code unit sequence representing
    53          a character in the Unicode range 0 to 0x9F if and only if that code
    54          unit has the same interpretation unde the basic EBCDIC code page cp037.
    55 */
    56 
    57 enum CodeUnit_Size {SingleByte = 1, DoubleByte = 2, QuadByte = 4};
    58 /* ASCII, EBCDIC, ISO-8859-X and UTF-8 have 8-bit code units (singlebytes);
    59    The UTF-16 and UCS-2 families have 16-bit code units (doublebyte);
    60    The UTF-32/UCS-4 family has 32-bit code units. */
    61 
    62 enum CodeUnit_ByteOrder {BigEndian, LittleEndian, Unusual_3412, Unusual_2143};
    63 /* The byte order of 16-bit or 32-bit code units.  The possibilities are:
    64    BigEndian:  UTF-16BE, UCS-2BE, UTF-16 or UCS-2 with a BigEndian byte order mark,
    65                UTF-16 without a byte order mark,
    66                UTF-32BE/UCS-4BE, or UTF-32/UCS-4 with a BigEndian byte order mark.
    67    LittleEndian: UTF-16LE, UCS-2LE, UTF-16 or UCS-2 with a LittleEndian byte order mark.
    68                  UTF-32LE/UCS-4LE, or UTF-32/UCS-4 with a LittleEndian byte order mark.
    69    Unusual_3412: Unusual octet order of UTF-32/UCS-4 with byte order mark FE FF 00 00
    70    Unusual_2143: Unusual octet order of UTF-32/UCS-4 with byte order mark 00 00 FF FE.
    71 */
    72 
    73 enum XML_standalone {Standalone_yes, Standalone_no, Standalone_no_value};
    74 /* Possible values depending on the optional standalone component of an
    75    XML declaration. */
    7638
    7739
     
    13092
    13193
    132 class Entity_Info {
    133        
    134 public:
    135         Entity_Info();
    136         ~Entity_Info();
    137 
    138         /*  Information computed by analyzing the 4-byte initial signature
    139             of an XML document. */
    140         int BOM_units; /* no of initial code units for a Byte Order Mark */
    141 
    142         CodeUnit_Base code_unit_base;
    143         CodeUnit_Size code_unit_size;
    144         CodeUnit_ByteOrder byte_order; 
    145 
    146         void AnalyzeSignature(unsigned char * signature);
    147 
    148         /* Information computed from the XML or text declaration. */
    149         XML_version version;
    150         bool has_encoding_decl;
    151         unsigned char * encoding;
    152         XML_standalone standalone;
    153         int content_start;  /* position after BOM and XML/text decl.*/
    154        
    155 private:
    156         void set_charset_family(CodeUnit_Base C, CodeUnit_Size S, CodeUnit_ByteOrder O, int B);
    157 };
    158 
    15994class Model_Info {
    16095       
Note: See TracChangeset for help on using the changeset viewer.