Changeset 135 for trunk/src/symtab.h


Ignore:
Timestamp:
May 6, 2008, 11:53:48 AM (11 years ago)
Author:
lindanl
Message:

SymbolTable class and end tag checking.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/src/symtab.h

    r124 r135  
    1111#include "namechars.h"
    1212
    /**
     * Test one bit of a packed bitmap stored most-significant-bit first.
     *
     * Bit 0 is the top bit of bit_Map[0], bit 7 is its bottom bit, and bit 8
     * is the top bit of bit_Map[1].
     *
     * @param bit_Map    packed bitmap; only read, hence const-qualified so
     *                   that read-only tables can be passed without a cast
     * @param codepoint  zero-based bit index; no range check is made here, so
     *                   the caller must keep it non-negative and inside the map
     * @return true when the selected bit is set
     */
    inline bool bit_test(const unsigned char * bit_Map, int codepoint) {
            /* codepoint/8 selects the byte; 7 - codepoint%8 counts from the MSB. */
            return (bit_Map[codepoint/8] >> (7 - codepoint % 8)) & 1;
    }
     13struct eqstr
     14{
     15  bool operator()(const char* s1, const char* s2) const
     16  {
     17    return strcmp(s1, s2) == 0;
     18  }
     19};
     20       
     21char * predefined[] = {"lt", "gt", "amp", "quot", "apos"};
     22
     23class Symbol_Table{
     24public:
     25        Symbol_Table();
     26        int UTF8_Lookup_or_Insert_XML10_Name(char * name, int lgth);
     27        int UTF8_Lookup_or_Insert_XML11_Name(char * name, int lgth);
     28        char * Get_UTF8_name(int nameID);
     29private:
     30        hash_map<const char *, int, hash<const char *>, eqstr > UTF8NameMap;
     31        int globalNameCount;
     32        vector<char *> UTF8NameTable;
     33};
    1634
    1735
    18 bool is_XML10_NameStrt_codepoint(int codepoint) {
    19         switch (codepoint >> 12) {
    20                 case 0: return bit_test(NameStrt_XML10_0000_11FF, codepoint);
    21                 case 1: if (codepoint <= 0x11FF)
    22                                 return bit_test(NameStrt_XML10_0000_11FF, codepoint);
    23                         else if (codepoint < 0x1E00) return false;
    24                         else return bit_test(NameStrt_XML10_1E00_1FFF, codepoint & 0x1FF);
    25                 case 2: if (codepoint > 0x2182) return false;
    26                         else return bit_test(NameStrt_XML10_2000_21FF, codepoint & 0x1FF);
    27                 case 3: if (codepoint > 0x312C) return false;
    28                         else return bit_test(NameStrt_XML10_3000_31FF, codepoint & 0x1FF);
    29                 case 4: return codepoint >= 0x4E00;
    30                 case 5: case 6: case 7: case 8: return true;
    31                 case 9: return codepoint <= 0x9FA5;
    32                 case 0xA: return codepoint >= 0xAC00;
    33                 case 0xB: case 0xC: return true;
    34                 case 0xD: return codepoint <= 0xD7A3;
    35                 default: return false;
    36         }
    37 }
    38 
    39 bool is_XML10_NameChar_codepoint(int codepoint) {
    40         switch (codepoint >> 12) {
    41                 case 0: return bit_test(NameChar_XML10_0000_11FF, codepoint);
    42                 case 1: if (codepoint <= 0x11FF)
    43                                 return bit_test(NameChar_XML10_0000_11FF, codepoint);
    44                         else if (codepoint < 0x1E00) return false;
    45                         else return bit_test(NameStrt_XML10_1E00_1FFF, codepoint & 0x1FF);
    46                 case 2: if (codepoint > 0x2182) return false;
    47                         else return bit_test(NameChar_XML10_2000_21FF, codepoint & 0x1FF);
    48                 case 3: if (codepoint > 0x312C) return false;
    49                         else return bit_test(NameChar_XML10_3000_31FF, codepoint & 0x1FF);
    50                 case 4: return codepoint >= 0x4E00;
    51                 case 5: case 6: case 7: case 8: return true;
    52                 case 9: return codepoint <= 0x9FA5;
    53                 case 0xA:       return codepoint >= 0xAC00;
    54                 case 0xB: case 0xC: return true;
    55                 case 0xD: return codepoint <= 0xD7A3;
    56                 default: return false;
    57         }
    58 }
    59 
    60 bool is_XML11_NameStrt_codepoint(int codepoint) {
    61         if (likely(codepoint) <= 0x03FF) return bit_test(NameStrt_XML11_0000_03FF, codepoint);
    62         else switch (codepoint >> 12) {
    63                 case 0: case 1: return true;
    64                 case 2: if (codepoint >= 0x2070)
    65                                 if (codepoint <= 0x218F) return true;
    66                                 else return (codepoint >= 0x2C00) & (codepoint <= 0x2FEF);
    67                         else return (codepoint >= 0x200C) & (codepoint <= 0x200D);
    68                 case 3: return codepoint >= 0x3001;
    69                 case 4: case 5: case 6: case 7: case 8: case 9: case 0xA: case 0xB: case 0xC: return true;
    70                 case 0xD: return codepoint <= 0xD7FF;
    71                 case 0xE: return false;
    72                 case 0xF: if (codepoint <= 0xFDCF) return codepoint >= 0xF900;
    73                           else return (codepoint >= 0xFDF0) & (codepoint <= 0xFFFD);
    74                 default: return codepoint <= 0xEFFFF;
    75         }
    76 }
    77 
    78 bool is_XML11_NameChar_codepoint(int codepoint) {
    79         if (likely(codepoint) <= 0x03FF) return bit_test(NameChar_XML11_0000_03FF, codepoint);
    80         else switch (codepoint >> 12) {
    81                 case 0: case 1: return true;
    82                 case 2: if (codepoint >= 0x2070)
    83                                 if (codepoint <= 0x218F) return true;
    84                                 else return (codepoint >= 0x2C00) & (codepoint <= 0x2FEF);
    85                         else if (codepoint <= 0x200D) return codepoint >= 0x200C;
    86                         else return (codepoint == 0x203F) | (codepoint == 0x2040);
    87                 case 3: return codepoint >= 0x3001;
    88                 case 4: case 5: case 6: case 7: case 8: case 9: case 0xA: case 0xB: case 0xC: return true;
    89                 case 0xD: return codepoint <= 0xD7FF;
    90                 case 0xE: return false;
    91                 case 0xF: if (codepoint <= 0xFDCF) return codepoint >= 0xF900;
    92                           else return (codepoint >= 0xFDF0) & (codepoint <= 0xFFFD);
    93                 default: return codepoint <= 0xEFFFF;
    94         }
    95 }
    96 
    97 inline int XML_10_UTF8_NameStrt_bytes (unsigned char bytes[]) {
    98         if (bytes[0] <= 0x7F) {
    99                 if (bit_test(NameStrt_XML10_0000_11FF, (int) bytes[0])) return 1;
    100                 else return 0;
    101         }
    102         else if (bytes[0] <= 0xDF) {
    103                 int codepoint = ((bytes[0] & 0x3F) << 6) | (bytes[1] & 0x3F);
    104                 if (bit_test(NameStrt_XML10_0000_11FF, codepoint)) return 2;
    105                 else return 0;
    106         }
    107         else if (bytes[0] <= 0xEF) {
    108                 int codepoint = ((bytes[0] & 0x0F) << 12)| ((bytes[1] & 0x3F) << 6) | (bytes[2] & 0x3F);
    109                 return is_XML10_NameStrt_codepoint(codepoint) ? 3 : 0;
    110         }
    111         else return 0;
    112 }
    113 
    114 inline int XML_10_UTF8_NameChar_bytes (unsigned char bytes[]) {
    115         if (bytes[0] <= 0x7F) {
    116                 if (bit_test(NameChar_XML10_0000_11FF, (int) bytes[0])) return 1;
    117                 else return 0;
    118         }
    119         else if (bytes[0] <= 0xDF) {
    120                 int codepoint = ((bytes[0] & 0x3F) << 6) | (bytes[1] & 0x3F);
    121                 if (bit_test(NameChar_XML10_0000_11FF, codepoint)) return 2;
    122                 else return 0;
    123         }
    124         else if (bytes[0] <= 0xEF) {
    125                 int codepoint = ((bytes[0] & 0x0F) << 12)| ((bytes[1] & 0x3F) << 6) | (bytes[2] & 0x3F);
    126                 return is_XML10_NameStrt_codepoint(codepoint) ? 3 : 0;
    127         }
    128         else return 0;
    129 }
    130 
    131 inline int XML_11_UTF8_NameStrt_bytes (unsigned char bytes[]) {
    132         if (bytes[0] <= 0x7F) {
    133                 if (bit_test(NameStrt_XML11_0000_03FF, (int) bytes[0])) return 1;
    134                 else return 0;
    135         }
    136         else if (bytes[0] <= 0xDF) {
    137                 int codepoint = ((bytes[0] & 0x3F) << 6) | (bytes[1] & 0x3F);
    138                 return is_XML11_NameStrt_codepoint(codepoint) ? 2 : 0;
    139         }
    140         else if (bytes[0] <= 0xEF) {
    141                 int codepoint = ((bytes[0] & 0x0F) << 12)| ((bytes[1] & 0x3F) << 6) | (bytes[2] & 0x3F);
    142                 return is_XML11_NameStrt_codepoint(codepoint) ? 3 : 0;
    143         }
    144         else {
    145                 int codepoint = ((bytes[0] & 0x0F) << 18)| ((bytes[1] & 0x3F) << 12) |
    146                                 ((bytes[2] & 0x3F) << 6) | (bytes[3] & 0x3F);
    147                 return is_XML11_NameStrt_codepoint(codepoint) ? 4 : 0;
    148         }
    149 }
    150 
    151 inline int XML_11_UTF8_NameChar_bytes (unsigned char bytes[]) {
    152         if (bytes[0] <= 0x7F) {
    153                 if (bit_test(NameChar_XML11_0000_03FF, (int) bytes[0])) return 1;
    154                 else return 0;
    155         }
    156         else if (bytes[0] <= 0xDF) {
    157                 int codepoint = ((bytes[0] & 0x3F) << 6) | (bytes[1] & 0x3F);
    158                 return is_XML11_NameChar_codepoint(codepoint) ? 2 : 0;
    159         }
    160         else if (bytes[0] <= 0xEF) {
    161                 int codepoint = ((bytes[0] & 0x0F) << 12)| ((bytes[1] & 0x3F) << 6) | (bytes[2] & 0x3F);
    162                 return is_XML11_NameChar_codepoint(codepoint) ? 3 : 0;
    163         }
    164         else {
    165                 int codepoint = ((bytes[0] & 0x0F) << 18)| ((bytes[1] & 0x3F) << 12) |
    166                                 ((bytes[2] & 0x3F) << 6) | (bytes[3] & 0x3F);
    167                 return is_XML11_NameChar_codepoint(codepoint) ? 4 : 0;
    168         }
    169 }
    170 
    171 bool is_XML10_UTF8_Name(unsigned char protoname[], int lgth) {
    172         int valid_bytes = XML_10_UTF8_NameStrt_bytes(protoname);
    173         int pos = valid_bytes;
    174         while ((valid_bytes > 0) & (pos < lgth)) {
    175                 valid_bytes = XML_10_UTF8_NameChar_bytes(&protoname[pos]);
    176                 pos += valid_bytes;
    177         }
    178         /* Success requires that every byte sequence processed be valid
    179            and that the total lgth processed be exactly that provided on
    180            input. */
    181         return (valid_bytes > 0) & (pos == lgth);
    182 }
    183 
    184 bool is_XML11_UTF8_Name(unsigned char protoname[], int lgth) {
    185         int valid_bytes = XML_11_UTF8_NameStrt_bytes(protoname);
    186         int pos = valid_bytes;
    187         while ((valid_bytes > 0) & (pos < lgth)) {
    188                 valid_bytes = XML_11_UTF8_NameChar_bytes(&protoname[pos]);
    189                 pos += valid_bytes;
    190         }
    191         /* Success requires that every byte sequence processed be valid
    192            and that the total lgth processed be exactly that provided on
    193            input. */
    194         return (valid_bytes > 0) & (pos == lgth);
    195 }
    196 
    19736#endif
Note: See TracChangeset for help on using the changeset viewer.