Changeset 135


Ignore:
Timestamp:
May 6, 2008, 11:53:48 AM (11 years ago)
Author:
lindanl
Message:

SymbolTable class and end tag checking.

Location:
trunk
Files:
1 added
5 edited

Legend:

Unmodified
Added
Removed
  • trunk/markup_stats.cxx

    r134 r135  
    5959#include "src/bitlex.c"
    6060#include "src/engine.c"
     61#include "src/symtab.c"
    6162
    6263/* Global declarations of parsing engine. */
  • trunk/src/engine.c

    r134 r135  
    637637       
    638638        int name_start = AbsPos();
    639        
    640         int endNameID = Parse_Name();  /* Name delimiter: WS, "/" or ">" */
    641         if (nameID != endNameID)
     639        ScanTo(NameFollow);
     640        int lgth = AbsPos()-name_start;
     641        char * end_elem_name = (char *)GetCodeUnitPtr(name_start);
     642        char * start_elem_name = model_info->symbol_table->Get_UTF8_name(nameID);
     643       
     644        for(int i=0; i<lgth; i++) {
     645                if (start_elem_name[i] != end_elem_name[i])
    642646                        WF_Error(wfErr_GIMatch);
    643                        
     647        }
     648        if (start_elem_name[lgth] != '\0') WF_Error(wfErr_GIMatch);
    644649        if (AtChar<C,'>'>(cur())) {
    645650                Advance(1);
     
    18401845        Advance(2); /* Skip "</". */
    18411846       
     1847       
    18421848        int nameID = Parse_Name(); 
    18431849        int elemID = model_info->GlobalElementTable[nameID];
     
    21712177        ScanTo(NameFollow);
    21722178        int lgth = AbsPos()-name_pos;
    2173         char * s = copy_string(GetCodeUnitPtr(name_pos),lgth);
    2174         int nameID = model_info->GlobalNameTable[s];
    2175         if(nameID == 0){
    2176                 if (entity_Info->version == XML_1_1){
    2177                         if (!is_XML11_UTF8_Name(GetCodeUnitPtr(name_pos),lgth)) Syntax_Error(NT_Name);
    2178                 }
    2179                 else if (!is_XML10_UTF8_Name(GetCodeUnitPtr(name_pos),lgth)) Syntax_Error(NT_Name);
    2180                 model_info->GlobalNameTable[s]=++(model_info->globalNameCount);
    2181                 nameID = model_info->globalNameCount;
    2182         }
    2183         return nameID;
     2179        if (entity_Info->version == XML_1_1){
     2180                return model_info->symbol_table->UTF8_Lookup_or_Insert_XML11_Name((char *)GetCodeUnitPtr(name_pos),lgth);
     2181        }
     2182        else
     2183                return model_info->symbol_table->UTF8_Lookup_or_Insert_XML10_Name((char *)GetCodeUnitPtr(name_pos),lgth);
    21842184}
    21852185
  • trunk/src/symtab.h

    r124 r135  
    1111#include "namechars.h"
    1212
    13 inline bool bit_test(unsigned char * bit_Map, int codepoint) {
    14         return (bit_Map[codepoint/8] >> (7 - codepoint % 8)) & 1;
    15 }
     13struct eqstr
     14{
     15  bool operator()(const char* s1, const char* s2) const
     16  {
     17    return strcmp(s1, s2) == 0;
     18  }
     19};
     20       
     21char * predefined[] = {"lt", "gt", "amp", "quot", "apos"};
     22
     23class Symbol_Table{
     24public:
     25        Symbol_Table();
     26        int UTF8_Lookup_or_Insert_XML10_Name(char * name, int lgth);
     27        int UTF8_Lookup_or_Insert_XML11_Name(char * name, int lgth);
     28        char * Get_UTF8_name(int nameID);
     29private:
     30        hash_map<const char *, int, hash<const char *>, eqstr > UTF8NameMap;
     31        int globalNameCount;
     32        vector<char *> UTF8NameTable;
     33};
    1634
    1735
    18 bool is_XML10_NameStrt_codepoint(int codepoint) {
    19         switch (codepoint >> 12) {
    20                 case 0: return bit_test(NameStrt_XML10_0000_11FF, codepoint);
    21                 case 1: if (codepoint <= 0x11FF)
    22                                 return bit_test(NameStrt_XML10_0000_11FF, codepoint);
    23                         else if (codepoint < 0x1E00) return false;
    24                         else return bit_test(NameStrt_XML10_1E00_1FFF, codepoint & 0x1FF);
    25                 case 2: if (codepoint > 0x2182) return false;
    26                         else return bit_test(NameStrt_XML10_2000_21FF, codepoint & 0x1FF);
    27                 case 3: if (codepoint > 0x312C) return false;
    28                         else return bit_test(NameStrt_XML10_3000_31FF, codepoint & 0x1FF);
    29                 case 4: return codepoint >= 0x4E00;
    30                 case 5: case 6: case 7: case 8: return true;
    31                 case 9: return codepoint <= 0x9FA5;
    32                 case 0xA: return codepoint >= 0xAC00;
    33                 case 0xB: case 0xC: return true;
    34                 case 0xD: return codepoint <= 0xD7A3;
    35                 default: return false;
    36         }
    37 }
    38 
    39 bool is_XML10_NameChar_codepoint(int codepoint) {
    40         switch (codepoint >> 12) {
    41                 case 0: return bit_test(NameChar_XML10_0000_11FF, codepoint);
    42                 case 1: if (codepoint <= 0x11FF)
    43                                 return bit_test(NameChar_XML10_0000_11FF, codepoint);
    44                         else if (codepoint < 0x1E00) return false;
    45                         else return bit_test(NameStrt_XML10_1E00_1FFF, codepoint & 0x1FF);
    46                 case 2: if (codepoint > 0x2182) return false;
    47                         else return bit_test(NameChar_XML10_2000_21FF, codepoint & 0x1FF);
    48                 case 3: if (codepoint > 0x312C) return false;
    49                         else return bit_test(NameChar_XML10_3000_31FF, codepoint & 0x1FF);
    50                 case 4: return codepoint >= 0x4E00;
    51                 case 5: case 6: case 7: case 8: return true;
    52                 case 9: return codepoint <= 0x9FA5;
    53                 case 0xA:       return codepoint >= 0xAC00;
    54                 case 0xB: case 0xC: return true;
    55                 case 0xD: return codepoint <= 0xD7A3;
    56                 default: return false;
    57         }
    58 }
    59 
    60 bool is_XML11_NameStrt_codepoint(int codepoint) {
    61         if (likely(codepoint) <= 0x03FF) return bit_test(NameStrt_XML11_0000_03FF, codepoint);
    62         else switch (codepoint >> 12) {
    63                 case 0: case 1: return true;
    64                 case 2: if (codepoint >= 0x2070)
    65                                 if (codepoint <= 0x218F) return true;
    66                                 else return (codepoint >= 0x2C00) & (codepoint <= 0x2FEF);
    67                         else return (codepoint >= 0x200C) & (codepoint <= 0x200D);
    68                 case 3: return codepoint >= 0x3001;
    69                 case 4: case 5: case 6: case 7: case 8: case 9: case 0xA: case 0xB: case 0xC: return true;
    70                 case 0xD: return codepoint <= 0xD7FF;
    71                 case 0xE: return false;
    72                 case 0xF: if (codepoint <= 0xFDCF) return codepoint >= 0xF900;
    73                           else return (codepoint >= 0xFDF0) & (codepoint <= 0xFFFD);
    74                 default: return codepoint <= 0xEFFFF;
    75         }
    76 }
    77 
    78 bool is_XML11_NameChar_codepoint(int codepoint) {
    79         if (likely(codepoint) <= 0x03FF) return bit_test(NameChar_XML11_0000_03FF, codepoint);
    80         else switch (codepoint >> 12) {
    81                 case 0: case 1: return true;
    82                 case 2: if (codepoint >= 0x2070)
    83                                 if (codepoint <= 0x218F) return true;
    84                                 else return (codepoint >= 0x2C00) & (codepoint <= 0x2FEF);
    85                         else if (codepoint <= 0x200D) return codepoint >= 0x200C;
    86                         else return (codepoint == 0x203F) | (codepoint == 0x2040);
    87                 case 3: return codepoint >= 0x3001;
    88                 case 4: case 5: case 6: case 7: case 8: case 9: case 0xA: case 0xB: case 0xC: return true;
    89                 case 0xD: return codepoint <= 0xD7FF;
    90                 case 0xE: return false;
    91                 case 0xF: if (codepoint <= 0xFDCF) return codepoint >= 0xF900;
    92                           else return (codepoint >= 0xFDF0) & (codepoint <= 0xFFFD);
    93                 default: return codepoint <= 0xEFFFF;
    94         }
    95 }
    96 
    97 inline int XML_10_UTF8_NameStrt_bytes (unsigned char bytes[]) {
    98         if (bytes[0] <= 0x7F) {
    99                 if (bit_test(NameStrt_XML10_0000_11FF, (int) bytes[0])) return 1;
    100                 else return 0;
    101         }
    102         else if (bytes[0] <= 0xDF) {
    103                 int codepoint = ((bytes[0] & 0x3F) << 6) | (bytes[1] & 0x3F);
    104                 if (bit_test(NameStrt_XML10_0000_11FF, codepoint)) return 2;
    105                 else return 0;
    106         }
    107         else if (bytes[0] <= 0xEF) {
    108                 int codepoint = ((bytes[0] & 0x0F) << 12)| ((bytes[1] & 0x3F) << 6) | (bytes[2] & 0x3F);
    109                 return is_XML10_NameStrt_codepoint(codepoint) ? 3 : 0;
    110         }
    111         else return 0;
    112 }
    113 
    114 inline int XML_10_UTF8_NameChar_bytes (unsigned char bytes[]) {
    115         if (bytes[0] <= 0x7F) {
    116                 if (bit_test(NameChar_XML10_0000_11FF, (int) bytes[0])) return 1;
    117                 else return 0;
    118         }
    119         else if (bytes[0] <= 0xDF) {
    120                 int codepoint = ((bytes[0] & 0x3F) << 6) | (bytes[1] & 0x3F);
    121                 if (bit_test(NameChar_XML10_0000_11FF, codepoint)) return 2;
    122                 else return 0;
    123         }
    124         else if (bytes[0] <= 0xEF) {
    125                 int codepoint = ((bytes[0] & 0x0F) << 12)| ((bytes[1] & 0x3F) << 6) | (bytes[2] & 0x3F);
    126                 return is_XML10_NameStrt_codepoint(codepoint) ? 3 : 0;
    127         }
    128         else return 0;
    129 }
    130 
    131 inline int XML_11_UTF8_NameStrt_bytes (unsigned char bytes[]) {
    132         if (bytes[0] <= 0x7F) {
    133                 if (bit_test(NameStrt_XML11_0000_03FF, (int) bytes[0])) return 1;
    134                 else return 0;
    135         }
    136         else if (bytes[0] <= 0xDF) {
    137                 int codepoint = ((bytes[0] & 0x3F) << 6) | (bytes[1] & 0x3F);
    138                 return is_XML11_NameStrt_codepoint(codepoint) ? 2 : 0;
    139         }
    140         else if (bytes[0] <= 0xEF) {
    141                 int codepoint = ((bytes[0] & 0x0F) << 12)| ((bytes[1] & 0x3F) << 6) | (bytes[2] & 0x3F);
    142                 return is_XML11_NameStrt_codepoint(codepoint) ? 3 : 0;
    143         }
    144         else {
    145                 int codepoint = ((bytes[0] & 0x0F) << 18)| ((bytes[1] & 0x3F) << 12) |
    146                                 ((bytes[2] & 0x3F) << 6) | (bytes[3] & 0x3F);
    147                 return is_XML11_NameStrt_codepoint(codepoint) ? 4 : 0;
    148         }
    149 }
    150 
    151 inline int XML_11_UTF8_NameChar_bytes (unsigned char bytes[]) {
    152         if (bytes[0] <= 0x7F) {
    153                 if (bit_test(NameChar_XML11_0000_03FF, (int) bytes[0])) return 1;
    154                 else return 0;
    155         }
    156         else if (bytes[0] <= 0xDF) {
    157                 int codepoint = ((bytes[0] & 0x3F) << 6) | (bytes[1] & 0x3F);
    158                 return is_XML11_NameChar_codepoint(codepoint) ? 2 : 0;
    159         }
    160         else if (bytes[0] <= 0xEF) {
    161                 int codepoint = ((bytes[0] & 0x0F) << 12)| ((bytes[1] & 0x3F) << 6) | (bytes[2] & 0x3F);
    162                 return is_XML11_NameChar_codepoint(codepoint) ? 3 : 0;
    163         }
    164         else {
    165                 int codepoint = ((bytes[0] & 0x0F) << 18)| ((bytes[1] & 0x3F) << 12) |
    166                                 ((bytes[2] & 0x3F) << 6) | (bytes[3] & 0x3F);
    167                 return is_XML11_NameChar_codepoint(codepoint) ? 4 : 0;
    168         }
    169 }
    170 
    171 bool is_XML10_UTF8_Name(unsigned char protoname[], int lgth) {
    172         int valid_bytes = XML_10_UTF8_NameStrt_bytes(protoname);
    173         int pos = valid_bytes;
    174         while ((valid_bytes > 0) & (pos < lgth)) {
    175                 valid_bytes = XML_10_UTF8_NameChar_bytes(&protoname[pos]);
    176                 pos += valid_bytes;
    177         }
    178         /* Success requires that every byte sequence processed be valid
    179            and that the total lgth processed be exactly that provided on
    180            input. */
    181         return (valid_bytes > 0) & (pos == lgth);
    182 }
    183 
    184 bool is_XML11_UTF8_Name(unsigned char protoname[], int lgth) {
    185         int valid_bytes = XML_11_UTF8_NameStrt_bytes(protoname);
    186         int pos = valid_bytes;
    187         while ((valid_bytes > 0) & (pos < lgth)) {
    188                 valid_bytes = XML_11_UTF8_NameChar_bytes(&protoname[pos]);
    189                 pos += valid_bytes;
    190         }
    191         /* Success requires that every byte sequence processed be valid
    192            and that the total lgth processed be exactly that provided on
    193            input. */
    194         return (valid_bytes > 0) & (pos == lgth);
    195 }
    196 
    19736#endif
  • trunk/src/xmlmodel.c

    r126 r135  
    1111
    1212void Model_Info::SimpleEntity(char * entity_Name, char * replText) {
    13                 int entity_NameID = GlobalNameTable[entity_Name];
    14                 if (entity_NameID == 0) {
    15                         GlobalNameTable[entity_Name] = ++(globalNameCount);
    16                         entity_NameID = globalNameCount;
    17                 }
     13               
     14                int entity_NameID = symbol_table->UTF8_Lookup_or_Insert_XML10_Name(entity_Name, strlen(entity_Name));
    1815                int entityID = GlobalGEntityTable[entity_NameID];
    1916                if(entityID==0){       
     
    3734        globalElementCount = 0;
    3835        globalAttributeCount = 0;
    39         globalNameCount = 0;
     36        symbol_table = new Symbol_Table();
    4037        SimpleEntity("lt", "<");
    4138        SimpleEntity("gt", ">");
  • trunk/src/xmlmodel.h

    r126 r135  
    3131
    3232#include "contentmodel.h"
     33#include "symtab.h"
    3334enum XML_version {XML_1_0, XML_1_1, no_XML_version_value};
    3435/* Documents may be encoded in accord with either XML 1.0 or XML 1.1,
     
    8485/* Possible kinds of attribute default in ATTLIST declarations. */
    8586
    86 struct eqstr
    87 {
    88   bool operator()(const char* s1, const char* s2) const
    89   {
    90     return strcmp(s1, s2) == 0;
    91   }
    92 };
    9387
    9488class ATT_info {
     
    170164        char * external_DTD_systemLiteral;
    171165        char * external_DTD_pubidLiteral;       
    172 
     166        Symbol_Table * symbol_table;
    173167   
    174168        /* Information computed from ATTLIST, ELEMENT, NOTATION and ENTITY declarations. */
     
    180174        hash_map<int, int > GlobalPEntityTable;
    181175       
    182         hash_map<const char *, int, hash<const char *>, eqstr > GlobalNameTable;
     176       
    183177        int globalElementCount;
    184178        int globalAttributeCount;
     
    186180        int globalGEntityCount;
    187181        int globalPEntityCount;
    188         int globalNameCount;
    189182    /* For each element, we have an ElementAttributeModel */
    190183        vector<vector<ATT_info *> > ElementAttributeData;
Note: See TracChangeset for help on using the changeset viewer.