Changeset 825


Ignore:
Timestamp:
Dec 16, 2010, 7:05:41 AM (8 years ago)
Author:
cameron
Message:

Move bitmap-based NameStart/NameChar tests into namechars.h

Location:
trunk/src
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • trunk/src/namechars.h

    r122 r825  
    11/* namechars.h - Bitset maps for name characters.
    2     Copyright (c) 2008, Robert D. Cameron.
     2    Copyright (c) 2008, 2010 Robert D. Cameron.
    33    Licensed to the public under the Open Software License 3.0.
    44    Licensed to International Characters, Inc., under the Academic
     
    1313#ifndef NAMECHARS_H
    1414#define NAMECHARS_H
    15 
    16 #ifndef _MSC_VER
    17 #include <stdint.h>
    18 #endif
    19 #ifdef _MSC_VER
    20 #include "../../lib/stdint.h"
    21 #endif
    22 
    2315uint8_t NameStrt_XML10_0000_11FF[] = {
    2416    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x20,
     
    255247    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF};
    256248
     249
     250
     251static inline bool bit_test(const unsigned char bit_Map[], const int codepoint) {
     252        return (bit_Map[codepoint>>3] >> (7 - (codepoint & 7))) & 1;
     253}
     254
     255/*  Is a given codepoint a legal NameStart character in XML 1.0 up to 4th edition? */
     256static inline bool is_XML10_NameStrt_codepoint(const int codepoint) {
     257        switch (codepoint >> 12) {
     258                case 0: return bit_test(NameStrt_XML10_0000_11FF, codepoint);
     259                case 1: if (codepoint <= 0x11FF)
     260                                return bit_test(NameStrt_XML10_0000_11FF, codepoint);
     261                        else if (codepoint < 0x1E00) return false;
     262                        else return bit_test(NameStrt_XML10_1E00_1FFF, codepoint & 0x1FF);
     263                case 2: if (codepoint > 0x2182) return false;
     264                        else return bit_test(NameStrt_XML10_2000_21FF, codepoint & 0x1FF);
     265                case 3: if (codepoint > 0x312C) return false;
     266                        else return bit_test(NameStrt_XML10_3000_31FF, codepoint & 0x1FF);
     267                case 4: return codepoint >= 0x4E00;
     268                case 5: case 6: case 7: case 8: return true;
     269                case 9: return codepoint <= 0x9FA5;
     270                case 0xA: return codepoint >= 0xAC00;
     271                case 0xB: case 0xC: return true;
     272                case 0xD: return codepoint <= 0xD7A3;
     273                default: return false;
     274        }
     275}
     276
     277/*  Is a given codepoint a legal Name character in XML 1.0 up to 4th edition? */
     278static inline bool is_XML10_NameChar_codepoint(const int codepoint) {
     279        switch (codepoint >> 12) {
     280                case 0: return bit_test(NameChar_XML10_0000_11FF, codepoint);
     281                case 1: if (codepoint <= 0x11FF)
     282                                return bit_test(NameChar_XML10_0000_11FF, codepoint);
     283                        else if (codepoint < 0x1E00) return false;
     284                        else return bit_test(NameStrt_XML10_1E00_1FFF, codepoint & 0x1FF);
     285                case 2: if (codepoint > 0x2182) return false;
     286                        else return bit_test(NameChar_XML10_2000_21FF, codepoint & 0x1FF);
     287                case 3: if (codepoint > 0x312C) return false;
     288                        else return bit_test(NameChar_XML10_3000_31FF, codepoint & 0x1FF);
     289                case 4: return codepoint >= 0x4E00;
     290                case 5: case 6: case 7: case 8: return true;
     291                case 9: return codepoint <= 0x9FA5;
     292                case 0xA:       return codepoint >= 0xAC00;
     293                case 0xB: case 0xC: return true;
     294                case 0xD: return codepoint <= 0xD7A3;
     295                default: return false;
     296        }
     297}
     298
     299/*  Is a given codepoint a legal NameStart character in XML 1.0 5e or XML 1.1? */
     300static inline bool is_XML11_NameStrt_codepoint(const int codepoint) {
     301        if (likely(codepoint) <= 0x03FF) return bit_test(NameStrt_XML11_0000_03FF, codepoint);
     302        else switch (codepoint >> 12) {
     303                case 0: case 1: return true;
     304                case 2: if (codepoint >= 0x2070)
     305                                if (codepoint <= 0x218F) return true;
     306                                else return (codepoint >= 0x2C00) & (codepoint <= 0x2FEF);
     307                        else return (codepoint >= 0x200C) & (codepoint <= 0x200D);
     308                case 3: return codepoint >= 0x3001;
     309                case 4: case 5: case 6: case 7: case 8: case 9: case 0xA: case 0xB: case 0xC: return true;
     310                case 0xD: return codepoint <= 0xD7FF;
     311                case 0xE: return false;
     312                case 0xF: if (codepoint <= 0xFDCF) return codepoint >= 0xF900;
     313                          else return (codepoint >= 0xFDF0) & (codepoint <= 0xFFFD);
     314                default: return codepoint <= 0xEFFFF;
     315        }
     316}
     317
     318/*  Is a given codepoint a legal Name character in XML 1.0 5e or XML 1.1? */
     319static inline bool is_XML11_NameChar_codepoint(const int codepoint) {
     320        if (likely(codepoint) <= 0x03FF) return bit_test(NameChar_XML11_0000_03FF, codepoint);
     321        else switch (codepoint >> 12) {
     322                case 0: case 1: return true;
     323                case 2: if (codepoint >= 0x2070)
     324                                if (codepoint <= 0x218F) return true;
     325                                else return (codepoint >= 0x2C00) & (codepoint <= 0x2FEF);
     326                        else if (codepoint <= 0x200D) return codepoint >= 0x200C;
     327                        else return (codepoint == 0x203F) | (codepoint == 0x2040);
     328                case 3: return codepoint >= 0x3001;
     329                case 4: case 5: case 6: case 7: case 8: case 9: case 0xA: case 0xB: case 0xC: return true;
     330                case 0xD: return codepoint <= 0xD7FF;
     331                case 0xE: return false;
     332                case 0xF: if (codepoint <= 0xFDCF) return codepoint >= 0xF900;
     333                          else return (codepoint >= 0xFDF0) & (codepoint <= 0xFFFD);
     334                default: return codepoint <= 0xEFFFF;
     335        }
     336}
     337
     338/*  Return the number of UTF-8 bytes comprising a legal NameStart character,
     339    0 if the byte array does not begin with a valid character. 
     340    Use XML 1.0 rules up to 4th edition */
     341
     342static inline int XML_10_UTF8_NameStrt_bytes (const unsigned char bytes[]) {
     343        if (bytes[0] <= 0x7F) {
     344                if (bit_test(NameStrt_XML10_0000_11FF, (int) bytes[0])) return 1;
     345                else return 0;
     346        }
     347        else if (bytes[0] <= 0xDF) {
     348                int codepoint = ((bytes[0] & 0x3F) << 6) | (bytes[1] & 0x3F);
     349                if (bit_test(NameStrt_XML10_0000_11FF, codepoint)) return 2;
     350                else return 0;
     351        }
     352        else if (bytes[0] <= 0xEF) {
     353                int codepoint = ((bytes[0] & 0x0F) << 12)| ((bytes[1] & 0x3F) << 6) | (bytes[2] & 0x3F);
     354                return is_XML10_NameStrt_codepoint(codepoint) ? 3 : 0;
     355        }
     356        else return 0;
     357}
     358
     359/*  Return the number of UTF-8 bytes comprising a legal Name character,
     360    0 if the byte array does not begin with a valid character. 
     361    Use XML 1.0 rules up to 4th edition */
     362
     363static inline int XML_10_UTF8_NameChar_bytes (const unsigned char bytes[]) {
     364        if (bytes[0] <= 0x7F) {
     365                if (bit_test(NameChar_XML10_0000_11FF, (int) bytes[0])) return 1;
     366                else return 0;
     367        }
     368        else if (bytes[0] <= 0xDF) {
     369                int codepoint = ((bytes[0] & 0x3F) << 6) | (bytes[1] & 0x3F);
     370                if (bit_test(NameChar_XML10_0000_11FF, codepoint)) return 2;
     371                else return 0;
     372        }
     373        else if (bytes[0] <= 0xEF) {
     374                int codepoint = ((bytes[0] & 0x0F) << 12)| ((bytes[1] & 0x3F) << 6) | (bytes[2] & 0x3F);
     375                return is_XML10_NameChar_codepoint(codepoint) ? 3 : 0;
     376        }
     377        else return 0;
     378}
     379
     380/*  Return the number of UTF-8 bytes comprising a legal NameStart character,
     381    0 if the byte array does not begin with a valid character. 
     382    Use rules for XML 5th edition/XML 1.1. */
     383
     384static inline int XML_11_UTF8_NameStrt_bytes (const unsigned char bytes[]) {
     385        if (bytes[0] <= 0x7F) {
     386                if (bit_test(NameStrt_XML11_0000_03FF, (int) bytes[0])) return 1;
     387                else return 0;
     388        }
     389        else if (bytes[0] <= 0xDF) {
     390                int codepoint = ((bytes[0] & 0x3F) << 6) | (bytes[1] & 0x3F);
     391                return is_XML11_NameStrt_codepoint(codepoint) ? 2 : 0;
     392        }
     393        else if (bytes[0] <= 0xEF) {
     394                int codepoint = ((bytes[0] & 0x0F) << 12)| ((bytes[1] & 0x3F) << 6) | (bytes[2] & 0x3F);
     395                return is_XML11_NameStrt_codepoint(codepoint) ? 3 : 0;
     396        }
     397        else {
     398                int codepoint = ((bytes[0] & 0x0F) << 18)| ((bytes[1] & 0x3F) << 12) |
     399                                ((bytes[2] & 0x3F) << 6) | (bytes[3] & 0x3F);
     400                return is_XML11_NameStrt_codepoint(codepoint) ? 4 : 0;
     401        }
     402}
     403
     404/*  Return the number of UTF-8 bytes comprising a legal Name character,
     405    0 if the byte array does not begin with a valid character. 
     406    Use XML 1.0 rules up to 4th edition */
     407
     408static inline int XML_11_UTF8_NameChar_bytes (const unsigned char bytes[]) {
     409        if (bytes[0] <= 0x7F) {
     410                if (bit_test(NameChar_XML11_0000_03FF, (int) bytes[0])) return 1;
     411                else return 0;
     412        }
     413        else if (bytes[0] <= 0xDF) {
     414                int codepoint = ((bytes[0] & 0x3F) << 6) | (bytes[1] & 0x3F);
     415                return is_XML11_NameChar_codepoint(codepoint) ? 2 : 0;
     416        }
     417        else if (bytes[0] <= 0xEF) {
     418                int codepoint = ((bytes[0] & 0x0F) << 12)| ((bytes[1] & 0x3F) << 6) | (bytes[2] & 0x3F);
     419                return is_XML11_NameChar_codepoint(codepoint) ? 3 : 0;
     420        }
     421        else {
     422                int codepoint = ((bytes[0] & 0x0F) << 18)| ((bytes[1] & 0x3F) << 12) |
     423                                ((bytes[2] & 0x3F) << 6) | (bytes[3] & 0x3F);
     424                return is_XML11_NameChar_codepoint(codepoint) ? 4 : 0;
     425        }
     426}
     427
    257428#endif
  • trunk/src/symtab.c

    r267 r825  
    11#include "symtab.h"
    2 
    3 
    4 inline bool bit_test(unsigned char * bit_Map, int codepoint) {
    5         return (bit_Map[codepoint/8] >> (7 - codepoint % 8)) & 1;
    6 }
    7 
    8 
    9 bool is_XML10_NameStrt_codepoint(int codepoint) {
    10         switch (codepoint >> 12) {
    11                 case 0: return bit_test(NameStrt_XML10_0000_11FF, codepoint);
    12                 case 1: if (codepoint <= 0x11FF)
    13                                 return bit_test(NameStrt_XML10_0000_11FF, codepoint);
    14                         else if (codepoint < 0x1E00) return false;
    15                         else return bit_test(NameStrt_XML10_1E00_1FFF, codepoint & 0x1FF);
    16                 case 2: if (codepoint > 0x2182) return false;
    17                         else return bit_test(NameStrt_XML10_2000_21FF, codepoint & 0x1FF);
    18                 case 3: if (codepoint > 0x312C) return false;
    19                         else return bit_test(NameStrt_XML10_3000_31FF, codepoint & 0x1FF);
    20                 case 4: return codepoint >= 0x4E00;
    21                 case 5: case 6: case 7: case 8: return true;
    22                 case 9: return codepoint <= 0x9FA5;
    23                 case 0xA: return codepoint >= 0xAC00;
    24                 case 0xB: case 0xC: return true;
    25                 case 0xD: return codepoint <= 0xD7A3;
    26                 default: return false;
    27         }
    28 }
    29 
    30 bool is_XML10_NameChar_codepoint(int codepoint) {
    31         switch (codepoint >> 12) {
    32                 case 0: return bit_test(NameChar_XML10_0000_11FF, codepoint);
    33                 case 1: if (codepoint <= 0x11FF)
    34                                 return bit_test(NameChar_XML10_0000_11FF, codepoint);
    35                         else if (codepoint < 0x1E00) return false;
    36                         else return bit_test(NameStrt_XML10_1E00_1FFF, codepoint & 0x1FF);
    37                 case 2: if (codepoint > 0x2182) return false;
    38                         else return bit_test(NameChar_XML10_2000_21FF, codepoint & 0x1FF);
    39                 case 3: if (codepoint > 0x312C) return false;
    40                         else return bit_test(NameChar_XML10_3000_31FF, codepoint & 0x1FF);
    41                 case 4: return codepoint >= 0x4E00;
    42                 case 5: case 6: case 7: case 8: return true;
    43                 case 9: return codepoint <= 0x9FA5;
    44                 case 0xA:       return codepoint >= 0xAC00;
    45                 case 0xB: case 0xC: return true;
    46                 case 0xD: return codepoint <= 0xD7A3;
    47                 default: return false;
    48         }
    49 }
    50 
    51 bool is_XML11_NameStrt_codepoint(int codepoint) {
    52         if (likely(codepoint) <= 0x03FF) return bit_test(NameStrt_XML11_0000_03FF, codepoint);
    53         else switch (codepoint >> 12) {
    54                 case 0: case 1: return true;
    55                 case 2: if (codepoint >= 0x2070)
    56                                 if (codepoint <= 0x218F) return true;
    57                                 else return (codepoint >= 0x2C00) & (codepoint <= 0x2FEF);
    58                         else return (codepoint >= 0x200C) & (codepoint <= 0x200D);
    59                 case 3: return codepoint >= 0x3001;
    60                 case 4: case 5: case 6: case 7: case 8: case 9: case 0xA: case 0xB: case 0xC: return true;
    61                 case 0xD: return codepoint <= 0xD7FF;
    62                 case 0xE: return false;
    63                 case 0xF: if (codepoint <= 0xFDCF) return codepoint >= 0xF900;
    64                           else return (codepoint >= 0xFDF0) & (codepoint <= 0xFFFD);
    65                 default: return codepoint <= 0xEFFFF;
    66         }
    67 }
    68 
    69 bool is_XML11_NameChar_codepoint(int codepoint) {
    70         if (likely(codepoint) <= 0x03FF) return bit_test(NameChar_XML11_0000_03FF, codepoint);
    71         else switch (codepoint >> 12) {
    72                 case 0: case 1: return true;
    73                 case 2: if (codepoint >= 0x2070)
    74                                 if (codepoint <= 0x218F) return true;
    75                                 else return (codepoint >= 0x2C00) & (codepoint <= 0x2FEF);
    76                         else if (codepoint <= 0x200D) return codepoint >= 0x200C;
    77                         else return (codepoint == 0x203F) | (codepoint == 0x2040);
    78                 case 3: return codepoint >= 0x3001;
    79                 case 4: case 5: case 6: case 7: case 8: case 9: case 0xA: case 0xB: case 0xC: return true;
    80                 case 0xD: return codepoint <= 0xD7FF;
    81                 case 0xE: return false;
    82                 case 0xF: if (codepoint <= 0xFDCF) return codepoint >= 0xF900;
    83                           else return (codepoint >= 0xFDF0) & (codepoint <= 0xFFFD);
    84                 default: return codepoint <= 0xEFFFF;
    85         }
    86 }
    87 
    88 inline int XML_10_UTF8_NameStrt_bytes (unsigned char bytes[]) {
    89         if (bytes[0] <= 0x7F) {
    90                 if (bit_test(NameStrt_XML10_0000_11FF, (int) bytes[0])) return 1;
    91                 else return 0;
    92         }
    93         else if (bytes[0] <= 0xDF) {
    94                 int codepoint = ((bytes[0] & 0x3F) << 6) | (bytes[1] & 0x3F);
    95                 if (bit_test(NameStrt_XML10_0000_11FF, codepoint)) return 2;
    96                 else return 0;
    97         }
    98         else if (bytes[0] <= 0xEF) {
    99                 int codepoint = ((bytes[0] & 0x0F) << 12)| ((bytes[1] & 0x3F) << 6) | (bytes[2] & 0x3F);
    100                 return is_XML10_NameStrt_codepoint(codepoint) ? 3 : 0;
    101         }
    102         else return 0;
    103 }
    104 
    105 inline int XML_10_UTF8_NameChar_bytes (unsigned char bytes[]) {
    106         if (bytes[0] <= 0x7F) {
    107                 if (bit_test(NameChar_XML10_0000_11FF, (int) bytes[0])) return 1;
    108                 else return 0;
    109         }
    110         else if (bytes[0] <= 0xDF) {
    111                 int codepoint = ((bytes[0] & 0x3F) << 6) | (bytes[1] & 0x3F);
    112                 if (bit_test(NameChar_XML10_0000_11FF, codepoint)) return 2;
    113                 else return 0;
    114         }
    115         else if (bytes[0] <= 0xEF) {
    116                 int codepoint = ((bytes[0] & 0x0F) << 12)| ((bytes[1] & 0x3F) << 6) | (bytes[2] & 0x3F);
    117                 return is_XML10_NameChar_codepoint(codepoint) ? 3 : 0;
    118         }
    119         else return 0;
    120 }
    121 
    122 inline int XML_11_UTF8_NameStrt_bytes (unsigned char bytes[]) {
    123         if (bytes[0] <= 0x7F) {
    124                 if (bit_test(NameStrt_XML11_0000_03FF, (int) bytes[0])) return 1;
    125                 else return 0;
    126         }
    127         else if (bytes[0] <= 0xDF) {
    128                 int codepoint = ((bytes[0] & 0x3F) << 6) | (bytes[1] & 0x3F);
    129                 return is_XML11_NameStrt_codepoint(codepoint) ? 2 : 0;
    130         }
    131         else if (bytes[0] <= 0xEF) {
    132                 int codepoint = ((bytes[0] & 0x0F) << 12)| ((bytes[1] & 0x3F) << 6) | (bytes[2] & 0x3F);
    133                 return is_XML11_NameStrt_codepoint(codepoint) ? 3 : 0;
    134         }
    135         else {
    136                 int codepoint = ((bytes[0] & 0x0F) << 18)| ((bytes[1] & 0x3F) << 12) |
    137                                 ((bytes[2] & 0x3F) << 6) | (bytes[3] & 0x3F);
    138                 return is_XML11_NameStrt_codepoint(codepoint) ? 4 : 0;
    139         }
    140 }
    141 
    142 inline int XML_11_UTF8_NameChar_bytes (unsigned char bytes[]) {
    143         if (bytes[0] <= 0x7F) {
    144                 if (bit_test(NameChar_XML11_0000_03FF, (int) bytes[0])) return 1;
    145                 else return 0;
    146         }
    147         else if (bytes[0] <= 0xDF) {
    148                 int codepoint = ((bytes[0] & 0x3F) << 6) | (bytes[1] & 0x3F);
    149                 return is_XML11_NameChar_codepoint(codepoint) ? 2 : 0;
    150         }
    151         else if (bytes[0] <= 0xEF) {
    152                 int codepoint = ((bytes[0] & 0x0F) << 12)| ((bytes[1] & 0x3F) << 6) | (bytes[2] & 0x3F);
    153                 return is_XML11_NameChar_codepoint(codepoint) ? 3 : 0;
    154         }
    155         else {
    156                 int codepoint = ((bytes[0] & 0x0F) << 18)| ((bytes[1] & 0x3F) << 12) |
    157                                 ((bytes[2] & 0x3F) << 6) | (bytes[3] & 0x3F);
    158                 return is_XML11_NameChar_codepoint(codepoint) ? 4 : 0;
    159         }
    160 }
    1612
    1623bool is_XML10_UTF8_Name(char protoname[], int lgth) {
  • trunk/src/xml_chars.py

    r220 r825  
    175175       
    176176namechars_header = r"""/* namechars.h - Bitset maps for name characters.
    177     Copyright (c) 2008, Robert D. Cameron.
     177    Copyright (c) 2008, 2010 Robert D. Cameron.
    178178    Licensed to the public under the Open Software License 3.0.
    179179    Licensed to International Characters, Inc., under the Academic
     
    188188#ifndef NAMECHARS_H
    189189#define NAMECHARS_H
    190 
    191 #ifndef _MSC_VER
    192 #include <stdint.h>
    193 #endif
    194 #ifdef _MSC_VER
    195 #include "../../lib/stdint.h"
    196 #endif
     190"""
     191
     192namechars_inlines = r"""
     193
     194static inline bool bit_test(const unsigned char bit_Map[], const int codepoint) {
     195        return (bit_Map[codepoint>>3] >> (7 - (codepoint & 7))) & 1;
     196}
     197
     198/*  Is a given codepoint a legal NameStart character in XML 1.0 up to 4th edition? */
     199static inline bool is_XML10_NameStrt_codepoint(const int codepoint) {
     200        switch (codepoint >> 12) {
     201                case 0: return bit_test(NameStrt_XML10_0000_11FF, codepoint);
     202                case 1: if (codepoint <= 0x11FF)
     203                                return bit_test(NameStrt_XML10_0000_11FF, codepoint);
     204                        else if (codepoint < 0x1E00) return false;
     205                        else return bit_test(NameStrt_XML10_1E00_1FFF, codepoint & 0x1FF);
     206                case 2: if (codepoint > 0x2182) return false;
     207                        else return bit_test(NameStrt_XML10_2000_21FF, codepoint & 0x1FF);
     208                case 3: if (codepoint > 0x312C) return false;
     209                        else return bit_test(NameStrt_XML10_3000_31FF, codepoint & 0x1FF);
     210                case 4: return codepoint >= 0x4E00;
     211                case 5: case 6: case 7: case 8: return true;
     212                case 9: return codepoint <= 0x9FA5;
     213                case 0xA: return codepoint >= 0xAC00;
     214                case 0xB: case 0xC: return true;
     215                case 0xD: return codepoint <= 0xD7A3;
     216                default: return false;
     217        }
     218}
     219
     220/*  Is a given codepoint a legal Name character in XML 1.0 up to 4th edition? */
     221static inline bool is_XML10_NameChar_codepoint(const int codepoint) {
     222        switch (codepoint >> 12) {
     223                case 0: return bit_test(NameChar_XML10_0000_11FF, codepoint);
     224                case 1: if (codepoint <= 0x11FF)
     225                                return bit_test(NameChar_XML10_0000_11FF, codepoint);
     226                        else if (codepoint < 0x1E00) return false;
     227                        else return bit_test(NameStrt_XML10_1E00_1FFF, codepoint & 0x1FF);
     228                case 2: if (codepoint > 0x2182) return false;
     229                        else return bit_test(NameChar_XML10_2000_21FF, codepoint & 0x1FF);
     230                case 3: if (codepoint > 0x312C) return false;
     231                        else return bit_test(NameChar_XML10_3000_31FF, codepoint & 0x1FF);
     232                case 4: return codepoint >= 0x4E00;
     233                case 5: case 6: case 7: case 8: return true;
     234                case 9: return codepoint <= 0x9FA5;
     235                case 0xA:       return codepoint >= 0xAC00;
     236                case 0xB: case 0xC: return true;
     237                case 0xD: return codepoint <= 0xD7A3;
     238                default: return false;
     239        }
     240}
     241
     242/*  Is a given codepoint a legal NameStart character in XML 1.0 5e or XML 1.1? */
     243static inline bool is_XML11_NameStrt_codepoint(const int codepoint) {
     244        if (likely(codepoint) <= 0x03FF) return bit_test(NameStrt_XML11_0000_03FF, codepoint);
     245        else switch (codepoint >> 12) {
     246                case 0: case 1: return true;
     247                case 2: if (codepoint >= 0x2070)
     248                                if (codepoint <= 0x218F) return true;
     249                                else return (codepoint >= 0x2C00) & (codepoint <= 0x2FEF);
     250                        else return (codepoint >= 0x200C) & (codepoint <= 0x200D);
     251                case 3: return codepoint >= 0x3001;
     252                case 4: case 5: case 6: case 7: case 8: case 9: case 0xA: case 0xB: case 0xC: return true;
     253                case 0xD: return codepoint <= 0xD7FF;
     254                case 0xE: return false;
     255                case 0xF: if (codepoint <= 0xFDCF) return codepoint >= 0xF900;
     256                          else return (codepoint >= 0xFDF0) & (codepoint <= 0xFFFD);
     257                default: return codepoint <= 0xEFFFF;
     258        }
     259}
     260
     261/*  Is a given codepoint a legal Name character in XML 1.0 5e or XML 1.1? */
     262static inline bool is_XML11_NameChar_codepoint(const int codepoint) {
     263        if (likely(codepoint) <= 0x03FF) return bit_test(NameChar_XML11_0000_03FF, codepoint);
     264        else switch (codepoint >> 12) {
     265                case 0: case 1: return true;
     266                case 2: if (codepoint >= 0x2070)
     267                                if (codepoint <= 0x218F) return true;
     268                                else return (codepoint >= 0x2C00) & (codepoint <= 0x2FEF);
     269                        else if (codepoint <= 0x200D) return codepoint >= 0x200C;
     270                        else return (codepoint == 0x203F) | (codepoint == 0x2040);
     271                case 3: return codepoint >= 0x3001;
     272                case 4: case 5: case 6: case 7: case 8: case 9: case 0xA: case 0xB: case 0xC: return true;
     273                case 0xD: return codepoint <= 0xD7FF;
     274                case 0xE: return false;
     275                case 0xF: if (codepoint <= 0xFDCF) return codepoint >= 0xF900;
     276                          else return (codepoint >= 0xFDF0) & (codepoint <= 0xFFFD);
     277                default: return codepoint <= 0xEFFFF;
     278        }
     279}
     280
     281/*  Return the number of UTF-8 bytes comprising a legal NameStart character,
     282    0 if the byte array does not begin with a valid character. 
     283    Use XML 1.0 rules up to 4th edition */
     284
     285static inline int XML_10_UTF8_NameStrt_bytes (const unsigned char bytes[]) {
     286        if (bytes[0] <= 0x7F) {
     287                if (bit_test(NameStrt_XML10_0000_11FF, (int) bytes[0])) return 1;
     288                else return 0;
     289        }
     290        else if (bytes[0] <= 0xDF) {
     291                int codepoint = ((bytes[0] & 0x3F) << 6) | (bytes[1] & 0x3F);
     292                if (bit_test(NameStrt_XML10_0000_11FF, codepoint)) return 2;
     293                else return 0;
     294        }
     295        else if (bytes[0] <= 0xEF) {
     296                int codepoint = ((bytes[0] & 0x0F) << 12)| ((bytes[1] & 0x3F) << 6) | (bytes[2] & 0x3F);
     297                return is_XML10_NameStrt_codepoint(codepoint) ? 3 : 0;
     298        }
     299        else return 0;
     300}
     301
     302/*  Return the number of UTF-8 bytes comprising a legal Name character,
     303    0 if the byte array does not begin with a valid character. 
     304    Use XML 1.0 rules up to 4th edition */
     305
     306static inline int XML_10_UTF8_NameChar_bytes (const unsigned char bytes[]) {
     307        if (bytes[0] <= 0x7F) {
     308                if (bit_test(NameChar_XML10_0000_11FF, (int) bytes[0])) return 1;
     309                else return 0;
     310        }
     311        else if (bytes[0] <= 0xDF) {
     312                int codepoint = ((bytes[0] & 0x3F) << 6) | (bytes[1] & 0x3F);
     313                if (bit_test(NameChar_XML10_0000_11FF, codepoint)) return 2;
     314                else return 0;
     315        }
     316        else if (bytes[0] <= 0xEF) {
     317                int codepoint = ((bytes[0] & 0x0F) << 12)| ((bytes[1] & 0x3F) << 6) | (bytes[2] & 0x3F);
     318                return is_XML10_NameChar_codepoint(codepoint) ? 3 : 0;
     319        }
     320        else return 0;
     321}
     322
     323/*  Return the number of UTF-8 bytes comprising a legal NameStart character,
     324    0 if the byte array does not begin with a valid character. 
     325    Use rules for XML 5th edition/XML 1.1. */
     326
     327static inline int XML_11_UTF8_NameStrt_bytes (const unsigned char bytes[]) {
     328        if (bytes[0] <= 0x7F) {
     329                if (bit_test(NameStrt_XML11_0000_03FF, (int) bytes[0])) return 1;
     330                else return 0;
     331        }
     332        else if (bytes[0] <= 0xDF) {
     333                int codepoint = ((bytes[0] & 0x3F) << 6) | (bytes[1] & 0x3F);
     334                return is_XML11_NameStrt_codepoint(codepoint) ? 2 : 0;
     335        }
     336        else if (bytes[0] <= 0xEF) {
     337                int codepoint = ((bytes[0] & 0x0F) << 12)| ((bytes[1] & 0x3F) << 6) | (bytes[2] & 0x3F);
     338                return is_XML11_NameStrt_codepoint(codepoint) ? 3 : 0;
     339        }
     340        else {
     341                int codepoint = ((bytes[0] & 0x0F) << 18)| ((bytes[1] & 0x3F) << 12) |
     342                                ((bytes[2] & 0x3F) << 6) | (bytes[3] & 0x3F);
     343                return is_XML11_NameStrt_codepoint(codepoint) ? 4 : 0;
     344        }
     345}
     346
     347/*  Return the number of UTF-8 bytes comprising a legal Name character,
     348    0 if the byte array does not begin with a valid character. 
     349    Use XML 1.0 rules up to 4th edition */
     350
     351static inline int XML_11_UTF8_NameChar_bytes (const unsigned char bytes[]) {
     352        if (bytes[0] <= 0x7F) {
     353                if (bit_test(NameChar_XML11_0000_03FF, (int) bytes[0])) return 1;
     354                else return 0;
     355        }
     356        else if (bytes[0] <= 0xDF) {
     357                int codepoint = ((bytes[0] & 0x3F) << 6) | (bytes[1] & 0x3F);
     358                return is_XML11_NameChar_codepoint(codepoint) ? 2 : 0;
     359        }
     360        else if (bytes[0] <= 0xEF) {
     361                int codepoint = ((bytes[0] & 0x0F) << 12)| ((bytes[1] & 0x3F) << 6) | (bytes[2] & 0x3F);
     362                return is_XML11_NameChar_codepoint(codepoint) ? 3 : 0;
     363        }
     364        else {
     365                int codepoint = ((bytes[0] & 0x0F) << 18)| ((bytes[1] & 0x3F) << 12) |
     366                                ((bytes[2] & 0x3F) << 6) | (bytes[3] & 0x3F);
     367                return is_XML11_NameChar_codepoint(codepoint) ? 4 : 0;
     368        }
     369}
    197370
    198371"""
     
    215388        f.write(make_C_bit_map('NameStrt_XML11_0000_03FF', nsm11, 0, 0x03FF))
    216389        f.write(make_C_bit_map('NameChar_XML11_0000_03FF', ncm11, 0, 0x03FF))
     390        f.write(namechars_inlines);
    217391        f.write("#endif\n");
    218392        f.close()
Note: See TracChangeset for help on using the changeset viewer.