Changeset 825 for trunk/src/xml_chars.py


Ignore:
Timestamp:
Dec 16, 2010, 7:05:41 AM (8 years ago)
Author:
cameron
Message:

Move bitmap-based NameStart/NameChar tests into namechars.h

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/src/xml_chars.py

    r220 r825  
    175175       
    176176namechars_header = r"""/* namechars.h - Bitset maps for name characters.
    177     Copyright (c) 2008, Robert D. Cameron.
     177    Copyright (c) 2008, 2010 Robert D. Cameron.
    178178    Licensed to the public under the Open Software License 3.0.
    179179    Licensed to International Characters, Inc., under the Academic
     
    188188#ifndef NAMECHARS_H
    189189#define NAMECHARS_H
    190 
    191 #ifndef _MSC_VER
    192 #include <stdint.h>
    193 #endif
    194 #ifdef _MSC_VER
    195 #include "../../lib/stdint.h"
    196 #endif
     190"""
     191
     192namechars_inlines = r"""
     193
     194static inline bool bit_test(const unsigned char bit_Map[], const int codepoint) {
     195        return (bit_Map[codepoint>>3] >> (7 - (codepoint & 7))) & 1;
     196}
     197
     198/*  Is a given codepoint a legal NameStart character in XML 1.0 up to 4th edition? */
     199static inline bool is_XML10_NameStrt_codepoint(const int codepoint) {
     200        switch (codepoint >> 12) {
     201                case 0: return bit_test(NameStrt_XML10_0000_11FF, codepoint);
     202                case 1: if (codepoint <= 0x11FF)
     203                                return bit_test(NameStrt_XML10_0000_11FF, codepoint);
     204                        else if (codepoint < 0x1E00) return false;
     205                        else return bit_test(NameStrt_XML10_1E00_1FFF, codepoint & 0x1FF);
     206                case 2: if (codepoint > 0x2182) return false;
     207                        else return bit_test(NameStrt_XML10_2000_21FF, codepoint & 0x1FF);
     208                case 3: if (codepoint > 0x312C) return false;
     209                        else return bit_test(NameStrt_XML10_3000_31FF, codepoint & 0x1FF);
     210                case 4: return codepoint >= 0x4E00;
     211                case 5: case 6: case 7: case 8: return true;
     212                case 9: return codepoint <= 0x9FA5;
     213                case 0xA: return codepoint >= 0xAC00;
     214                case 0xB: case 0xC: return true;
     215                case 0xD: return codepoint <= 0xD7A3;
     216                default: return false;
     217        }
     218}
     219
     220/*  Is a given codepoint a legal Name character in XML 1.0 up to 4th edition? */
     221static inline bool is_XML10_NameChar_codepoint(const int codepoint) {
     222        switch (codepoint >> 12) {
     223                case 0: return bit_test(NameChar_XML10_0000_11FF, codepoint);
     224                case 1: if (codepoint <= 0x11FF)
     225                                return bit_test(NameChar_XML10_0000_11FF, codepoint);
     226                        else if (codepoint < 0x1E00) return false;
     227                        else return bit_test(NameStrt_XML10_1E00_1FFF, codepoint & 0x1FF);
     228                case 2: if (codepoint > 0x2182) return false;
     229                        else return bit_test(NameChar_XML10_2000_21FF, codepoint & 0x1FF);
     230                case 3: if (codepoint > 0x312C) return false;
     231                        else return bit_test(NameChar_XML10_3000_31FF, codepoint & 0x1FF);
     232                case 4: return codepoint >= 0x4E00;
     233                case 5: case 6: case 7: case 8: return true;
     234                case 9: return codepoint <= 0x9FA5;
     235                case 0xA:       return codepoint >= 0xAC00;
     236                case 0xB: case 0xC: return true;
     237                case 0xD: return codepoint <= 0xD7A3;
     238                default: return false;
     239        }
     240}
     241
     242/*  Is a given codepoint a legal NameStart character in XML 1.0 5e or XML 1.1? */
     243static inline bool is_XML11_NameStrt_codepoint(const int codepoint) {
     244        if (likely(codepoint) <= 0x03FF) return bit_test(NameStrt_XML11_0000_03FF, codepoint);
     245        else switch (codepoint >> 12) {
     246                case 0: case 1: return true;
     247                case 2: if (codepoint >= 0x2070)
     248                                if (codepoint <= 0x218F) return true;
     249                                else return (codepoint >= 0x2C00) & (codepoint <= 0x2FEF);
     250                        else return (codepoint >= 0x200C) & (codepoint <= 0x200D);
     251                case 3: return codepoint >= 0x3001;
     252                case 4: case 5: case 6: case 7: case 8: case 9: case 0xA: case 0xB: case 0xC: return true;
     253                case 0xD: return codepoint <= 0xD7FF;
     254                case 0xE: return false;
     255                case 0xF: if (codepoint <= 0xFDCF) return codepoint >= 0xF900;
     256                          else return (codepoint >= 0xFDF0) & (codepoint <= 0xFFFD);
     257                default: return codepoint <= 0xEFFFF;
     258        }
     259}
     260
     261/*  Is a given codepoint a legal Name character in XML 1.0 5e or XML 1.1? */
     262static inline bool is_XML11_NameChar_codepoint(const int codepoint) {
     263        if (likely(codepoint) <= 0x03FF) return bit_test(NameChar_XML11_0000_03FF, codepoint);
     264        else switch (codepoint >> 12) {
     265                case 0: case 1: return true;
     266                case 2: if (codepoint >= 0x2070)
     267                                if (codepoint <= 0x218F) return true;
     268                                else return (codepoint >= 0x2C00) & (codepoint <= 0x2FEF);
     269                        else if (codepoint <= 0x200D) return codepoint >= 0x200C;
     270                        else return (codepoint == 0x203F) | (codepoint == 0x2040);
     271                case 3: return codepoint >= 0x3001;
     272                case 4: case 5: case 6: case 7: case 8: case 9: case 0xA: case 0xB: case 0xC: return true;
     273                case 0xD: return codepoint <= 0xD7FF;
     274                case 0xE: return false;
     275                case 0xF: if (codepoint <= 0xFDCF) return codepoint >= 0xF900;
     276                          else return (codepoint >= 0xFDF0) & (codepoint <= 0xFFFD);
     277                default: return codepoint <= 0xEFFFF;
     278        }
     279}
     280
     281/*  Return the number of UTF-8 bytes comprising a legal NameStart character,
     282    0 if the byte array does not begin with a valid character. 
     283    Use XML 1.0 rules up to 4th edition */
     284
     285static inline int XML_10_UTF8_NameStrt_bytes (const unsigned char bytes[]) {
     286        if (bytes[0] <= 0x7F) {
     287                if (bit_test(NameStrt_XML10_0000_11FF, (int) bytes[0])) return 1;
     288                else return 0;
     289        }
     290        else if (bytes[0] <= 0xDF) {
     291                int codepoint = ((bytes[0] & 0x3F) << 6) | (bytes[1] & 0x3F);
     292                if (bit_test(NameStrt_XML10_0000_11FF, codepoint)) return 2;
     293                else return 0;
     294        }
     295        else if (bytes[0] <= 0xEF) {
     296                int codepoint = ((bytes[0] & 0x0F) << 12)| ((bytes[1] & 0x3F) << 6) | (bytes[2] & 0x3F);
     297                return is_XML10_NameStrt_codepoint(codepoint) ? 3 : 0;
     298        }
     299        else return 0;
     300}
     301
     302/*  Return the number of UTF-8 bytes comprising a legal Name character,
     303    0 if the byte array does not begin with a valid character. 
     304    Use XML 1.0 rules up to 4th edition */
     305
     306static inline int XML_10_UTF8_NameChar_bytes (const unsigned char bytes[]) {
     307        if (bytes[0] <= 0x7F) {
     308                if (bit_test(NameChar_XML10_0000_11FF, (int) bytes[0])) return 1;
     309                else return 0;
     310        }
     311        else if (bytes[0] <= 0xDF) {
     312                int codepoint = ((bytes[0] & 0x3F) << 6) | (bytes[1] & 0x3F);
     313                if (bit_test(NameChar_XML10_0000_11FF, codepoint)) return 2;
     314                else return 0;
     315        }
     316        else if (bytes[0] <= 0xEF) {
     317                int codepoint = ((bytes[0] & 0x0F) << 12)| ((bytes[1] & 0x3F) << 6) | (bytes[2] & 0x3F);
     318                return is_XML10_NameChar_codepoint(codepoint) ? 3 : 0;
     319        }
     320        else return 0;
     321}
     322
     323/*  Return the number of UTF-8 bytes comprising a legal NameStart character,
     324    0 if the byte array does not begin with a valid character. 
     325    Use rules for XML 5th edition/XML 1.1. */
     326
     327static inline int XML_11_UTF8_NameStrt_bytes (const unsigned char bytes[]) {
     328        if (bytes[0] <= 0x7F) {
     329                if (bit_test(NameStrt_XML11_0000_03FF, (int) bytes[0])) return 1;
     330                else return 0;
     331        }
     332        else if (bytes[0] <= 0xDF) {
     333                int codepoint = ((bytes[0] & 0x3F) << 6) | (bytes[1] & 0x3F);
     334                return is_XML11_NameStrt_codepoint(codepoint) ? 2 : 0;
     335        }
     336        else if (bytes[0] <= 0xEF) {
     337                int codepoint = ((bytes[0] & 0x0F) << 12)| ((bytes[1] & 0x3F) << 6) | (bytes[2] & 0x3F);
     338                return is_XML11_NameStrt_codepoint(codepoint) ? 3 : 0;
     339        }
     340        else {
     341                int codepoint = ((bytes[0] & 0x0F) << 18)| ((bytes[1] & 0x3F) << 12) |
     342                                ((bytes[2] & 0x3F) << 6) | (bytes[3] & 0x3F);
     343                return is_XML11_NameStrt_codepoint(codepoint) ? 4 : 0;
     344        }
     345}
     346
     347/*  Return the number of UTF-8 bytes comprising a legal Name character,
     348    0 if the byte array does not begin with a valid character. 
     349    Use XML 1.0 rules up to 4th edition */
     350
     351static inline int XML_11_UTF8_NameChar_bytes (const unsigned char bytes[]) {
     352        if (bytes[0] <= 0x7F) {
     353                if (bit_test(NameChar_XML11_0000_03FF, (int) bytes[0])) return 1;
     354                else return 0;
     355        }
     356        else if (bytes[0] <= 0xDF) {
     357                int codepoint = ((bytes[0] & 0x3F) << 6) | (bytes[1] & 0x3F);
     358                return is_XML11_NameChar_codepoint(codepoint) ? 2 : 0;
     359        }
     360        else if (bytes[0] <= 0xEF) {
     361                int codepoint = ((bytes[0] & 0x0F) << 12)| ((bytes[1] & 0x3F) << 6) | (bytes[2] & 0x3F);
     362                return is_XML11_NameChar_codepoint(codepoint) ? 3 : 0;
     363        }
     364        else {
     365                int codepoint = ((bytes[0] & 0x0F) << 18)| ((bytes[1] & 0x3F) << 12) |
     366                                ((bytes[2] & 0x3F) << 6) | (bytes[3] & 0x3F);
     367                return is_XML11_NameChar_codepoint(codepoint) ? 4 : 0;
     368        }
     369}
    197370
    198371"""
     
    215388        f.write(make_C_bit_map('NameStrt_XML11_0000_03FF', nsm11, 0, 0x03FF))
    216389        f.write(make_C_bit_map('NameChar_XML11_0000_03FF', ncm11, 0, 0x03FF))
     390        f.write(namechars_inlines);
    217391        f.write("#endif\n");
    218392        f.close()
Note: See TracChangeset for help on using the changeset viewer.