Changeset 825 for trunk/src/namechars.h


Ignore:
Timestamp:
Dec 16, 2010, 7:05:41 AM (8 years ago)
Author:
cameron
Message:

Move bitmap-based NameStart/NameChar tests into namechars.h

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/src/namechars.h

    r122 r825  
    11/* namechars.h - Bitset maps for name characters.
    2     Copyright (c) 2008, Robert D. Cameron.
     2    Copyright (c) 2008, 2010 Robert D. Cameron.
    33    Licensed to the public under the Open Software License 3.0.
    44    Licensed to International Characters, Inc., under the Academic
     
    1313#ifndef NAMECHARS_H
    1414#define NAMECHARS_H
    15 
    16 #ifndef _MSC_VER
    17 #include <stdint.h>
    18 #endif
    19 #ifdef _MSC_VER
    20 #include "../../lib/stdint.h"
    21 #endif
    22 
    2315uint8_t NameStrt_XML10_0000_11FF[] = {
    2416    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x20,
     
    255247    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF};
    256248
     249
     250
     251static inline bool bit_test(const unsigned char bit_Map[], const int codepoint) {
     252        return (bit_Map[codepoint>>3] >> (7 - (codepoint & 7))) & 1;
     253}
     254
     255/*  Is a given codepoint a legal NameStart character in XML 1.0 up to 4th edition? */
     256static inline bool is_XML10_NameStrt_codepoint(const int codepoint) {
     257        switch (codepoint >> 12) {
     258                case 0: return bit_test(NameStrt_XML10_0000_11FF, codepoint);
     259                case 1: if (codepoint <= 0x11FF)
     260                                return bit_test(NameStrt_XML10_0000_11FF, codepoint);
     261                        else if (codepoint < 0x1E00) return false;
     262                        else return bit_test(NameStrt_XML10_1E00_1FFF, codepoint & 0x1FF);
     263                case 2: if (codepoint > 0x2182) return false;
     264                        else return bit_test(NameStrt_XML10_2000_21FF, codepoint & 0x1FF);
     265                case 3: if (codepoint > 0x312C) return false;
     266                        else return bit_test(NameStrt_XML10_3000_31FF, codepoint & 0x1FF);
     267                case 4: return codepoint >= 0x4E00;
     268                case 5: case 6: case 7: case 8: return true;
     269                case 9: return codepoint <= 0x9FA5;
     270                case 0xA: return codepoint >= 0xAC00;
     271                case 0xB: case 0xC: return true;
     272                case 0xD: return codepoint <= 0xD7A3;
     273                default: return false;
     274        }
     275}
     276
     277/*  Is a given codepoint a legal Name character in XML 1.0 up to 4th edition? */
     278static inline bool is_XML10_NameChar_codepoint(const int codepoint) {
     279        switch (codepoint >> 12) {
     280                case 0: return bit_test(NameChar_XML10_0000_11FF, codepoint);
     281                case 1: if (codepoint <= 0x11FF)
     282                                return bit_test(NameChar_XML10_0000_11FF, codepoint);
     283                        else if (codepoint < 0x1E00) return false;
     284                        else return bit_test(NameStrt_XML10_1E00_1FFF, codepoint & 0x1FF);
     285                case 2: if (codepoint > 0x2182) return false;
     286                        else return bit_test(NameChar_XML10_2000_21FF, codepoint & 0x1FF);
     287                case 3: if (codepoint > 0x312C) return false;
     288                        else return bit_test(NameChar_XML10_3000_31FF, codepoint & 0x1FF);
     289                case 4: return codepoint >= 0x4E00;
     290                case 5: case 6: case 7: case 8: return true;
     291                case 9: return codepoint <= 0x9FA5;
     292                case 0xA:       return codepoint >= 0xAC00;
     293                case 0xB: case 0xC: return true;
     294                case 0xD: return codepoint <= 0xD7A3;
     295                default: return false;
     296        }
     297}
     298
     299/*  Is a given codepoint a legal NameStart character in XML 1.0 5e or XML 1.1? */
     300static inline bool is_XML11_NameStrt_codepoint(const int codepoint) {
     301        if (likely(codepoint) <= 0x03FF) return bit_test(NameStrt_XML11_0000_03FF, codepoint);
     302        else switch (codepoint >> 12) {
     303                case 0: case 1: return true;
     304                case 2: if (codepoint >= 0x2070)
     305                                if (codepoint <= 0x218F) return true;
     306                                else return (codepoint >= 0x2C00) & (codepoint <= 0x2FEF);
     307                        else return (codepoint >= 0x200C) & (codepoint <= 0x200D);
     308                case 3: return codepoint >= 0x3001;
     309                case 4: case 5: case 6: case 7: case 8: case 9: case 0xA: case 0xB: case 0xC: return true;
     310                case 0xD: return codepoint <= 0xD7FF;
     311                case 0xE: return false;
     312                case 0xF: if (codepoint <= 0xFDCF) return codepoint >= 0xF900;
     313                          else return (codepoint >= 0xFDF0) & (codepoint <= 0xFFFD);
     314                default: return codepoint <= 0xEFFFF;
     315        }
     316}
     317
     318/*  Is a given codepoint a legal Name character in XML 1.0 5e or XML 1.1? */
     319static inline bool is_XML11_NameChar_codepoint(const int codepoint) {
     320        if (likely(codepoint) <= 0x03FF) return bit_test(NameChar_XML11_0000_03FF, codepoint);
     321        else switch (codepoint >> 12) {
     322                case 0: case 1: return true;
     323                case 2: if (codepoint >= 0x2070)
     324                                if (codepoint <= 0x218F) return true;
     325                                else return (codepoint >= 0x2C00) & (codepoint <= 0x2FEF);
     326                        else if (codepoint <= 0x200D) return codepoint >= 0x200C;
     327                        else return (codepoint == 0x203F) | (codepoint == 0x2040);
     328                case 3: return codepoint >= 0x3001;
     329                case 4: case 5: case 6: case 7: case 8: case 9: case 0xA: case 0xB: case 0xC: return true;
     330                case 0xD: return codepoint <= 0xD7FF;
     331                case 0xE: return false;
     332                case 0xF: if (codepoint <= 0xFDCF) return codepoint >= 0xF900;
     333                          else return (codepoint >= 0xFDF0) & (codepoint <= 0xFFFD);
     334                default: return codepoint <= 0xEFFFF;
     335        }
     336}
     337
     338/*  Return the number of UTF-8 bytes comprising a legal NameStart character,
     339    0 if the byte array does not begin with a valid character. 
     340    Use XML 1.0 rules up to 4th edition */
     341
     342static inline int XML_10_UTF8_NameStrt_bytes (const unsigned char bytes[]) {
     343        if (bytes[0] <= 0x7F) {
     344                if (bit_test(NameStrt_XML10_0000_11FF, (int) bytes[0])) return 1;
     345                else return 0;
     346        }
     347        else if (bytes[0] <= 0xDF) {
     348                int codepoint = ((bytes[0] & 0x3F) << 6) | (bytes[1] & 0x3F);
     349                if (bit_test(NameStrt_XML10_0000_11FF, codepoint)) return 2;
     350                else return 0;
     351        }
     352        else if (bytes[0] <= 0xEF) {
     353                int codepoint = ((bytes[0] & 0x0F) << 12)| ((bytes[1] & 0x3F) << 6) | (bytes[2] & 0x3F);
     354                return is_XML10_NameStrt_codepoint(codepoint) ? 3 : 0;
     355        }
     356        else return 0;
     357}
     358
     359/*  Return the number of UTF-8 bytes comprising a legal Name character,
     360    0 if the byte array does not begin with a valid character. 
     361    Use XML 1.0 rules up to 4th edition */
     362
     363static inline int XML_10_UTF8_NameChar_bytes (const unsigned char bytes[]) {
     364        if (bytes[0] <= 0x7F) {
     365                if (bit_test(NameChar_XML10_0000_11FF, (int) bytes[0])) return 1;
     366                else return 0;
     367        }
     368        else if (bytes[0] <= 0xDF) {
     369                int codepoint = ((bytes[0] & 0x3F) << 6) | (bytes[1] & 0x3F);
     370                if (bit_test(NameChar_XML10_0000_11FF, codepoint)) return 2;
     371                else return 0;
     372        }
     373        else if (bytes[0] <= 0xEF) {
     374                int codepoint = ((bytes[0] & 0x0F) << 12)| ((bytes[1] & 0x3F) << 6) | (bytes[2] & 0x3F);
     375                return is_XML10_NameChar_codepoint(codepoint) ? 3 : 0;
     376        }
     377        else return 0;
     378}
     379
     380/*  Return the number of UTF-8 bytes comprising a legal NameStart character,
     381    0 if the byte array does not begin with a valid character. 
     382    Use rules for XML 5th edition/XML 1.1. */
     383
     384static inline int XML_11_UTF8_NameStrt_bytes (const unsigned char bytes[]) {
     385        if (bytes[0] <= 0x7F) {
     386                if (bit_test(NameStrt_XML11_0000_03FF, (int) bytes[0])) return 1;
     387                else return 0;
     388        }
     389        else if (bytes[0] <= 0xDF) {
     390                int codepoint = ((bytes[0] & 0x3F) << 6) | (bytes[1] & 0x3F);
     391                return is_XML11_NameStrt_codepoint(codepoint) ? 2 : 0;
     392        }
     393        else if (bytes[0] <= 0xEF) {
     394                int codepoint = ((bytes[0] & 0x0F) << 12)| ((bytes[1] & 0x3F) << 6) | (bytes[2] & 0x3F);
     395                return is_XML11_NameStrt_codepoint(codepoint) ? 3 : 0;
     396        }
     397        else {
     398                int codepoint = ((bytes[0] & 0x0F) << 18)| ((bytes[1] & 0x3F) << 12) |
     399                                ((bytes[2] & 0x3F) << 6) | (bytes[3] & 0x3F);
     400                return is_XML11_NameStrt_codepoint(codepoint) ? 4 : 0;
     401        }
     402}
     403
     404/*  Return the number of UTF-8 bytes comprising a legal Name character,
     405    0 if the byte array does not begin with a valid character. 
     406    Use XML 1.0 rules up to 4th edition */
     407
     408static inline int XML_11_UTF8_NameChar_bytes (const unsigned char bytes[]) {
     409        if (bytes[0] <= 0x7F) {
     410                if (bit_test(NameChar_XML11_0000_03FF, (int) bytes[0])) return 1;
     411                else return 0;
     412        }
     413        else if (bytes[0] <= 0xDF) {
     414                int codepoint = ((bytes[0] & 0x3F) << 6) | (bytes[1] & 0x3F);
     415                return is_XML11_NameChar_codepoint(codepoint) ? 2 : 0;
     416        }
     417        else if (bytes[0] <= 0xEF) {
     418                int codepoint = ((bytes[0] & 0x0F) << 12)| ((bytes[1] & 0x3F) << 6) | (bytes[2] & 0x3F);
     419                return is_XML11_NameChar_codepoint(codepoint) ? 3 : 0;
     420        }
     421        else {
     422                int codepoint = ((bytes[0] & 0x0F) << 18)| ((bytes[1] & 0x3F) << 12) |
     423                                ((bytes[2] & 0x3F) << 6) | (bytes[3] & 0x3F);
     424                return is_XML11_NameChar_codepoint(codepoint) ? 4 : 0;
     425        }
     426}
     427
    257428#endif
Note: See TracChangeset for help on using the changeset viewer.