Changeset 518 for proto/parabix2


Ignore:
Timestamp:
Jul 19, 2010, 4:44:03 PM (9 years ago)
Author:
lindanl
Message:

Name Validation

Location:
proto/parabix2
Files:
1 added
2 edited

Legend:

Unmodified
Added
Removed
  • proto/parabix2/parabix2_compilable.py

    r511 r518  
    657657        # Mark any occurrences of null names as errors.
    658658        ParseError = ElemNamePositions & ElemNameFollows
    659         #callouts.ElemNames = ElemNameFollows - ElemNamePositions
     659        callouts.ElemNames = ElemNameFollows - ElemNamePositions
    660660       
    661661        # Initialize the accumulators for attribute name and value positions.
     
    702702        # No more attribute values to process when AttNameStart == 0.
    703703        # Not needed for xmlwf
    704         #callouts.AttNames = AttNameFollows - AttNameStarts
     704        callouts.AttNames = AttNameFollows - AttNameStarts
    705705        #callouts.AttVals = AttValFollows - AttValStarts
    706706        STagEnds = AttListEnd & lex.RAngle
     
    753753
    754754        #return (CT_callouts, callouts, refs, u16hi, u16lo, delmask, error, lex, u16delmask, EOF_mask)
    755         return (CT_callouts, callouts, refs, delmask, error, lex, EOF_mask)
     755        return (CT_callouts, callouts, refs, delmask, error, lex, EOF_mask, name_check, name_start_check)
    756756
    757757def demo_parabix(u8data):
  • proto/parabix2/template.c

    r516 r518  
    1313#include "xml_error.c"
    1414#include "xmldecl.c"
     15#include "namechars.h"
    1516
    1617#include "../lib/perflib/perfsec.h"
     
    2930        void * parser_timer;
    3031#endif
     32
     33inline bool bit_test(unsigned char * bit_Map, int codepoint) {
     34        return (bit_Map[codepoint/8] >> (7 - codepoint % 8)) & 1;
     35}
     36
     37bool is_XML10_NameStrt_codepoint(int codepoint) {
     38        switch (codepoint >> 12) {
     39                case 0: return bit_test(NameStrt_XML10_0000_11FF, codepoint);
     40                case 1: if (codepoint <= 0x11FF)
     41                                return bit_test(NameStrt_XML10_0000_11FF, codepoint);
     42                        else if (codepoint < 0x1E00) return false;
     43                        else return bit_test(NameStrt_XML10_1E00_1FFF, codepoint & 0x1FF);
     44                case 2: if (codepoint > 0x2182) return false;
     45                        else return bit_test(NameStrt_XML10_2000_21FF, codepoint & 0x1FF);
     46                case 3: if (codepoint > 0x312C) return false;
     47                        else return bit_test(NameStrt_XML10_3000_31FF, codepoint & 0x1FF);
     48                case 4: return codepoint >= 0x4E00;
     49                case 5: case 6: case 7: case 8: return true;
     50                case 9: return codepoint <= 0x9FA5;
     51                case 0xA: return codepoint >= 0xAC00;
     52                case 0xB: case 0xC: return true;
     53                case 0xD: return codepoint <= 0xD7A3;
     54                default: return false;
     55        }
     56}
     57
     58bool is_XML10_NameChar_codepoint(int codepoint) {
     59        switch (codepoint >> 12) {
     60                case 0: return bit_test(NameChar_XML10_0000_11FF, codepoint);
     61                case 1: if (codepoint <= 0x11FF)
     62                                return bit_test(NameChar_XML10_0000_11FF, codepoint);
     63                        else if (codepoint < 0x1E00) return false;
     64                        else return bit_test(NameStrt_XML10_1E00_1FFF, codepoint & 0x1FF);
     65                case 2: if (codepoint > 0x2182) return false;
     66                        else return bit_test(NameChar_XML10_2000_21FF, codepoint & 0x1FF);
     67                case 3: if (codepoint > 0x312C) return false;
     68                        else return bit_test(NameChar_XML10_3000_31FF, codepoint & 0x1FF);
     69                case 4: return codepoint >= 0x4E00;
     70                case 5: case 6: case 7: case 8: return true;
     71                case 9: return codepoint <= 0x9FA5;
     72                case 0xA:       return codepoint >= 0xAC00;
     73                case 0xB: case 0xC: return true;
     74                case 0xD: return codepoint <= 0xD7A3;
     75                default: return false;
     76        }
     77}
     78
     79inline int XML_10_UTF8_NameStrt_bytes (unsigned char bytes[]) {
     80        if (bytes[0] <= 0x7F) {
     81                if (bit_test(NameStrt_XML10_0000_11FF, (int) bytes[0])) return 1;
     82                else return 0;
     83        }
     84        else if (bytes[0] <= 0xDF) {
     85                int codepoint = ((bytes[0] & 0x3F) << 6) | (bytes[1] & 0x3F);
     86                if (bit_test(NameStrt_XML10_0000_11FF, codepoint)) return 2;
     87                else return 0;
     88        }
     89        else if (bytes[0] <= 0xEF) {
     90                int codepoint = ((bytes[0] & 0x0F) << 12)| ((bytes[1] & 0x3F) << 6) | (bytes[2] & 0x3F);
     91                return is_XML10_NameStrt_codepoint(codepoint) ? 3 : 0;
     92        }
     93        else return 0;
     94}
     95
     96inline int XML_10_UTF8_NameChar_bytes (unsigned char bytes[]) {
     97        if (bytes[0] <= 0x7F) {
     98                if (bit_test(NameChar_XML10_0000_11FF, (int) bytes[0])) return 1;
     99                else return 0;
     100        }
     101        else if (bytes[0] <= 0xDF) {
     102                int codepoint = ((bytes[0] & 0x3F) << 6) | (bytes[1] & 0x3F);
     103                if (bit_test(NameChar_XML10_0000_11FF, codepoint)) return 2;
     104                else return 0;
     105        }
     106        else if (bytes[0] <= 0xEF) {
     107                int codepoint = ((bytes[0] & 0x0F) << 12)| ((bytes[1] & 0x3F) << 6) | (bytes[2] & 0x3F);
     108                return is_XML10_NameChar_codepoint(codepoint) ? 3 : 0;
     109        }
     110        else return 0;
     111}
    31112
    32113#define s2p_step(s0,s1,hi_mask,shift,p0,p1)  \
     
    100181  int errpos = 0;
    101182  int chars_read = 0;
     183  int check_pos = 0;
    102184  char srcbuf[BUFFER_SIZE+BLOCK_SIZE];
    103185 
     
    163245        }
    164246       
     247        if (bitblock_has_bit(simd_or(name_check,name_start_check))) {
     248          if(bitblock_has_bit(name_start_check)){
     249            check_pos = block_pos + count_forward_zeroes(name_start_check);
     250            if(XML_10_UTF8_NameStrt_bytes((unsigned char*)&srcbuf[check_pos]) == 0)
     251              fprintf(stderr, "name start error found at position %i\n",check_pos+buf_pos);
     252          }
     253          if(bitblock_has_bit(name_check)){
     254            check_pos = block_pos + count_forward_zeroes(name_start_check);
     255            if(XML_10_UTF8_NameStrt_bytes((unsigned char*)&srcbuf[check_pos]) == 0)
     256              fprintf(stderr, "name error found at position %i\n",check_pos+buf_pos);
     257          }
     258        }
     259       
    165260        block_pos += BLOCK_SIZE;
    166261      }
     
    190285        }
    191286       
     287        if (bitblock_has_bit(simd_or(name_check,name_start_check))) {
     288          if(bitblock_has_bit(name_start_check)){
     289            check_pos = block_pos + count_forward_zeroes(name_start_check);
     290            if(XML_10_UTF8_NameStrt_bytes((unsigned char*)&srcbuf[check_pos]) == 0)
     291              fprintf(stderr, "name start error found at position %i\n",check_pos+buf_pos);
     292          }
     293          if(bitblock_has_bit(name_check)){
     294            check_pos = block_pos + count_forward_zeroes(name_start_check);
     295            if(XML_10_UTF8_NameStrt_bytes((unsigned char*)&srcbuf[check_pos]) == 0)
     296              fprintf(stderr, "name error found at position %i\n",check_pos+buf_pos);
     297          }
     298        }
     299       
    192300        block_pos += BLOCK_SIZE;
    193301      }
Note: See TracChangeset for help on using the changeset viewer.