Changeset 129


Ignore:
Timestamp:
May 4, 2008, 5:17:12 AM (11 years ago)
Author:
cameron
Message:

UTF-8 validation.

Location:
trunk/src
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • trunk/src/bitlex.c

    r128 r129  
    545545
    546546
    547 /* Stub out Charset Validation initially. */
    548 
    549547void UTF_8_Lexer::Do_CharsetValidation() {
    550         //printf("UTF_8_Lexer::Do_CharsetValidation not yet implemented; assuming OK.\n");
     548        BitBlock u8prefix, u8suffix, u8prefix2, u8prefix3or4, u8prefix3, u8prefix4;
     549        BitBlock error_mask;
     550        /*  UTF-8 sequences may cross block boundaries.  If a
     551            prefix is found near the end of a block that requires
     552            one or more suffixes in the next block, then
     553            prefix_pending is set to mark the positions.
     554            However, at the beginning of the buffer, no suffixes
     555            are expected, so this value is initialized to zeroes. */
     556        BitBlock prefix_pending = simd_const_1(0);
     557        /*  If a suffix is pending, then it may involve one of
     558            the special case prefixes E0, ED, F0, F4. */
     559        BitBlock E0ED_pending = simd_const_1(0);
     560        BitBlock F0F4_pending = simd_const_1(0);
     561        BitBlock bit5_pending = simd_const_1(0);
     562
     563        /* Temporary variables used within the block. */
     564        BitBlock suffix_required_scope;
     565        BitBlock prefix_E0ED, E0ED_scope, bit5_scope, E0ED_constraint; 
     566        BitBlock prefix_F5FF, prefix_F0F4, F0F4_scope, F0F4_constraint;
     567
    551568        for (int i = 0; i < buffer_blocks; i++) {
    552569                validation_stream[i] = simd_const_1(0);
     570                /* If there is no pending suffix and no bit 0, then there
     571                   are no possible validation issues for this block. */
     572                if (!bitblock_has_bit(simd_or(prefix_pending, x8basis[i].bit[0])))
     573                        continue;
     574                /*  Compute classifications of UTF-8 bytes. */
     575                u8prefix = simd_and(x8basis[i].bit[0], x8basis[i].bit[1]);
     576                u8suffix = simd_andc(x8basis[i].bit[0], x8basis[i].bit[1]);
     577                u8prefix3or4 = simd_and(u8prefix, x8basis[i].bit[2]);
     578                u8prefix2 = simd_andc(u8prefix, x8basis[i].bit[2]);
     579                u8prefix3 = simd_andc(u8prefix3or4, x8basis[i].bit[3]);
     580                u8prefix4 = simd_and(u8prefix3or4, x8basis[i].bit[3]);
     581
     582                /*  Initiate validation for two-byte sequences. */
     583                error_mask = simd_andc(u8prefix2,
     584                                        simd_or(simd_or(x8basis[i].bit[3], x8basis[i].bit[4]),
     585                                                simd_or(x8basis[i].bit[5], x8basis[i].bit[6])));
     586                suffix_required_scope = simd_or(prefix_pending, sisd_sfli(u8prefix, 1));
     587
     588                prefix_pending = sisd_sbli(u8prefix, BLOCKSIZE - 1);
     589                E0ED_scope = E0ED_pending;
     590                F0F4_scope = F0F4_pending;
     591                bit5_scope = bit5_pending;
     592                E0ED_pending = simd_const_1(0);
     593                F0F4_pending = simd_const_1(0);
     594                bit5_pending = simd_const_1(0);
     595
     596                if (bitblock_has_bit(u8prefix3or4)) {
     597                        /*  Extend validation for errors in three-byte sequences. */
     598                        suffix_required_scope = simd_or(suffix_required_scope,
     599                                                        sisd_sfli(u8prefix3or4, 2));
     600                        prefix_pending = simd_or(prefix_pending,
     601                                                 sisd_sbli(u8prefix3or4, BLOCKSIZE - 2));
     602                        bit5_scope = simd_or(bit5_scope, sisd_sfli(x8basis[i].bit[5], 1));
     603                        bit5_pending = sisd_sbli(x8basis[i].bit[5], BLOCKSIZE - 1);
     604                        prefix_E0ED = simd_andc(u8prefix3,
     605                                                simd_or(simd_or(x8basis[i].bit[6],
     606                                                                simd_xor(x8basis[i].bit[4], x8basis[i].bit[7])),
     607                                                        simd_xor(x8basis[i].bit[4], x8basis[i].bit[5])));
     608                        E0ED_scope = simd_or(E0ED_scope, sisd_sfli(prefix_E0ED, 1));
     609                        E0ED_pending = sisd_sbli(prefix_E0ED, BLOCKSIZE - 1);
     610                        if (bitblock_has_bit(u8prefix4)) {
     611                                /*  Extend validation for errors in four-byte sequences. */
     612                                suffix_required_scope = simd_or(suffix_required_scope,
     613                                                                sisd_sfli(u8prefix4, 3));
     614                                prefix_pending = simd_or(prefix_pending,
     615                                                         sisd_sbli(u8prefix4, BLOCKSIZE - 3));
     616                                prefix_F5FF = simd_and(u8prefix4,
     617                                                       simd_or(x8basis[i].bit[4],
     618                                                                simd_and(x8basis[i].bit[5],
     619                                                                        simd_or(x8basis[i].bit[6], x8basis[i].bit[7]))));
     620                                error_mask = simd_or(error_mask, prefix_F5FF);
     621                                prefix_F0F4 = simd_andc(u8prefix4,
     622                                                        simd_or(x8basis[i].bit[4],
     623                                                                simd_or(x8basis[i].bit[6], x8basis[i].bit[7])));
     624                                F0F4_scope = simd_or(F0F4_scope, sisd_sfli(prefix_F0F4, 1));
     625                                F0F4_pending = sisd_sbli(prefix_F0F4, BLOCKSIZE - 1);
     626                        }
     627                }
     628                E0ED_constraint = simd_xor(bit5_scope, x8basis[i].bit[2]);
     629                error_mask = simd_or(error_mask, simd_andc(E0ED_scope, E0ED_constraint));
     630#ifdef DEBUG_UTF8_VALIDATION
     631                print_bit_block("error_mask at E0ED", error_mask);
     632#endif
     633                F0F4_constraint = simd_xor(bit5_scope,
     634                                           simd_or(x8basis[i].bit[2], x8basis[i].bit[3]));
     635                error_mask = simd_or(error_mask, simd_andc(F0F4_scope, F0F4_constraint));
     636#ifdef DEBUG_UTF8_VALIDATION
     637                print_bit_block("error_mask at F0F4", error_mask);
     638#endif
     639                /*  Complete validation by checking for prefix-suffix mismatches. */
     640                error_mask = simd_or(error_mask, simd_xor(suffix_required_scope, u8suffix));
     641#ifdef DEBUG_UTF8_VALIDATION
     642                print_bit_block("error_mask at suffix_match", error_mask);
     643#endif
     644                validation_stream[i] = error_mask;
     645#ifdef DEBUG_UTF8_VALIDATION
     646//              if (bitblock_has_bit(error_mask)) {
     647printf("-%i----------------------\n", i);
     648print_bit_block("x8basis[i].bit[0]", x8basis[i].bit[0]);
     649print_bit_block("x8basis[i].bit[1]", x8basis[i].bit[1]);
     650print_bit_block("x8basis[i].bit[2]", x8basis[i].bit[2]);
     651print_bit_block("x8basis[i].bit[3]", x8basis[i].bit[3]);
     652        print_bit_block("u8prefix2", u8prefix2);
     653        print_bit_block("u8prefix3", u8prefix3);
     654        print_bit_block("u8prefix4", u8prefix4);
     655        print_bit_block("suffix_required_scope", suffix_required_scope);
     656        print_bit_block("prefix_pending", prefix_pending);
     657        print_bit_block("E0ED_pending", E0ED_pending);
     658        print_bit_block("F0F4_pending", F0F4_pending);
     659        print_bit_block("bit5_pending", bit5_pending);
     660                print_bit_block("error_mask", error_mask);
     661
     662//}
     663#endif
    553664        }
    554665};
     
    860971
    861972
    862 void Lexer_Interface::AnalyzeBuffer(BitBlockBasis * basis, int base_pos, int buffer_limit_pos) {
     973void Lexer_Interface::AnalyzeBuffer(BitBlockBasis * basis, int base_pos, int start_pos, int buffer_limit_pos) {
    863974#ifdef DEBUG
    864975        printf("Entered AnalyzeBuffer, buffer_limit_pos = %i\n", buffer_limit_pos);
     
    867978        lexer_base_pos = base_pos; /* for error reporting. */
    868979        int err_pos;
     980        buffer_blocks = (buffer_limit_pos + BLOCKSIZE - 1)/BLOCKSIZE;
     981        buffer_units = buffer_limit_pos;
     982#ifdef CODE_CLOCKING
     983/*
     984        start_Interval(char_validation_clocker);
     985*/
     986#endif
     987        Do_CharsetValidation();
     988        /* Ignore error bits before start_pos which only arise
     989           due to UTF8 pending scope streams at buffer boundaries.*/
     990        err_pos = bitstream_scan(validation_stream, start_pos);
     991        /* Detect validation errors up to the end of file plus one more
     992           position in case there is an incomplete code unit at EOF. */
     993        if ((err_pos <= buffer_units) && (err_pos < BUFFER_SIZE)) {
     994//              printf("start_pos =%i\n, err_pos = %i\n", start_pos, err_pos);
     995//              print_bit_block("validation_stream[0]", validation_stream[0]);
     996
     997//              print_bit_block("validation_stream[err_pos/128]", validation_stream[err_pos/128]);
     998
     999                CharSetValidationError((char *) entity_Info->encoding, lexer_base_pos + err_pos);
     1000        }
     1001#ifdef CODE_CLOCKING
     1002/*
     1003        end_Interval(char_validation_clocker, buffer_blocks * BLOCKSIZE);
     1004*/
     1005#endif
    8691006#ifdef CODE_CLOCKING
    8701007/*
     
    8721009*/
    8731010#endif
    874         buffer_blocks = (buffer_limit_pos + BLOCKSIZE - 1)/BLOCKSIZE;
    875         buffer_units = buffer_limit_pos;
    8761011
    8771012        if (entity_Info->version == XML_1_1) Do_XML_11_WS_Control();
     
    8991034/*
    9001035        end_Interval(MarkupStreams_clocker, BUFFER_SIZE);
    901 */
    902 #endif
    903 #ifdef CODE_CLOCKING
    904 /*
    905         start_Interval(char_validation_clocker);
    906 */
    907 #endif
    908         Do_CharsetValidation();
    909         err_pos = bitstream_scan0(validation_stream);
    910         if (err_pos < buffer_units) {
    911                 CharSetValidationError((char *) entity_Info->encoding, lexer_base_pos + err_pos);
    912         }
    913 #ifdef CODE_CLOCKING
    914 /*
    915         end_Interval(char_validation_clocker, buffer_blocks * BLOCKSIZE);
    9161036*/
    9171037#endif
  • trunk/src/bitlex.h

    r128 r129  
    6464        Lexer_Interface(Entity_Info * e, LexicalStreamSet *l);
    6565        ~Lexer_Interface();
    66         void AnalyzeBuffer(BitBlockBasis * x8basis, int base_pos, int buffer_limit_pos);
     66        void AnalyzeBuffer(BitBlockBasis * x8basis, int base_pos, int start_pos, int buffer_limit_pos);
    6767
    6868protected:
  • trunk/src/engine.c

    r128 r129  
    172172        lexer = Lexer<C>::LexerFactory(e, buf);
    173173        bitplex->TransposeToBitStreams(byteplex->x8data, blocks_in_buffer);
    174         lexer->AnalyzeBuffer(bitplex->x8basis, buffer_base_pos, buffer_limit_pos);
     174        lexer->AnalyzeBuffer(bitplex->x8basis, buffer_base_pos, buffer_rel_pos, buffer_limit_pos);
    175175}
    176176
     
    188188template <CodeUnit_Base C>
    189189inline void ParsingEngine<C>::AdvanceBuffers(int preserve_pos){
     190
    190191        int advance_amt = min(preserve_pos, text_or_markup_start) - buffer_base_pos;
    191192        advance_amt &= -PACKSIZE; // maintain alignment
     
    198199        byteplex->PreparePseudoASCII_Stream();
    199200        bitplex->TransposeToBitStreams(byteplex->x8data, blocks_in_buffer);
    200         lexer->AnalyzeBuffer(bitplex->x8basis, buffer_base_pos, buffer_limit_pos);
     201        lexer->AnalyzeBuffer(bitplex->x8basis, buffer_base_pos, buffer_rel_pos, buffer_limit_pos);
    201202}
    202203
     
    250251                                      buffer_rel_pos);
    251252  while (buffer_rel_pos >= BUFFER_SIZE) {
     253        while (at_UTF8_suffix(cur())) buffer_rel_pos--;
    252254        FinalizeBuffer_action(preserve_pos);
    253255        AdvanceBuffers(preserve_pos);
     
    257259#endif
    258260
     261inline bool at_UTF8_suffix(unsigned char x8data[]) {
     262        unsigned char code_unit = x8data[0];
     263        return ((code_unit >= 0x80) & (code_unit < 0xC0));
     264}
     265
    259266template <CodeUnit_Base C>
    260267inline void ParsingEngine<C>::ScanToMarkupStart() {
     
    262269        text_or_markup_start = AbsPos();
    263270        buffer_rel_pos = bitstream_scan(buf->item_stream[MarkupStart], buffer_rel_pos);
    264         while (buffer_rel_pos >= BUFFER_SIZE) {
     271        while (buffer_rel_pos >= BUFFER_SIZE) {
     272                while (at_UTF8_suffix(cur())) buffer_rel_pos--;
    265273                Text_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
    266274                text_or_markup_start = AbsPos();
     
    276284        buffer_rel_pos = bitstream_scan(buf->item_stream[CD_End_check], buffer_rel_pos);
    277285        while (buffer_rel_pos >= BUFFER_SIZE) {
     286                while (at_UTF8_suffix(cur())) buffer_rel_pos--;
    278287                Text_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start));
    279288                text_or_markup_start = AbsPos();
     
    298307    while (buffer_rel_pos >= BUFFER_BLOCKS * BLOCKSIZE) {
    299308      buffer_rel_pos = BUFFER_BLOCKS * BLOCKSIZE;
     309      while (at_UTF8_suffix(cur())) buffer_rel_pos--;
    300310      FinalizeBuffer_action(preserve_pos);
    301311          AdvanceBuffers(preserve_pos);
Note: See TracChangeset for help on using the changeset viewer.