Legend:
- Unmodified
- Added
- Removed
-
trunk/src/bitlex.c
r128 r129 545 545 546 546 547 /* Stub out Charset Validation initially. */548 549 547 void UTF_8_Lexer::Do_CharsetValidation() { 550 //printf("UTF_8_Lexer::Do_CharsetValidation not yet implemented; assuming OK.\n"); 548 BitBlock u8prefix, u8suffix, u8prefix2, u8prefix3or4, u8prefix3, u8prefix4; 549 BitBlock error_mask; 550 /* UTF-8 sequences may cross block boundaries. If a 551 prefix is found near the end of a block that requires 552 one or more suffixes in the next block, then 553 prefix_pending is set to mark the positions. 554 However, at the beginning of the buffer, no suffixes 555 are expected, so this value is initialized to zeroes. */ 556 BitBlock prefix_pending = simd_const_1(0); 557 /* If a suffix is pending, then it may involve one of 558 the special case prefixes E0, ED, F0, F4. */ 559 BitBlock E0ED_pending = simd_const_1(0); 560 BitBlock F0F4_pending = simd_const_1(0); 561 BitBlock bit5_pending = simd_const_1(0); 562 563 /* Temporary variables used within the block. */ 564 BitBlock suffix_required_scope; 565 BitBlock prefix_E0ED, E0ED_scope, bit5_scope, E0ED_constraint; 566 BitBlock prefix_F5FF, prefix_F0F4, F0F4_scope, F0F4_constraint; 567 551 568 for (int i = 0; i < buffer_blocks; i++) { 552 569 validation_stream[i] = simd_const_1(0); 570 /* If there is no pending suffix and no bit 0, then there 571 are no possible validation issues for this block. */ 572 if (!bitblock_has_bit(simd_or(prefix_pending, x8basis[i].bit[0]))) 573 continue; 574 /* Compute classifications of UTF-8 bytes. */ 575 u8prefix = simd_and(x8basis[i].bit[0], x8basis[i].bit[1]); 576 u8suffix = simd_andc(x8basis[i].bit[0], x8basis[i].bit[1]); 577 u8prefix3or4 = simd_and(u8prefix, x8basis[i].bit[2]); 578 u8prefix2 = simd_andc(u8prefix, x8basis[i].bit[2]); 579 u8prefix3 = simd_andc(u8prefix3or4, x8basis[i].bit[3]); 580 u8prefix4 = simd_and(u8prefix3or4, x8basis[i].bit[3]); 581 582 /* Initiate validation for two-byte sequences. 
*/ 583 error_mask = simd_andc(u8prefix2, 584 simd_or(simd_or(x8basis[i].bit[3], x8basis[i].bit[4]), 585 simd_or(x8basis[i].bit[5], x8basis[i].bit[6]))); 586 suffix_required_scope = simd_or(prefix_pending, sisd_sfli(u8prefix, 1)); 587 588 prefix_pending = sisd_sbli(u8prefix, BLOCKSIZE - 1); 589 E0ED_scope = E0ED_pending; 590 F0F4_scope = F0F4_pending; 591 bit5_scope = bit5_pending; 592 E0ED_pending = simd_const_1(0); 593 F0F4_pending = simd_const_1(0); 594 bit5_pending = simd_const_1(0); 595 596 if (bitblock_has_bit(u8prefix3or4)) { 597 /* Extend validation for errors in three-byte sequences. */ 598 suffix_required_scope = simd_or(suffix_required_scope, 599 sisd_sfli(u8prefix3or4, 2)); 600 prefix_pending = simd_or(prefix_pending, 601 sisd_sbli(u8prefix3or4, BLOCKSIZE - 2)); 602 bit5_scope = simd_or(bit5_scope, sisd_sfli(x8basis[i].bit[5], 1)); 603 bit5_pending = sisd_sbli(x8basis[i].bit[5], BLOCKSIZE - 1); 604 prefix_E0ED = simd_andc(u8prefix3, 605 simd_or(simd_or(x8basis[i].bit[6], 606 simd_xor(x8basis[i].bit[4], x8basis[i].bit[7])), 607 simd_xor(x8basis[i].bit[4], x8basis[i].bit[5]))); 608 E0ED_scope = simd_or(E0ED_scope, sisd_sfli(prefix_E0ED, 1)); 609 E0ED_pending = sisd_sbli(prefix_E0ED, BLOCKSIZE - 1); 610 if (bitblock_has_bit(u8prefix4)) { 611 /* Extend validation for errors in four-byte sequences. 
*/ 612 suffix_required_scope = simd_or(suffix_required_scope, 613 sisd_sfli(u8prefix4, 3)); 614 prefix_pending = simd_or(prefix_pending, 615 sisd_sbli(u8prefix4, BLOCKSIZE - 3)); 616 prefix_F5FF = simd_and(u8prefix4, 617 simd_or(x8basis[i].bit[4], 618 simd_and(x8basis[i].bit[5], 619 simd_or(x8basis[i].bit[6], x8basis[i].bit[7])))); 620 error_mask = simd_or(error_mask, prefix_F5FF); 621 prefix_F0F4 = simd_andc(u8prefix4, 622 simd_or(x8basis[i].bit[4], 623 simd_or(x8basis[i].bit[6], x8basis[i].bit[7]))); 624 F0F4_scope = simd_or(F0F4_scope, sisd_sfli(prefix_F0F4, 1)); 625 F0F4_pending = sisd_sbli(prefix_F0F4, BLOCKSIZE - 1); 626 } 627 } 628 E0ED_constraint = simd_xor(bit5_scope, x8basis[i].bit[2]); 629 error_mask = simd_or(error_mask, simd_andc(E0ED_scope, E0ED_constraint)); 630 #ifdef DEBUG_UTF8_VALIDATION 631 print_bit_block("error_mask at E0ED", error_mask); 632 #endif 633 F0F4_constraint = simd_xor(bit5_scope, 634 simd_or(x8basis[i].bit[2], x8basis[i].bit[3])); 635 error_mask = simd_or(error_mask, simd_andc(F0F4_scope, F0F4_constraint)); 636 #ifdef DEBUG_UTF8_VALIDATION 637 print_bit_block("error_mask at F0F4", error_mask); 638 #endif 639 /* Complete validation by checking for prefix-suffix mismatches. 
*/ 640 error_mask = simd_or(error_mask, simd_xor(suffix_required_scope, u8suffix)); 641 #ifdef DEBUG_UTF8_VALIDATION 642 print_bit_block("error_mask at suffix_match", error_mask); 643 #endif 644 validation_stream[i] = error_mask; 645 #ifdef DEBUG_UTF8_VALIDATION 646 // if (bitblock_has_bit(error_mask)) { 647 printf("-%i----------------------\n", i); 648 print_bit_block("x8basis[i].bit[0]", x8basis[i].bit[0]); 649 print_bit_block("x8basis[i].bit[1]", x8basis[i].bit[1]); 650 print_bit_block("x8basis[i].bit[2]", x8basis[i].bit[2]); 651 print_bit_block("x8basis[i].bit[3]", x8basis[i].bit[3]); 652 print_bit_block("u8prefix2", u8prefix2); 653 print_bit_block("u8prefix3", u8prefix3); 654 print_bit_block("u8prefix4", u8prefix4); 655 print_bit_block("suffix_required_scope", suffix_required_scope); 656 print_bit_block("prefix_pending", prefix_pending); 657 print_bit_block("E0ED_pending", E0ED_pending); 658 print_bit_block("F0F4_pending", F0F4_pending); 659 print_bit_block("bit5_pending", bit5_pending); 660 print_bit_block("error_mask", error_mask); 661 662 //} 663 #endif 553 664 } 554 665 }; … … 860 971 861 972 862 void Lexer_Interface::AnalyzeBuffer(BitBlockBasis * basis, int base_pos, int buffer_limit_pos) {973 void Lexer_Interface::AnalyzeBuffer(BitBlockBasis * basis, int base_pos, int start_pos, int buffer_limit_pos) { 863 974 #ifdef DEBUG 864 975 printf("Entered AnalyzeBuffer, buffer_limit_pos = %i\n", buffer_limit_pos); … … 867 978 lexer_base_pos = base_pos; /* for error reporting. 
*/ 868 979 int err_pos; 980 buffer_blocks = (buffer_limit_pos + BLOCKSIZE - 1)/BLOCKSIZE; 981 buffer_units = buffer_limit_pos; 982 #ifdef CODE_CLOCKING 983 /* 984 start_Interval(char_validation_clocker); 985 */ 986 #endif 987 Do_CharsetValidation(); 988 /* Ignore error bits before start_pos which only arise 989 due to UTF8 pending scope streams at buffer boundaries.*/ 990 err_pos = bitstream_scan(validation_stream, start_pos); 991 /* Detect validation errors up to the end of file plus one more 992 position in case there is an incomplete code unit at EOF. */ 993 if ((err_pos <= buffer_units) && (err_pos < BUFFER_SIZE)) { 994 // printf("start_pos =%i\n, err_pos = %i\n", start_pos, err_pos); 995 // print_bit_block("validation_stream[0]", validation_stream[0]); 996 997 // print_bit_block("validation_stream[err_pos/128]", validation_stream[err_pos/128]); 998 999 CharSetValidationError((char *) entity_Info->encoding, lexer_base_pos + err_pos); 1000 } 1001 #ifdef CODE_CLOCKING 1002 /* 1003 end_Interval(char_validation_clocker, buffer_blocks * BLOCKSIZE); 1004 */ 1005 #endif 869 1006 #ifdef CODE_CLOCKING 870 1007 /* … … 872 1009 */ 873 1010 #endif 874 buffer_blocks = (buffer_limit_pos + BLOCKSIZE - 1)/BLOCKSIZE;875 buffer_units = buffer_limit_pos;876 1011 877 1012 if (entity_Info->version == XML_1_1) Do_XML_11_WS_Control(); … … 899 1034 /* 900 1035 end_Interval(MarkupStreams_clocker, BUFFER_SIZE); 901 */902 #endif903 #ifdef CODE_CLOCKING904 /*905 start_Interval(char_validation_clocker);906 */907 #endif908 Do_CharsetValidation();909 err_pos = bitstream_scan0(validation_stream);910 if (err_pos < buffer_units) {911 CharSetValidationError((char *) entity_Info->encoding, lexer_base_pos + err_pos);912 }913 #ifdef CODE_CLOCKING914 /*915 end_Interval(char_validation_clocker, buffer_blocks * BLOCKSIZE);916 1036 */ 917 1037 #endif -
trunk/src/bitlex.h
r128 r129 64 64 Lexer_Interface(Entity_Info * e, LexicalStreamSet *l); 65 65 ~Lexer_Interface(); 66 void AnalyzeBuffer(BitBlockBasis * x8basis, int base_pos, int buffer_limit_pos);66 void AnalyzeBuffer(BitBlockBasis * x8basis, int base_pos, int start_pos, int buffer_limit_pos); 67 67 68 68 protected: -
trunk/src/engine.c
r128 r129 172 172 lexer = Lexer<C>::LexerFactory(e, buf); 173 173 bitplex->TransposeToBitStreams(byteplex->x8data, blocks_in_buffer); 174 lexer->AnalyzeBuffer(bitplex->x8basis, buffer_base_pos, buffer_limit_pos);174 lexer->AnalyzeBuffer(bitplex->x8basis, buffer_base_pos, buffer_rel_pos, buffer_limit_pos); 175 175 } 176 176 … … 188 188 template <CodeUnit_Base C> 189 189 inline void ParsingEngine<C>::AdvanceBuffers(int preserve_pos){ 190 190 191 int advance_amt = min(preserve_pos, text_or_markup_start) - buffer_base_pos; 191 192 advance_amt &= -PACKSIZE; // maintain alignment … … 198 199 byteplex->PreparePseudoASCII_Stream(); 199 200 bitplex->TransposeToBitStreams(byteplex->x8data, blocks_in_buffer); 200 lexer->AnalyzeBuffer(bitplex->x8basis, buffer_base_pos, buffer_limit_pos);201 lexer->AnalyzeBuffer(bitplex->x8basis, buffer_base_pos, buffer_rel_pos, buffer_limit_pos); 201 202 } 202 203 … … 250 251 buffer_rel_pos); 251 252 while (buffer_rel_pos >= BUFFER_SIZE) { 253 while (at_UTF8_suffix(cur())) buffer_rel_pos--; 252 254 FinalizeBuffer_action(preserve_pos); 253 255 AdvanceBuffers(preserve_pos); … … 257 259 #endif 258 260 261 inline bool at_UTF8_suffix(unsigned char x8data[]) { 262 unsigned char code_unit = x8data[0]; 263 return ((code_unit >= 0x80) & (code_unit < 0xC0)); 264 } 265 259 266 template <CodeUnit_Base C> 260 267 inline void ParsingEngine<C>::ScanToMarkupStart() { … … 262 269 text_or_markup_start = AbsPos(); 263 270 buffer_rel_pos = bitstream_scan(buf->item_stream[MarkupStart], buffer_rel_pos); 264 while (buffer_rel_pos >= BUFFER_SIZE) { 271 while (buffer_rel_pos >= BUFFER_SIZE) { 272 while (at_UTF8_suffix(cur())) buffer_rel_pos--; 265 273 Text_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start)); 266 274 text_or_markup_start = AbsPos(); … … 276 284 buffer_rel_pos = bitstream_scan(buf->item_stream[CD_End_check], buffer_rel_pos); 277 285 while (buffer_rel_pos >= BUFFER_SIZE) { 286 while (at_UTF8_suffix(cur())) buffer_rel_pos--; 
278 287 Text_action(GetCodeUnitPtr(text_or_markup_start), LengthFrom(text_or_markup_start)); 279 288 text_or_markup_start = AbsPos(); … … 298 307 while (buffer_rel_pos >= BUFFER_BLOCKS * BLOCKSIZE) { 299 308 buffer_rel_pos = BUFFER_BLOCKS * BLOCKSIZE; 309 while (at_UTF8_suffix(cur())) buffer_rel_pos--; 300 310 FinalizeBuffer_action(preserve_pos); 301 311 AdvanceBuffers(preserve_pos);
Note: See TracChangeset
for help on using the changeset viewer.