Changeset 15 for trunk/src/engine.c


Ignore:
Timestamp:
Jan 11, 2008, 6:16:25 AM (11 years ago)
Author:
cameron
Message:

Bytespace scanning in XML declarations; various updates

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/src/engine.c

    r14 r15  
    4343
    4444void ParsingEngine::InitLexer() {
    45   unsigned char * XML_signature = xml_buf->GetBytePtr(0);
    46   if (xml_buf->PrepareBytes(4) < 4) {
     45  unsigned char * sentinel = (unsigned char *) "<![--?>]]>/'>\"><!";
     46  if (xml_buf->PrepareBytes(0, 4) < 4) {
    4747    printf("No XML input document.\n");
    4848    exit(-1);
    4949  }
     50  unsigned char * XML_signature = xml_buf->BytePtr(0);
     51  xml_buf->InstallPadding(sentinel);
     52
    5053  Charset_Family family = Charset_Family_Detect(XML_signature);
    5154  switch (family) {
     
    6871      exit(-1);
    6972  }
    70   rel_EOF_pos = lex-> AdvanceBuffer(0);
     73  avail_code_units = lex-> AdvanceBuffer(0);
    7174}
    7275
     
    7780}
    7881
     82inline unsigned char ParsingEngine::CharAt(int offset) const {
     83  return ((unsigned char *) buf.x8data)[buffer_rel_pos+offset];
     84}
     85
    7986inline unsigned char * ParsingEngine::x8dataPtr(int offset) const {
    8087  return &((unsigned char *) buf.x8data)[buffer_rel_pos+offset];
     
    9198}
    9299
     100
    93101inline bool ParsingEngine::at_EOF () const {
    94   return buffer_rel_pos >= rel_EOF_pos;
     102  return (buffer_rel_pos >= avail_code_units) &&
     103         (avail_code_units < BUFFER_BLOCKS * BLOCKSIZE + LOOKAHEAD_POSITIONS);
    95104}
    96105
     
    104113#endif
    105114    AdvanceToNewBasePosn(buffer_rel_pos);
    106     rel_EOF_pos = lex->AdvanceBuffer(AbsPos());
    107   }
     115    avail_code_units = lex->AdvanceBuffer(AbsPos());
     116  }
     117#endif
     118}
     119
     120inline void ParsingEngine::ASCII_ScanTo(int item) {
     121#ifdef DEBUG_BYTESCAN
     122  int p1 = AbsPos();
     123#endif
     124  switch (item) {
     125    case NonWS: while (ASCII_WS_at(0)) Advance(1); break;
     126    case MarkupStart: while(!AtChar('<') && !AtChar('&') && !at_CDATA_End()) Advance(1); break;
     127    case CD_End_check: while(!at_CDATA_End()) Advance(1); break;
     128    case Hyphen: while(!AtChar('-')) Advance(1); break;
     129    case QMark: while(!AtChar('?')) Advance(1); break;
     130    case DQuote: while(!AtChar('<') && !AtChar('&') && !AtChar('"')) Advance(1); break;
     131    case SQuote: while(!AtChar('<') && !AtChar('&') && !AtChar('\'')) Advance(1); break;
     132    case NameFollow: while(!ASCII_WS_at(0) && !AtChar(';') && !AtChar('/') && !AtChar('>')
     133                      && !AtChar('=') && !AtChar('?')) Advance(1); break;
     134  }
     135#ifdef DEBUG_BYTESCAN
     136  printf("ASCII_ScanTo(%i) %i -> %i\n", item, p1, AbsPos());
    108137#endif
    109138}
    110139
    111140#ifndef OPTIMIZE_SHORT_SCAN
     141#ifdef BYTESPACE_SCAN
     142inline void ParsingEngine::ScanTo(int item) {
     143  ASCII_ScanTo(item);
     144}
     145#endif
     146
     147#ifndef BYTESPACE_SCAN
    112148inline void ParsingEngine::ScanTo(int item) {
    113149  buffer_rel_pos = bitstream_scan(buf.item_stream[item],
     
    119155#endif
    120156    AdvanceToNewBasePosn(buffer_rel_pos);
    121     rel_EOF_pos = lex->AdvanceBuffer(AbsPos());
     157    avail_code_units = lex->AdvanceBuffer(AbsPos());
    122158    buffer_rel_pos = bitstream_scan0(buf.item_stream[item]);
    123159  }
    124160}
    125161#endif
    126 
     162#endif
    127163
    128164#ifdef OPTIMIZE_SHORT_SCAN
     
    144180#endif
    145181      AdvanceToNewBasePosn(buffer_rel_pos);
    146       rel_EOF_pos = lex->AdvanceBuffer(AbsPos());
     182      avail_code_units = lex->AdvanceBuffer(AbsPos());
    147183      buffer_rel_pos = bitstream_scan0(buf.item_stream[item]);
    148184    }
     
    157193}
    158194
    159 inline bool ParsingEngine::S_at(int offset) const {
    160   // true for 0x09, 0x0A, 0x0D, 0x20: the XML1.0 space chars.
    161   return *(x8dataPtr(offset)) <= 0x20;
     195inline bool ParsingEngine::ASCII_WS_at(int offset) const {
     196  unsigned char ch = *(x8dataPtr(offset));
     197  return (ch == 0x20) || (ch == 0x0A) || (ch == 0x0D) || (ch == 0x09);
    162198}
    163199
     
    216252inline bool ParsingEngine::at_XmlDecl_start() const {
    217253  return (s5int64(x8dataPtr(0)) == c5int64('<', '?', 'x', 'm', 'l')) &&
    218          S_at(5);
     254         ASCII_WS_at(5);
    219255}
    220256
     
    579615  xml_info.has_version_decl = true;
    580616  Advance(6);
    581   ScanTo(NonWS);
     617  ASCII_ScanTo(NonWS);
    582618  if (!at_version()) {Error_action(decl_start, AbsPos()); return;}
    583619  Advance(7);
    584   ScanTo(NonWS);
     620  ASCII_ScanTo(NonWS);
    585621  if (!AtChar('=')) {Error_action(decl_start, AbsPos()); return;}
    586622  Advance(1);
    587   ScanTo(NonWS);
     623  ASCII_ScanTo(NonWS);
    588624  if (at_1_0()) xml_info.version = 0;
    589625  else if (at_1_1()) xml_info.version = 1;
     
    591627  Advance(5);
    592628  if (at_PI_End()) {Advance(2); return;}
    593   if (!S_at(0)) {Error_action(decl_start, AbsPos()); return;}
    594   ScanTo(NonWS);
     629  if (!ASCII_WS_at(0)) {Error_action(decl_start, AbsPos()); return;}
     630  ASCII_ScanTo(NonWS);
    595631  if (at_encoding()) {
    596632      xml_info.has_encoding_decl = true;
    597633      Advance(8);
    598       ScanTo(NonWS);
     634      ASCII_ScanTo(NonWS);
    599635      if (!AtChar('=')) {Error_action(decl_start, AbsPos()); return;}
    600636      Advance(1);
    601       ScanTo(NonWS);
     637      ASCII_ScanTo(NonWS);
    602638      xml_info.encoding_start_pos = AbsPos()+1;
    603639      if (AtChar('"')) {
     
    615651      Advance(1);
    616652      if (at_PI_End()) {Advance(2); return;}
    617       if (!S_at(0)) {Error_action(decl_start, AbsPos()); return;}
    618       ScanTo(NonWS);
     653      if (!ASCII_WS_at(0)) {Error_action(decl_start, AbsPos()); return;}
     654      ASCII_ScanTo(NonWS);
    619655  }
    620656  if (at_standalone()) {
    621657      xml_info.has_standalone_decl = true;
    622658      Advance(10);
    623       ScanTo(NonWS);
     659      ASCII_ScanTo(NonWS);
    624660      if (!AtChar('=')) {Error_action(decl_start, AbsPos()); return;}
    625661      Advance(1);
    626       ScanTo(NonWS);
     662      ASCII_ScanTo(NonWS);
    627663      if (at_yes()) {Advance(5); xml_info.standalone = true;}
    628664      else if (at_no()) {Advance(4); xml_info.standalone = false;}
    629665      else {Error_action(decl_start, AbsPos()); return;}
    630   }
    631   ScanTo(NonWS);
     666      ASCII_ScanTo(NonWS);
     667  }
    632668  if (at_PI_End()) {Advance(2); return;}
    633669  else {Error_action(decl_start, AbsPos()); return;}
     
    651687  // version information.
    652688  Advance(6);
    653   ScanTo(NonWS);
     689  ASCII_ScanTo(NonWS);
    654690  if (at_version()) {
    655691    xml_info.has_version_decl = true;
    656692    Advance(7);
    657     ScanTo(NonWS);
     693    ASCII_ScanTo(NonWS);
    658694    if (!AtChar('=')) {Error_action(decl_start, AbsPos()); return;}
    659695    Advance(1);
    660     ScanTo(NonWS);
     696    ASCII_ScanTo(NonWS);
    661697    if (at_1_0()) xml_info.version = 0;
    662698    else if (at_1_1()) xml_info.version = 1;
     
    664700    Advance(5);
    665701    // Must have whitespace character before declaration.
    666     if (!S_at(0)) {Error_action(decl_start, AbsPos()); return;}
    667     ScanTo(NonWS);
     702    if (!ASCII_WS_at(0)) {Error_action(decl_start, AbsPos()); return;}
     703    ASCII_ScanTo(NonWS);
    668704  }
    669705  if (!at_encoding()) {Error_action(decl_start, AbsPos()); return;}
    670706  xml_info.has_encoding_decl = true;
    671707  Advance(8);
    672   ScanTo(NonWS);
     708  ASCII_ScanTo(NonWS);
    673709  if (!AtChar('=')) {Error_action(decl_start, AbsPos()); return;}
    674710  Advance(1);
    675   ScanTo(NonWS);
     711  ASCII_ScanTo(NonWS);
    676712  xml_info.encoding_start_pos = AbsPos()+1;
    677713  if (AtChar('"')) {
     
    688724  xml_info.encoding_end_pos = AbsPos();
    689725  Advance(1);
    690   ScanTo(NonWS);
     726  ASCII_ScanTo(NonWS);
    691727  if (at_PI_End()) {Advance(2); return;}
    692728  else {Error_action(decl_start, AbsPos()); return;}
Note: See TracChangeset for help on using the changeset viewer.