Changeset 15 for trunk/src


Ignore:
Timestamp:
Jan 11, 2008, 6:16:25 AM (11 years ago)
Author:
cameron
Message:

Bytespace scanning in XML declarations; various updates

Location:
trunk/src
Files:
8 edited

Legend:

Unmodified
Added
Removed
  • trunk/src/bitlex.c

    r14 r15  
    1313#include "transpose.h"
    1414
    15 
     15#include "stdlib.h"
    1616
    1717Lexer::Lexer(XML_Buffer *b, ParallelStreamSet *p) {
    1818  xml_buf = b;
     19  if (posix_memalign((void **) &bit_group, sizeof(BitBlock), sizeof(BitBlockGroup)) != 0) {
     20    printf("Allocation failure for local BitBlockGroup in Lexer\n");
     21    exit(-1);
     22  }
     23  for (int i = 0; i < 8; i++) (*bit_group)[0].bit[i] = simd_const_1(0);
    1924  parsing_engine_data = p;
    2025};
     
    105110/* A temporary structure for internal use in ComputeLexicalItemStreams. */
    106111typedef struct {
    107   BitBlock bit[8];
    108112  BitBlock LexicalItems[LexicalItemCount];
    109113} LexicalItemBlock;
     
    113117  LexicalItemBlock lx_blk[BUFFER_BLOCKS];
    114118  for (int i = 0; i < new_blocks; i++) {
    115     s2p_bytepack(&(parsing_engine_data->x8data[i * 8]), lx_blk[i].bit);
    116     ComputeLexicalItemBlocks(lx_blk[i].bit, lx_blk[i].LexicalItems);
     119    s2p_bytepack(&(parsing_engine_data->x8data[i * 8]), (*bit_group)[i+1].bit);
     120    ComputeLexicalItemBlocks((*bit_group)[i+1].bit, lx_blk[i].LexicalItems);
    117121  }
    118122#ifdef BUFFER_PROFILING
  • trunk/src/bitlex.h

    r14 r15  
    4646
    4747enum lexical_item {
    48   MarkupStart, CD_End_check, Hyphen, QMark, DQuote, SQuote,
    49   NonWS, NameFollow};
     48  minLexicalItem = 0,
     49  MarkupStart = minLexicalItem, CD_End_check, Hyphen, QMark,
     50  DQuote, SQuote, NonWS, NameFollow,
     51  maxLexicalItem = NameFollow};
    5052
    51 const int LexicalItemCount = NameFollow + 1;
     53const int LexicalItemCount = maxLexicalItem + 1;
    5254
    5355/* The principal role of the lexical analyzer is to prepare
     
    7072   
    7173*/
    72 const int LOOKAHEAD_POSITIONS = 8;
     74const int LOOKAHEAD_POSITIONS = 12;
    7375const int LOOKAHEAD_PACKS = (LOOKAHEAD_POSITIONS+PACKSIZE-1)/PACKSIZE;
    7476const int SENTINEL_BLOCKS = 1;
     
    7779        BytePack x8data[BUFFER_PACKS+LOOKAHEAD_PACKS];
    7880        BitBlock item_stream[LexicalItemCount][BUFFER_BLOCKS+SENTINEL_BLOCKS];
     81};
     82
     83struct BitBlockGroup {
     84  BitBlock bit[8];
    7985};
    8086
     
    9399protected:
    94100   XML_Buffer *xml_buf;
     101   BitBlockGroup *(bit_group[BUFFER_BLOCKS+1]);
    95102   ParallelStreamSet *parsing_engine_data;
    96103   void ComputeLexicalItemStreams(int newblocks);
  • trunk/src/charsets/ext_ascii_16.c

    r8 r15  
    2828
    2929int Ext_ASCII_16BE_Lexer::AdvanceBuffer(int newpos) {
    30   unsigned char * buf_ptr = xml_buf->GetBytePtr(newpos * X16_CODE_UNIT_BYTES);
    31   int avail_bytes = xml_buf->PrepareBytes(X16_BYTE_STREAM_SIZE);
     30  int byte_pos = newpos * X16_CODE_UNIT_BYTES;
     31  int avail_bytes = xml_buf->PrepareBytes(byte_pos, X16_BYTE_STREAM_SIZE);
     32  unsigned char * buf_ptr = xml_buf->BytePtr(byte_pos);
    3233  int new_blocks = min(BUFFER_BLOCKS,
    3334                       (avail_bytes/X16_CODE_UNIT_BYTES + BLOCKSIZE - 1)/BLOCKSIZE);
     
    6465
    6566int Ext_ASCII_16LE_Lexer::AdvanceBuffer(int newpos) {
    66   unsigned char * buf_ptr = xml_buf->GetBytePtr(newpos * X16_CODE_UNIT_BYTES);
    67   int avail_bytes = xml_buf->PrepareBytes(X16_BYTE_STREAM_SIZE);
     67  int byte_pos = newpos * X16_CODE_UNIT_BYTES;
     68  int avail_bytes = xml_buf->PrepareBytes(byte_pos, X16_BYTE_STREAM_SIZE);
     69  unsigned char * buf_ptr = xml_buf->BytePtr(byte_pos);
    6870  int new_blocks = min(BUFFER_BLOCKS,
    6971                       (avail_bytes/X16_CODE_UNIT_BYTES + BLOCKSIZE - 1)/BLOCKSIZE);
  • trunk/src/charsets/ext_ascii_8.c

    r8 r15  
    1111
    1212int Ext_ASCII_8_Lexer::AdvanceBuffer(int newpos) {
    13   unsigned char * buf_ptr = xml_buf->GetBytePtr(newpos * CODE_UNIT_BYTES);
    14   int avail_bytes = xml_buf->PrepareBytes(BYTE_STREAM_SIZE);
     13  int byte_pos = newpos * CODE_UNIT_BYTES;
     14  int avail_bytes = xml_buf->PrepareBytes(byte_pos, BYTE_STREAM_SIZE);
     15  unsigned char * buf_ptr = xml_buf->BytePtr(byte_pos);
    1516  int new_blocks = min(BUFFER_BLOCKS, (avail_bytes + BLOCKSIZE - 1)/BLOCKSIZE);
    1617  int new_packs = new_blocks * 8 + 1;
  • trunk/src/engine.c

    r14 r15  
    4343
    4444void ParsingEngine::InitLexer() {
    45   unsigned char * XML_signature = xml_buf->GetBytePtr(0);
    46   if (xml_buf->PrepareBytes(4) < 4) {
     45  unsigned char * sentinel = (unsigned char *) "<![--?>]]>/'>\"><!";
     46  if (xml_buf->PrepareBytes(0, 4) < 4) {
    4747    printf("No XML input document.\n");
    4848    exit(-1);
    4949  }
     50  unsigned char * XML_signature = xml_buf->BytePtr(0);
     51  xml_buf->InstallPadding(sentinel);
     52
    5053  Charset_Family family = Charset_Family_Detect(XML_signature);
    5154  switch (family) {
     
    6871      exit(-1);
    6972  }
    70   rel_EOF_pos = lex-> AdvanceBuffer(0);
     73  avail_code_units = lex-> AdvanceBuffer(0);
    7174}
    7275
     
    7780}
    7881
     82inline unsigned char ParsingEngine::CharAt(int offset) const {
     83  return ((unsigned char *) buf.x8data)[buffer_rel_pos+offset];
     84}
     85
    7986inline unsigned char * ParsingEngine::x8dataPtr(int offset) const {
    8087  return &((unsigned char *) buf.x8data)[buffer_rel_pos+offset];
     
    9198}
    9299
     100
    93101inline bool ParsingEngine::at_EOF () const {
    94   return buffer_rel_pos >= rel_EOF_pos;
     102  return (buffer_rel_pos >= avail_code_units) &&
     103         (avail_code_units < BUFFER_BLOCKS * BLOCKSIZE + LOOKAHEAD_POSITIONS);
    95104}
    96105
     
    104113#endif
    105114    AdvanceToNewBasePosn(buffer_rel_pos);
    106     rel_EOF_pos = lex->AdvanceBuffer(AbsPos());
    107   }
     115    avail_code_units = lex->AdvanceBuffer(AbsPos());
     116  }
     117#endif
     118}
     119
     120inline void ParsingEngine::ASCII_ScanTo(int item) {
     121#ifdef DEBUG_BYTESCAN
     122  int p1 = AbsPos();
     123#endif
     124  switch (item) {
     125    case NonWS: while (ASCII_WS_at(0)) Advance(1); break;
     126    case MarkupStart: while(!AtChar('<') && !AtChar('&') && !at_CDATA_End()) Advance(1); break;
     127    case CD_End_check: while(!at_CDATA_End()) Advance(1); break;
     128    case Hyphen: while(!AtChar('-')) Advance(1); break;
     129    case QMark: while(!AtChar('?')) Advance(1); break;
     130    case DQuote: while(!AtChar('<') && !AtChar('&') && !AtChar('"')) Advance(1); break;
     131    case SQuote: while(!AtChar('<') && !AtChar('&') && !AtChar('\'')) Advance(1); break;
     132    case NameFollow: while(!ASCII_WS_at(0) && !AtChar(';') && !AtChar('/') && !AtChar('>')
     133                      && !AtChar('=') && !AtChar('?')) Advance(1); break;
     134  }
     135#ifdef DEBUG_BYTESCAN
     136  printf("ASCII_ScanTo(%i) %i -> %i\n", item, p1, AbsPos());
    108137#endif
    109138}
    110139
    111140#ifndef OPTIMIZE_SHORT_SCAN
     141#ifdef BYTESPACE_SCAN
     142inline void ParsingEngine::ScanTo(int item) {
     143  ASCII_ScanTo(item);
     144}
     145#endif
     146
     147#ifndef BYTESPACE_SCAN
    112148inline void ParsingEngine::ScanTo(int item) {
    113149  buffer_rel_pos = bitstream_scan(buf.item_stream[item],
     
    119155#endif
    120156    AdvanceToNewBasePosn(buffer_rel_pos);
    121     rel_EOF_pos = lex->AdvanceBuffer(AbsPos());
     157    avail_code_units = lex->AdvanceBuffer(AbsPos());
    122158    buffer_rel_pos = bitstream_scan0(buf.item_stream[item]);
    123159  }
    124160}
    125161#endif
    126 
     162#endif
    127163
    128164#ifdef OPTIMIZE_SHORT_SCAN
     
    144180#endif
    145181      AdvanceToNewBasePosn(buffer_rel_pos);
    146       rel_EOF_pos = lex->AdvanceBuffer(AbsPos());
     182      avail_code_units = lex->AdvanceBuffer(AbsPos());
    147183      buffer_rel_pos = bitstream_scan0(buf.item_stream[item]);
    148184    }
     
    157193}
    158194
    159 inline bool ParsingEngine::S_at(int offset) const {
    160   // true for 0x09, 0x0A, 0x0D, 0x20: the XML1.0 space chars.
    161   return *(x8dataPtr(offset)) <= 0x20;
     195inline bool ParsingEngine::ASCII_WS_at(int offset) const {
     196  unsigned char ch = *(x8dataPtr(offset));
     197  return (ch == 0x20) || (ch == 0x0A) || (ch == 0x0D) || (ch == 0x09);
    162198}
    163199
     
    216252inline bool ParsingEngine::at_XmlDecl_start() const {
    217253  return (s5int64(x8dataPtr(0)) == c5int64('<', '?', 'x', 'm', 'l')) &&
    218          S_at(5);
     254         ASCII_WS_at(5);
    219255}
    220256
     
    579615  xml_info.has_version_decl = true;
    580616  Advance(6);
    581   ScanTo(NonWS);
     617  ASCII_ScanTo(NonWS);
    582618  if (!at_version()) {Error_action(decl_start, AbsPos()); return;}
    583619  Advance(7);
    584   ScanTo(NonWS);
     620  ASCII_ScanTo(NonWS);
    585621  if (!AtChar('=')) {Error_action(decl_start, AbsPos()); return;}
    586622  Advance(1);
    587   ScanTo(NonWS);
     623  ASCII_ScanTo(NonWS);
    588624  if (at_1_0()) xml_info.version = 0;
    589625  else if (at_1_1()) xml_info.version = 1;
     
    591627  Advance(5);
    592628  if (at_PI_End()) {Advance(2); return;}
    593   if (!S_at(0)) {Error_action(decl_start, AbsPos()); return;}
    594   ScanTo(NonWS);
     629  if (!ASCII_WS_at(0)) {Error_action(decl_start, AbsPos()); return;}
     630  ASCII_ScanTo(NonWS);
    595631  if (at_encoding()) {
    596632      xml_info.has_encoding_decl = true;
    597633      Advance(8);
    598       ScanTo(NonWS);
     634      ASCII_ScanTo(NonWS);
    599635      if (!AtChar('=')) {Error_action(decl_start, AbsPos()); return;}
    600636      Advance(1);
    601       ScanTo(NonWS);
     637      ASCII_ScanTo(NonWS);
    602638      xml_info.encoding_start_pos = AbsPos()+1;
    603639      if (AtChar('"')) {
     
    615651      Advance(1);
    616652      if (at_PI_End()) {Advance(2); return;}
    617       if (!S_at(0)) {Error_action(decl_start, AbsPos()); return;}
    618       ScanTo(NonWS);
     653      if (!ASCII_WS_at(0)) {Error_action(decl_start, AbsPos()); return;}
     654      ASCII_ScanTo(NonWS);
    619655  }
    620656  if (at_standalone()) {
    621657      xml_info.has_standalone_decl = true;
    622658      Advance(10);
    623       ScanTo(NonWS);
     659      ASCII_ScanTo(NonWS);
    624660      if (!AtChar('=')) {Error_action(decl_start, AbsPos()); return;}
    625661      Advance(1);
    626       ScanTo(NonWS);
     662      ASCII_ScanTo(NonWS);
    627663      if (at_yes()) {Advance(5); xml_info.standalone = true;}
    628664      else if (at_no()) {Advance(4); xml_info.standalone = false;}
    629665      else {Error_action(decl_start, AbsPos()); return;}
    630   }
    631   ScanTo(NonWS);
     666      ASCII_ScanTo(NonWS);
     667  }
    632668  if (at_PI_End()) {Advance(2); return;}
    633669  else {Error_action(decl_start, AbsPos()); return;}
     
    651687  // version information.
    652688  Advance(6);
    653   ScanTo(NonWS);
     689  ASCII_ScanTo(NonWS);
    654690  if (at_version()) {
    655691    xml_info.has_version_decl = true;
    656692    Advance(7);
    657     ScanTo(NonWS);
     693    ASCII_ScanTo(NonWS);
    658694    if (!AtChar('=')) {Error_action(decl_start, AbsPos()); return;}
    659695    Advance(1);
    660     ScanTo(NonWS);
     696    ASCII_ScanTo(NonWS);
    661697    if (at_1_0()) xml_info.version = 0;
    662698    else if (at_1_1()) xml_info.version = 1;
     
    664700    Advance(5);
    665701    // Must have whitespace character before declaration.
    666     if (!S_at(0)) {Error_action(decl_start, AbsPos()); return;}
    667     ScanTo(NonWS);
     702    if (!ASCII_WS_at(0)) {Error_action(decl_start, AbsPos()); return;}
     703    ASCII_ScanTo(NonWS);
    668704  }
    669705  if (!at_encoding()) {Error_action(decl_start, AbsPos()); return;}
    670706  xml_info.has_encoding_decl = true;
    671707  Advance(8);
    672   ScanTo(NonWS);
     708  ASCII_ScanTo(NonWS);
    673709  if (!AtChar('=')) {Error_action(decl_start, AbsPos()); return;}
    674710  Advance(1);
    675   ScanTo(NonWS);
     711  ASCII_ScanTo(NonWS);
    676712  xml_info.encoding_start_pos = AbsPos()+1;
    677713  if (AtChar('"')) {
     
    688724  xml_info.encoding_end_pos = AbsPos();
    689725  Advance(1);
    690   ScanTo(NonWS);
     726  ASCII_ScanTo(NonWS);
    691727  if (at_PI_End()) {Advance(2); return;}
    692728  else {Error_action(decl_start, AbsPos()); return;}
  • trunk/src/engine.h

    r12 r15  
    4646
    4747                  bool AtChar(unsigned char c) const;
    48                   bool S_at(int offset) const;
     48                  unsigned char CharAt(int offset) const;
     49                  bool ASCII_WS_at(int offset) const;
    4950                  bool at_EndTag_Start() const;  // at "</"
    5051                  bool at_Comment_Start() const;
     
    7374                  /* Mutators that advance the input. */
    7475                  void Advance(int n);
     76                  void ASCII_ScanTo(int lex_item);
    7577                  void ScanTo(int lex_item);
    7678                  void AdvanceToNewBasePosn(int advance_amt);
     
    9294                  int buffer_base_pos;
    9395                  int buffer_rel_pos;
    94                   int rel_EOF_pos;
     96                  int avail_code_units;
    9597
    9698                 /* Parallel data streams for current buffer full of XML data. */
  • trunk/src/xmlbuffer.c

    r4 r15  
    4848}
    4949
    50 int XML_Buffer::PrepareBytes(int bytes_to_prepare) {
    51   int bytes_left = buffer_bytes - current_pos;
     50int XML_Buffer::PrepareBytes(int position, int bytes_to_prepare) {
     51  int bytes_left = buffer_bytes - position;
     52  current_pos = position;
    5253  if (bytes_to_prepare > bytes_left) {
    5354    return bytes_left;
     
    5859}
    5960
    60 unsigned char * XML_Buffer::GetBytePtr(int pos) {
    61   current_pos = pos;
    62   return &ByteBuffer[current_pos];
     61unsigned char * XML_Buffer::BytePtr(int pos) {
     62  return &ByteBuffer[pos];
    6363}
  • trunk/src/xmlbuffer.h

    r4 r15  
    2020        XML_Buffer(char* filename, int pad_bytes);
    2121        void InstallPadding(const unsigned char* pad_string);
    22         int PrepareBytes(int bytes_to_prepare);
    23         unsigned char * GetBytePtr(int pos);
     22        int PrepareBytes(int position, int bytes_to_prepare);
     23        unsigned char * BytePtr(int pos);
    2424private:
    2525        unsigned char * ByteBuffer;
Note: See TracChangeset for help on using the changeset viewer.