Changeset 11 for trunk/src


Ignore:
Timestamp:
Dec 24, 2007, 8:52:08 AM (11 years ago)
Author:
cameron
Message:

Reading XML declaration: version/encoding.

Location:
trunk/src
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • trunk/src/engine.c

    r9 r11  
    1717#include <errno.h>
    1818#include <string.h>
    19 #include <sys/types.h>
    20 #include <sys/stat.h>
    21 
    22 char sentinel[] = "<]]>?>-->'>\">";
    2319
    2420ParsingEngine::ParsingEngine (char * filename) {
     
    4642  buffer_base_pos = 0;
    4743  buffer_rel_pos = 0;
    48   xml_buf = new XML_Buffer::XML_Buffer(filename, 4*strlen(sentinel));
     44  xml_buf = new XML_Buffer::XML_Buffer(filename, BLOCKSIZE);
    4945}
    5046
     
    164160}
    165161
     162inline bool ParsingEngine::S_at(int offset) const {
     163  // Intended for 0x09, 0x0A, 0x0D, 0x20 (the XML1.0 space chars); the test below is true for ANY byte value <= 0x20.
     164  return *(x8dataPtr(offset)) <= 0x20;  // NOTE(review): presumably other control bytes are rejected elsewhere and the data is unsigned -- confirm.
     165}
     166
    166167#include "multiliteral.h"
    167168/* Now the XML recognizers. */
     
    215216  return s2int16(x8dataPtr(0)) == c2int16('/', '>');
    216217}
     218
     219inline bool ParsingEngine::at_XmlDecl_start() const {  // True at "<?xml" when the byte at offset 5 also passes S_at.
     220  return (s5int64(x8dataPtr(0)) == c5int64('<', '?', 'x', 'm', 'l')) &&
     221         S_at(5);  // the S_at check keeps longer names that merely begin with "xml" from matching
     222}
     223
     224inline bool ParsingEngine::at_version() const {  // True when the seven bytes at the current position spell "version".
     225  return s7int64(x8dataPtr(0)) == c7int64('v', 'e', 'r', 's', 'i', 'o', 'n');
     226}
     227
     228inline bool ParsingEngine::at_1_0() const {  // True at the five-byte quoted literal "1.0" or '1.0' (quotes included).
     229  return (s5int64(x8dataPtr(0)) == c5int64('"', '1', '.', '0', '"')) ||
     230         (s5int64(x8dataPtr(0)) == c5int64('\'', '1', '.', '0', '\''));
     231}
     232
     233inline bool ParsingEngine::at_1_1() const {  // True at the five-byte quoted literal "1.1" or '1.1' (quotes included).
     234  return (s5int64(x8dataPtr(0)) == c5int64('"', '1', '.', '1', '"')) ||
     235         (s5int64(x8dataPtr(0)) == c5int64('\'', '1', '.', '1', '\''));
     236}
     237
     238inline bool ParsingEngine::at_encoding() const {  // True when the eight bytes at the current position spell "encoding".
     239  return s8int64(x8dataPtr(0)) == c8int64('e', 'n', 'c', 'o', 'd', 'i', 'n', 'g');
     240}
     241
     242inline bool ParsingEngine::at_standalone() const {  // True when the ten bytes at the current position spell "standalone".
     243  return (s8int64(x8dataPtr(0)) == c8int64('s', 't', 'a', 'n', 'd', 'a', 'l', 'o')) &  // bitwise &: both comparisons are always evaluated
     244         (s2int16(x8dataPtr(8)) == c2int16('n', 'e'));  // remaining two bytes, read at offset 8
     245}
     246
     247inline bool ParsingEngine::at_yes() const {  // True at the five-byte quoted literal "yes" or 'yes' (quotes included).
     248  return (s5int64(x8dataPtr(0)) == c5int64('"', 'y', 'e', 's', '"')) |  // bitwise |: both comparisons are always evaluated
     249         (s5int64(x8dataPtr(0)) == c5int64('\'', 'y', 'e', 's', '\''));
     250}
     251
     252inline bool ParsingEngine::at_no() const {  // True at the four-byte quoted literal "no" or 'no' (quotes included).
     253  return (s4int32(x8dataPtr(0)) == c4int32('"', 'n', 'o', '"')) |  // bitwise |: both comparisons are always evaluated
     254         (s4int32(x8dataPtr(0)) == c4int32('\'', 'n', 'o', '\''));
     255}
     256
     257
     258
    217259
    218260
     
    316358        int target_start = AbsPos();
    317359        ScanTo(NameFollow);  /* Name delimiter */
     360        // Check for illegal [Xx][Mm][Ll] target.
     361        if ((AbsPos() - markup_start == 5) &&
     362            ((s3int32(x8dataPtr(-3)) | c3int32(0x20, 0x20, 0x20)) == c3int32('x', 'm', 'l'))) {
     363                Error_action(markup_start, AbsPos());
     364                return;
     365        }
    318366        PI_Target_action(target_start, AbsPos());
    319367        ScanTo(QMark);
     
    500548#endif
    501549}
     550
     551// ReadXmlInfo: skips any byte order mark, then parses an optional XML declaration into xml_info.
     552// The following does not yet validate the syntax of EncNames.
     553// EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
     554// Future approach: first use lookup in EncNameTable,
     555//           if not found, do case convert, try again,
     556//             (avoids cost of case convert normally)
     557//           if not found, validate syntax of EncNames,
     558//           report error or EncName unknown.
     559// On a syntax error, Error_action(decl_start, AbsPos()) is called and the function returns early.
     560void ParsingEngine::ReadXmlInfo(Entity_Declaration_Info& xml_info) {
     561  int BOM = lex->BOM_size(0);  // byte order mark size reported by the lexer for position 0 (presumably in bytes -- confirm)
     562  xml_info.has_ByteOrderMark = BOM > 0;
     563  xml_info.has_version_decl = false;  // only the has_* flags are reset here; version, encoding positions and standalone are not
     564  xml_info.has_encoding_decl = false;
     565  xml_info.has_standalone_decl = false;
     566  Advance(BOM);  // step over the byte order mark, if any
     567  int decl_start = AbsPos();  // start position passed to every Error_action call below
     568  // It is possible that there is no XML declaration.
     569  if (!at_XmlDecl_start()) return;
     570  // Otherwise, the XML declaration exists and must have
     571  // at least version information.
     572  xml_info.has_version_decl = true;  // NOTE(review): set before the version is validated, so it stays true on the error returns below
     573  Advance(6);  // "<?xml" plus the whitespace byte checked by at_XmlDecl_start
     574  ScanTo(NonWS);  // advance to the next NonWS item (presumably the next non-whitespace byte -- confirm against ScanTo)
     575  if (!at_version()) {Error_action(decl_start, AbsPos()); return;}
     576  Advance(7);  // length of "version"
     577  ScanTo(NonWS);
     578  if (!AtChar('=')) {Error_action(decl_start, AbsPos()); return;}
     579  Advance(1);
     580  ScanTo(NonWS);
     581  if (at_1_0()) xml_info.version = 0;  // version is stored as 0 for "1.0"
     582  else if (at_1_1()) xml_info.version = 1;  // and as 1 for "1.1"
     583  else {Error_action(decl_start, AbsPos()); return;}
     584  Advance(5);  // the quoted version literal is five bytes, quotes included
     585  xml_info.has_version_decl = true;  // NOTE(review): redundant; the flag was already set above
     586  if (at_PI_End()) {Advance(2); return;}  // declaration ends after the version (at_PI_End presumably matches "?>"; two bytes are consumed)
     587  if (!S_at(0)) {Error_action(decl_start, AbsPos()); return;}  // otherwise a whitespace byte must follow the version value
     588  ScanTo(NonWS);
     589  if (at_encoding()) {
     590      xml_info.has_encoding_decl = true;
     591      Advance(8);  // length of "encoding"
     592      ScanTo(NonWS);
     593      if (!AtChar('=')) {Error_action(decl_start, AbsPos()); return;}
     594      Advance(1);
     595      ScanTo(NonWS);
     596      xml_info.encoding_start_pos = AbsPos()+1;  // position just past the opening quote; assigned before the quote itself is checked
     597      if (AtChar('"')) {
     598        Advance(1);
     599        ScanTo(DQuote);
     600        if (!AtChar('"')) {Error_action(decl_start, AbsPos()); return;}
     601      }
     602      else if (AtChar('\'')) {
     603        Advance(1);
     604        ScanTo(SQuote);
     605        if (!AtChar('\'')) {Error_action(decl_start, AbsPos()); return;}
     606      }
     607      else {Error_action(decl_start, AbsPos()); return;}
     608      xml_info.encoding_end_pos = AbsPos();  // position of the closing quote, i.e. one past the last byte of the name
     609      Advance(1);  // step over the closing quote
     610      if (at_PI_End()) {Advance(2); return;}
     611      if (!S_at(0)) {Error_action(decl_start, AbsPos()); return;}  // whitespace is required before anything else in the declaration
     612      ScanTo(NonWS);
     613  }
     614  if (at_standalone()) {
     615      xml_info.has_standalone_decl = true;
     616      Advance(10);  // length of "standalone"
     617      ScanTo(NonWS);
     618      if (!AtChar('=')) {Error_action(decl_start, AbsPos()); return;}
     619      Advance(1);
     620      ScanTo(NonWS);
     621      if (at_yes()) {Advance(5); xml_info.standalone = true;}  // quoted "yes" is five bytes
     622      else if (at_no()) {Advance(4); xml_info.standalone = false;}  // quoted "no" is four bytes
     623      else {Error_action(decl_start, AbsPos()); return;}
     624  }
     625  ScanTo(NonWS);  // skip to the next NonWS item before testing for the closing delimiter
     626  if (at_PI_End()) {Advance(2); return;}
     627  else {Error_action(decl_start, AbsPos()); return;}  // anything other than the end delimiter here is an error
     628}
  • trunk/src/engine.h

    r7 r11  
    1111#include "bitlex.h"
    1212
     13// Information about the character set encoding, XML version and
     14// standalone status of an XML entity.  Filled in by ParsingEngine::ReadXmlInfo.
     15struct Entity_Declaration_Info
     16  {bool has_ByteOrderMark;  // true when the lexer reports a byte order mark of nonzero size
     17   bool has_version_decl;  // set once an XML declaration start has been recognized
     18   bool has_encoding_decl;  // set when an "encoding" keyword is found in the declaration
     19   bool has_standalone_decl;  // set when a "standalone" keyword is found in the declaration
     20   int version;  // 0 for "1.0", 1 for "1.1"; not assigned by ReadXmlInfo on other paths
     21   int encoding_start_pos;  // position just past the opening quote of the encoding name
     22   int encoding_end_pos;  // position of the closing quote of the encoding name
     23   bool standalone;  // true for "yes", false for "no"; only assigned when a value is recognized
     24};
     25
     26
    1327/* A ParsingEngine is the principal class for parsing XML
    1428data.  */
     
    1933                  void ParseContent();
    2034                  void InitLexer();
     35                  void ReadXmlInfo (Entity_Declaration_Info& xml_info);
    2136          protected:
    2237
     
    3045
    3146                  bool AtChar(unsigned char c) const;
     47                  bool S_at(int offset) const;
    3248                  bool at_EndTag_Start() const;  // at "</"
    3349                  bool at_Comment_Start() const;
     
    4359                  bool at_EmptyElementDelim() const;
    4460                  bool at_ElementTag_Start() const;
     61                  bool at_XmlDecl_start() const;
     62                  bool at_version() const;
     63                  bool at_1_0() const;
     64                  bool at_1_1() const;
     65                  bool at_encoding() const;
     66                  bool at_standalone() const;
     67                  bool at_yes() const;
     68                  bool at_no() const;
    4569
    4670                  bool at_EOF () const;
     
    5074                  void ScanTo(int lex_item);
    5175                  void AdvanceToNewBasePosn(int advance_amt);
    52 
    5376
    5477                  /* Parsing routines. */
  • trunk/src/multiliteral.h

    r4 r11  
    6060}
    6161
    62 /*  Specialized helpers for 3 and 5 character combinations. */
     62/*  Specialized helpers for 3, 5, 6, and 7 character combinations. */
    6363
    6464static inline uint32_t c3int32(unsigned char c1, unsigned char c2,
     
    7171                               unsigned char c5) {
    7272  return c8int64(c1, c2, c3, c4, c5, 0, 0, 0);
     73}
     74
      75static inline uint64_t c6int64(unsigned char c1, unsigned char c2,  /* Packs six characters into a uint64_t via c8int64. */
      76                               unsigned char c3, unsigned char c4,
      77                               unsigned char c5, unsigned char c6) {
      78  return c8int64(c1, c2, c3, c4, c5, c6, 0, 0);  /* the two unused positions are filled with 0 */
      79}
     80
      81static inline uint64_t c7int64(unsigned char c1, unsigned char c2,  /* Packs seven characters into a uint64_t via c8int64. */
      82                               unsigned char c3, unsigned char c4,
      83                               unsigned char c5, unsigned char c6,
      84                               unsigned char c7) {
      85  return c8int64(c1, c2, c3, c4, c5, c6, c7, 0);  /* the one unused position is filled with 0 */
    7386}
    7487
     
    98111}
    99112
     113static inline uint64_t s6int64(unsigned char s[]) {  /* s8int64 of s, masked to the six positions that c6int64 populates. */
     114  return s8int64(s) & c6int64(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);  /* result is directly comparable with a c6int64 constant */
     115}
     116
     117static inline uint64_t s7int64(unsigned char s[]) {  /* s8int64 of s, masked to the seven positions that c7int64 populates. */
     118  return s8int64(s) & c7int64(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);  /* result is directly comparable with a c7int64 constant */
     119}
     120
    100121#endif
Note: See TracChangeset for help on using the changeset viewer.