source: icXML/icXML-devel/src/icxercesc/internal/XMLReader.hpp @ 3157

Last change on this file since 3157 was 3157, checked in by cameron, 6 years ago

Fixes for icXML 0.9

File size: 33.9 KB
Line 
1/*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements.  See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License.  You may obtain a copy of the License at
8 *
9 *      http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18/*
19 * $Id: XMLReader.hpp 833045 2009-11-05 13:21:27Z borisk $
20 */
21
22#if !defined(XERCESC_INCLUDE_GUARD_XMLREADER_HPP)
23#define XERCESC_INCLUDE_GUARD_XMLREADER_HPP
24
25#include <icxercesc/util/XMLChar.hpp>
26#include <xercesc/framework/XMLErrorCodes.hpp>
27#include <icxercesc/framework/XMLRecognizer.hpp>
28#include <icxercesc/framework/XMLBuffer.hpp>
29#include <xercesc/util/TranscodingException.hpp>
30#include <icxercesc/util/TransService.hpp>
31#include <icxercesc/framework/XMLElementDecl.hpp>
32#include <xercesc/util/BinInputStream.hpp>
33#include <icxercesc/util/TransService.hpp>
34#include <icxmlc/XMLConfig.hpp>
35#include <icxmlc/Array.hpp>
36#include <icxmlc/XMLScanIterator.hpp>
37#include <icxmlc/XMLStreamIterator.hpp>
38#include <iostream>
39
40// NOTE: the real callgrind client-request macros come from <valgrind/callgrind.h>, which is only
41// included when ENABLE_CALLGRIND_PROFILING is defined (valgrind must then be installed).
42#ifdef ENABLE_CALLGRIND_PROFILING
43#include <valgrind/callgrind.h>
44#else
// Profiling disabled: define the macros as empty so that uses of them expand to nothing.
45#define CALLGRIND_START_INSTRUMENTATION
46#define CALLGRIND_STOP_INSTRUMENTATION
47#define CALLGRIND_DUMP_STATS
48#endif
49
50XERCES_CPP_NAMESPACE_BEGIN
51
52class ReaderMgr;
53class InputSource;
54class XMLParser;
55class XMLReferenceTable;
56template <typename XMLScannerType> class XMLParserImpl;
57
58// -----------------------------------------------------------------------------------------------
59
60#if !defined(XERCESC_INCLUDE_GUARD_BININPUTSTREAM_HPP)
61class BinInputStream;
62#endif
63#if !defined(XERCESC_INCLUDE_GUARD_XMLSCANNER_HPP)
64class XMLScanner;
65#endif
66
67// ---------------------------------------------------------------------------
68//  Instances of this class are used to manage the content of entities. The
69//  scanner maintains a stack of these, one for each entity (this means entity
70//  in the sense of any parsed file or internal entity) currently being
71//  scanned. This class, given a binary input stream will handle reading in
72//  the data and decoding it from its external decoding into the internal
73//  Unicode format. Once internalized, this class provides the access
74//  methods to read in the data in various ways, maintains line and column
75//  information, and provides high performance character attribute checking
76//  methods.
77//
78//  This is NOT to be derived from.
79//
80// ---------------------------------------------------------------------------
81
// XMLReader: spools one entity's raw bytes into an XMLCh character buffer and
// offers scanning, peeking and position-query methods over that buffer.
82class XMLPARSER_EXPORT XMLReader : public XMemory
83{
84        friend class ReaderMgr;
85    friend class XMLReferenceTable;
86    friend class XMLParser;
87
88public:
89        // -----------------------------------------------------------------------
90        //  Public types
91        // -----------------------------------------------------------------------
92        enum Types
93        {
94                Type_PE
95                , Type_General
96        };
97
98        enum Sources
99        {
100                Source_Internal
101                , Source_External
102        };
103
104        enum RefFrom
105        {
106                RefFrom_Literal
107                , RefFrom_NonLiteral
108        };
109
110    enum XMLVersion
111    {
112        XMLV1_0
113        , XMLV1_1
114        , XMLV_Unknown
115    };
116
117        // -----------------------------------------------------------------------
118        //  Public, query methods
119        // -----------------------------------------------------------------------
120        bool isAllSpaces
121        (
122                const   XMLCh* const    toCheck
123                , const XMLSize_t       count
124        ) const;
125
126        bool containsWhiteSpace
127        (
128                const   XMLCh* const    toCheck
129                , const XMLSize_t       count
130        ) const;
131
132        bool isXMLLetter(const XMLCh toCheck) const;
133        bool isFirstNameChar(const XMLCh toCheck) const;
134        bool isNameChar(const XMLCh toCheck) const;
135        bool isPlainContentChar(const XMLCh toCheck) const;
136        bool isSpecialStartTagChar(const XMLCh toCheck) const;
137        bool isXMLChar(const XMLCh toCheck) const;
138        bool isWhitespace(const XMLCh toCheck) const;
139        bool isControlChar(const XMLCh toCheck) const;
140        bool isPublicIdChar(const XMLCh toCheck) const;
141        bool isFirstNCNameChar(const XMLCh toCheck) const;
142        bool isNCNameChar(const XMLCh toCheck) const;
143
144        // -----------------------------------------------------------------------
145        //  Constructors and Destructor
146        //
147        //  The three constructors differ only in how the encoding is given:
        //  not at all, as a name string (encodingStr), or as an
        //  XMLRecognizer::Encodings value (encodingEnum).
        // -----------------------------------------------------------------------
147        XMLReader
148        (
149                const   XMLCh* const          pubId
150                , const XMLCh* const          sysId
151                ,       BinInputStream* const streamToAdopt
152                , const RefFrom               from
153                , const Types                 type
154                , const Sources               source
155                , const bool                  throwAtEnd
156                , const bool                  calculateSrcOfs
157                ,       XMLSize_t             lowWaterMark
158                , const XMLVersion            xmlVersion
159                ,       MemoryManager* const  manager
160        );
161
162        XMLReader
163        (
164                const   XMLCh* const          pubId
165                , const XMLCh* const          sysId
166                ,       BinInputStream* const streamToAdopt
167                , const XMLCh* const          encodingStr
168                , const RefFrom               from
169                , const Types                 type
170                , const Sources               source
171                , const bool                  throwAtEnd
172                , const bool                  calculateSrcOfs
173                ,       XMLSize_t             lowWaterMark
174                , const XMLVersion            xmlVersion
175                ,       MemoryManager* const  manager
176        );
177
178        XMLReader
179        (
180                const   XMLCh* const          pubId
181                , const XMLCh* const          sysId
182                ,       BinInputStream* const streamToAdopt
183                , XMLRecognizer::Encodings    encodingEnum
184                , const RefFrom               from
185                , const Types                 type
186                , const Sources               source
187                , const bool                  throwAtEnd
188                , const bool                  calculateSrcOfs
189                ,       XMLSize_t             lowWaterMark
190                , const XMLVersion            xmlVersion
191                ,       MemoryManager* const  manager
192        );
193
194        ~XMLReader();
195
196
197        // -----------------------------------------------------------------------
198        //  Character buffer management methods
199        // -----------------------------------------------------------------------
200        XMLSize_t charsLeftInBuffer() const;
201        bool refreshCharBuffer();
202
203        // -----------------------------------------------------------------------
204        //  Scanning methods
205        // -----------------------------------------------------------------------
206        bool getName(XMLBuffer& toFill, const bool token);
207        bool getQName(XMLBuffer& toFill, int* colonPosition);
208        bool getNCName(XMLBuffer& toFill);
209        bool getNextChar(XMLCh& chGotten);
210        bool getNextCharIfNot(const XMLCh chNotToGet, XMLCh& chGotten);
211        void movePlainContentChars(XMLBuffer &dest);
212        bool getSpaces(XMLBuffer& toFill);
213        bool getUpToCharOrWS(XMLBuffer& toFill, const XMLCh toCheck);
214        bool peekNextChar(XMLCh& chGotten);
215        bool skipIfQuote(XMLCh& chGotten);
216        bool skipSpaces(bool& skippedSomething, bool inDecl = false);
217        bool skippedChar(const XMLCh toSkip);
218    void skipChars(const XMLSize_t charsToSkip);
219        bool skippedSpace();
220        bool skippedString(const XMLCh* const toSkip);   
221        bool skippedStringLong(const XMLCh* toSkip);
222    const XMLCh * peekString(XMLSize_t length);
223        bool peekString(const XMLCh* const toPeek);
224    bool peekString(const XMLCh* const toPeek, XMLSize_t length, bool mustBeFollowedByWhitespace = false);
225
226
227        // -----------------------------------------------------------------------
228        //  Getter methods
229        // -----------------------------------------------------------------------
230        XMLFileLoc getColumnNumber() const;
231        const XMLCh* getEncodingStr() const;
232        XMLFileLoc getLineNumber() const;
233    XMLVersion getXMLVersion() const;
234        bool getNoMoreFlag() const;
235        const XMLCh* getPublicId() const;
236        XMLSize_t getReaderNum() const;
237        RefFrom getRefFrom() const;
238        Sources getSource() const;
239        XMLFilePos getSrcOffset() const;
240        const XMLCh* getSystemId() const;
241        bool getThrowAtEnd() const;
242        Types getType() const;
243
244
245        // -----------------------------------------------------------------------
246        //  Setter methods
247        // -----------------------------------------------------------------------
248        bool setEncoding
249        (
250                const   XMLCh* const    newEncoding
251        );
252        void setReaderNum(const XMLSize_t newNum);
253        void setThrowAtEnd(const bool newValue);
254        void setXMLVersion(const XMLVersion version);
255
256        // ---------------------------------------------------------------------------
257        //  Class Constants
258        //
259        //  kCharBufSize
260        //      The size of the character spool buffer that we use. It's not terribly
261        //      large because it's just getting filled with data from a raw byte
262        //      buffer as we go along. We don't want to decode all the text at
263        //      once before we find out that there is an error.
264        //
265        //      NOTE: This is a size in characters, not bytes.
266        //
267        //  kRawBufSize
268        //      The size of the raw buffer from which raw bytes are spooled out
269        //      as we transcode chunks of data. As it is emptied, it is filled back
270        //      in again from the source stream.
271        // ---------------------------------------------------------------------------
272        enum Constants
273        {
274                kCharBufSize        = 128
275                , kRawBufSize       = BUFFER_BLOCKS * BLOCK_SIZE
276        };
277
278        // -----------------------------------------------------------------------
279        // PARABIX SCANNING FUNCTIONS
280        // -----------------------------------------------------------------------
281
282        template <class XMLScannerType>
283        bool scanDocument( XMLScannerType * const scanner );
284
285        template <class XMLScannerType>
286    void scanFirst(XMLScannerType * const scanner);
287
288        template <class XMLScannerType>
289    bool scanNext();
290
291        void refreshRawBuffer(const XMLSize_t offset);
292
293        void refreshRawBuffer(const XMLSize_t from, const XMLSize_t to);
294
295        void getCurrentLineColumn(XMLFileLoc & line, XMLFileLoc & col) const;
296
297    inline void fill(XMLBuffer & toFill);
298
299private:
300        // -----------------------------------------------------------------------
301        //  Unimplemented constructors and operators
302        // -----------------------------------------------------------------------
303        XMLReader(const XMLReader&);
304        XMLReader& operator=(const XMLReader&);
305
306        // -----------------------------------------------------------------------
307        //  Private helper methods
308        // -----------------------------------------------------------------------
309
310        void doInitCharSizeChecks();
311
312        void doInitDecode();
313
314        bool constructTranscoder();
315
316        bool constructCharacterSetAdapter();
317
318        XMLByte getNextRawByte
319        (
320                const   bool            eoiOk
321        );
322
323        void setTranscoder
324        (
325                const   XMLCh* const    newEncoding
326        );
327
328        XMLSize_t xcodeMoreChars
329        (
330                                XMLCh* const            bufToFill
331                ,       unsigned char* const    charSizes
332                , const XMLSize_t               maxChars
333        );
334
335    void handleEOL(XMLCh & curCh, bool inDecl = false);
336
337        void refreshRawBuffer();
338
339    XMLSize_t calculateRawBufIndex();
340
341        void checkForSwapped();
342
343    void normalizeEOL(XMLCh* const bufToFill, unsigned char* const charSizes, XMLSize_t & charsDone, XMLSize_t & bytesEaten);
344
345        // -----------------------------------------------------------------------
346        //  Data members
347        //
348        //  fCharIndex
349        //      The index into the character buffer. When this hits fCharsAvail
350        //      then it's time to refill.
351        //
352        //  fCharBuf
353        //      A buffer that the reader manager fills up with transcoded
354        //      characters a small amount at a time.
355        //
356        //  fCharsAvail
357        //      The characters currently available in the character buffer.
358        //
359        //  fCharSizeBuf
360        //      This buffer is an array that contains the number of source chars
361        //      eaten to create each char in the fCharBuf buffer. So the entry
362        //      fCharSizeBuf[x] is the number of source chars that were eaten
363        //      to make the internalized char fCharBuf[x]. This only contains
364        //      useful data if fSrcOfsSupported is true.
365        //
366        //  fCharOfsBuf
367        //      This buffer is an array that contains the offset in the
368        //      fRawByteBuf buffer of each char in the fCharBuf buffer. It
369        //      only contains useful data if fSrcOfsSupported is true.
        //      NOTE(review): no fCharOfsBuf member is declared in this class;
        //      this entry appears to be stale.
370        //
371        //  fCurCol
372        //  fCurLine
373        //      The current line and column that we are in within this reader's
374        //      text.
375        //
376        //  fEncoding
377        //      This is the rough encoding setting. This enum is set during
378        //      construction and just tells us the rough family of encoding that
379        //      we are doing.
380        //
381        //  fEncodingStr
382        //      This is the name of the encoding we are using. It will be
383        //      provisionally set during construction, from the auto-sensed
384        //      encoding. But it might be overridden when the XMLDecl is finally
385        //      seen by the scanner. It can also be forced to a particular
386        //      encoding, in which case fForcedEncoding is set.
387        //
388        //  fForcedEncoding
389        //      If the encoding is forced then this is set and all other
390        //      information will be ignored. This encoding will be taken as
391        //      gospel. This is done by calling an alternate constructor.
392        //
393        //  fNoMore
394    //      This is set when the transcoded text is exhausted and no more can
395    //      be obtained from the fStream.
396    //
397    //  fEOF
398    //      This is set when the source input is exhausted.
399        //
400        //  fRawBufIndex
401        //      The current index into the raw byte buffer. When it's equal to
402        //      fRawBytesAvail then we need to read another buffer.
403        //
404        //  fRawByteBuf
405        //      This is the raw byte buffer that is used to spool out bytes
406        //      from into the fCharBuf buffer, as we transcode in blocks.
407        //
408        //  fRawBytesAvail
409        //      The number of bytes currently available in the raw buffer. This
410        //      helps deal with the last buffer's worth, which will usually not
411        //      be a full one.
412        //
413        //  fReaderNum
414        //      Each reader from a particular reader manager (which means from a
415        //      particular document) is given a unique number. The reader manager
416        //      sets these numbers. They are used to catch things like partial
417        //      markup errors.
418        //
419        //  fRefFrom
420        //      This flag is provided in the ctor, and tells us if we represent
421        //      some entity being expanded inside a literal. Sometimes things
422        //      happen differently inside and outside literals.
423        //
424        //  fPublicId
425        //  fSystemId
426        //      These are the system and public ids of the source that this
427        //      reader is reading.
428        //
429        //  fSource
430        //      Indicates whether the content this reader is spooling has already
431        //      been internalized. This will prevent multiple processing of
432        //      whitespace when an already internalized entity is being spooled
433        //      out.
434        //
435        //  fSpareChar
436        //      Some encodings can create two chars in an atomic way, e.g.
437        //      surrogate pairs. We might not be able to store both, so we store
438        //      it here until the next buffer transcoding operation.
439        //
440        //  fSrcOfsBase
441        //      This is the base offset within the source of this entity. Values
442        //      in the current fCharSizeBuf array are relative to this value.
443        //
444        //  fSrcOfsSupported
445        //      This flag is set to indicate whether source byte offset info
446        //      is supported. For intrinsic encodings, it's always set since we
447        //      can always support it. For transcoder based encodings, we ask
448        //      the transcoder if it supports it or not.
449        //
450        //  fStream
451        //      This is the input stream that provides the data for the reader.
452        //      It's always treated as a raw byte stream. The derived class will
453        //      ask for buffers of text from it and will handle making some
454        //      sense of it.
455        //
456        //  fSwapped
457        //      If the encoding is one of the ones we do intrinsically, and it's
458        //      in a different byte order from our native order, then this is
459        //      set to remind us to byte swap it during transcoding.
460        //
461        //  fThrowAtEnd
462        //      Indicates whether the reader manager should throw an end of entity
463        //      exception at the end of this reader instance. This is usually
464        //      set for top level external entity references. It overrides the
465        //      reader manager's global flag that controls throwing at the end
466        //      of entities. Defaults to false.
467        //
468        //  fTranscoder
469        //      If the encoding is not one that we handle intrinsically, then
470        //      we use an external transcoder to do it. This class is an
471        //      abstraction that allows us to use pluggable external transcoding
472        //      services (via XMLTransService in util.)
473        //
474        //  fType
475        //      Indicates whether this reader represents a PE or not. If this
476        //      flag is true and the fInLiteral flag is false, then we will put
477        //      out an extra space at the end.
        //      NOTE(review): no fInLiteral member is declared here; fRefFrom
        //      may be what is meant -- confirm.
478        //
479        //
480        //  fNEL
481    //      Boolean indicates if NELs and LSEPs should be recognized as EOLs
482        //
483        //  fXMLVersion
484        //      Enum to indicate if this Reader is conforming to XML 1.0 or XML 1.1
        //
        //  fRawBytesRead
        //  fCharacterSetAdapter
        //  fMemoryManager
        //  fParser
        //      NOTE(review): these members are declared below but are not
        //      described in this list. Likewise fSpareChar, fSrcOfsSupported
        //      and fCharOfsBuf are described above but not declared below.
485        // -----------------------------------------------------------------------
486
487        XMLSize_t                   fCharIndex;
488    XMLCh                       fCharBuf[kCharBufSize + 1];
489        XMLSize_t                   fCharsAvail;
490    unsigned char               fCharSizeBuf[kCharBufSize + 1];
491        XMLFileLoc                  fCurCol;
492        XMLFileLoc                  fCurLine;
493        XMLRecognizer::Encodings    fEncoding;
494        const XMLCh *               fEncodingStr;
495        bool                        fForcedEncoding;
496        bool                        fNoMore;
497    bool                        fEOF;
498        XMLCh*                      fPublicId;
499        XMLSize_t                   fRawBufIndex;
500        XMLByte                     fRawByteBuf[kRawBufSize + sizeof(BytePack)];
501        XMLSize_t                   fRawBytesAvail;
502        XMLSize_t                   fRawBytesRead;
503        XMLSize_t                   fReaderNum;
504        RefFrom                     fRefFrom;
505        Sources                     fSource;
506        XMLFilePos                  fSrcOfsBase;
507        XMLCh*                      fSystemId;
508        BinInputStream*             fStream;
509        bool                        fSwapped;
510        bool                        fThrowAtEnd;
511        XMLTranscoder*              fTranscoder;
512        XMLCharacterSetAdapter*     fCharacterSetAdapter;
513        Types                       fType;
514        bool                        fNEL;
515        XMLVersion                  fXMLVersion;
516        MemoryManager*              fMemoryManager;
517        // NOTE(review): this comment used to say the parser is "stored as void"; the member is
        // actually typed XMLParser*, presumably so that the templated XMLParserImpl type does
        // not have to be named here -- confirm.
518        XMLParser*                  fParser;
519};
520
521// ---------------------------------------------------------------------------
522//  XMLReader: Public, query methods
523// ---------------------------------------------------------------------------
524inline bool XMLReader::isNameChar(const XMLCh toCheck) const
525{
526    return XMLNameChar::isNameChar(toCheck);
527}
528
529inline bool XMLReader::isNCNameChar(const XMLCh toCheck) const
530{
531    return XMLNameChar::isNameChar(toCheck) && toCheck != chColon;
532}
533
534inline bool XMLReader::isFirstNameChar(const XMLCh toCheck) const
535{
536    return XMLNameChar::isNameStart(toCheck);
537}
538
539inline bool XMLReader::isFirstNCNameChar(const XMLCh toCheck) const
540{
541    return XMLNameChar::isNameStart(toCheck) && toCheck != chColon;
542}
543
544inline bool XMLReader::isXMLChar(const XMLCh toCheck) const
545{
546    return XMLNameChar::isXMLChar(toCheck, fXMLVersion == XMLReader::XMLV1_0);
547}
548
549inline bool XMLReader::isXMLLetter(const XMLCh toCheck) const
550{
551    return isNCNameChar(toCheck) && (toCheck != chUnderscore);
552}
553
554inline bool XMLReader::isWhitespace(const XMLCh toCheck) const
555{
556    return XMLNameChar::isWhitespace(toCheck, !fNEL);
557}
558
559inline bool XMLReader::isControlChar(const XMLCh toCheck) const
560{
561    return (toCheck < 0x20) && XMLNameChar::isXMLChar(toCheck, fXMLVersion == XMLReader::XMLV1_0);
562}
563
564// ---------------------------------------------------------------------------
565//  XMLReader: Buffer management methods
566// ---------------------------------------------------------------------------
567inline XMLSize_t XMLReader::charsLeftInBuffer() const
568{
569        return fCharsAvail - fCharIndex;
570}
571
572// ---------------------------------------------------------------------------
573//  XMLReader: Getter methods
574// ---------------------------------------------------------------------------
575inline XMLFileLoc XMLReader::getColumnNumber() const
576{
577    return fCurCol;
578}
579
580inline const XMLCh* XMLReader::getEncodingStr() const
581{
582        return fEncodingStr;
583}
584
585inline XMLFileLoc XMLReader::getLineNumber() const
586{
587    return fCurLine;
588}
589
590inline XMLReader::XMLVersion XMLReader::getXMLVersion() const
591{
592    return fXMLVersion;
593}
594
595inline bool XMLReader::getNoMoreFlag() const
596{
597    return fNoMore;
598}
599
600inline const XMLCh* XMLReader::getPublicId() const
601{
602        return fPublicId;
603}
604
605inline XMLSize_t XMLReader::getReaderNum() const
606{
607        return fReaderNum;
608}
609
610inline XMLReader::RefFrom XMLReader::getRefFrom() const
611{
612        return fRefFrom;
613}
614
615inline XMLReader::Sources XMLReader::getSource() const
616{
617        return fSource;
618}
619
620inline const XMLCh* XMLReader::getSystemId() const
621{
622        return fSystemId;
623}
624
625inline bool XMLReader::getThrowAtEnd() const
626{
627        return fThrowAtEnd;
628}
629
630inline XMLReader::Types XMLReader::getType() const
631{
632        return fType;
633}
634
635// ---------------------------------------------------------------------------
636//  XMLReader: Setter methods
637// ---------------------------------------------------------------------------
638inline void XMLReader::setReaderNum(const XMLSize_t newNum)
639{
640        fReaderNum = newNum;
641}
642
643inline void XMLReader::setThrowAtEnd(const bool newValue)
644{
645        fThrowAtEnd = newValue;
646}
647
648inline void XMLReader::setXMLVersion(const XMLVersion version)
649{
650        fXMLVersion = version;
651    const bool NEL = (version == XMLV1_1) ? true : XMLChar1_0::enableNEL;
652    if (unlikely(NEL & !fNEL))
653    {
654        XMLSize_t avail = (fCharsAvail - fCharIndex);
655        XMLSize_t bytesEaten;
656        normalizeEOL(&fCharBuf[fCharIndex], &fCharSizeBuf[fCharIndex], avail, bytesEaten);
657        fCharsAvail = fCharIndex + avail;
658    }
659    fNEL = NEL;
660}
661
662// ---------------------------------------------------------------------------
663//  XMLReader: getNextChar() method inlined for speed
664// ---------------------------------------------------------------------------
665inline bool XMLReader::getNextChar(XMLCh& chGotten)
666{
667        //
668        //  See if there is at least a char in the buffer. Else, do the buffer
669        //  reload logic.
670        //
671    if (unlikely(fCharIndex == fCharsAvail))
672        {
673                if (unlikely(!refreshCharBuffer()))
674                        return false;
675        }
676
677        chGotten = fCharBuf[fCharIndex++];
678    handleEOL(chGotten, false);
679        return true;
680}
681
682inline void XMLReader::skipChars(const XMLSize_t charsToSkip)
683{
684    XMLCh temp;
685    for (XMLSize_t c = charsToSkip; c-- && getNextChar(temp); );
686}
687
688// ---------------------------------------------------------------------------
689//  XMLReader: peekNextChar() method inlined for speed
690// ---------------------------------------------------------------------------
691inline bool XMLReader::peekNextChar(XMLCh& chGotten)
692{
693        //  If there is something still in the buffer, get it. Else do the reload
694        //  scenario.
695        //
696    if (unlikely(fCharIndex == fCharsAvail))
697        {
698                // Try to refresh the buffer
699                if (unlikely(!refreshCharBuffer()))
700                {
701                        chGotten = chNull;
702                        return false;
703                }
704        }
705
706        chGotten = fCharBuf[fCharIndex];
707
708        //
709        //  Even though we are only peeking, we have to act the same as the
710        //  normal char get method in regards to newline normalization, though
711        //  its not as complicated as the actual character getting method's.
712        //
713        if ((chGotten == chCR || (fNEL && (chGotten == chNEL || chGotten == chLineSeparator)))
714                && (fSource == Source_External))
715                chGotten = chLF;
716
717        return true;
718}
719
720/***
721 *
722 * XML1.1
723 *
724 * 2.11 End-of-Line Handling
725 *
726 *    XML parsed entities are often stored in computer files which, for editing
727 *    convenience, are organized into lines. These lines are typically separated
728 *    by some combination of the characters CARRIAGE RETURN (#xD) and LINE FEED (#xA).
729 *
730 *    To simplify the tasks of applications, the XML processor MUST behave as if
731 *    it normalized all line breaks in external parsed entities (including the document
732 *    entity) on input, before parsing, by translating all of the following to a single
733 *    #xA character:
734 *
735 *  1. the two-character sequence #xD #xA
736 *  2. the two-character sequence #xD #x85
737 *  3. the single character #x85
738 *  4. the single character #x2028
739 *  5. any #xD character that is not immediately followed by #xA or #x85.
740 *
741 *
742 ***/
743
744inline void XMLReader::normalizeEOL(XMLCh* const bufToFill, unsigned char* const charSizes, XMLSize_t & charsDone, XMLSize_t & bytesEaten)
745{
746    ubitblock deletedAnyCharacters = {0};
747    // do end-of-line normalization on the transcoded input data
748    XMLChIterator<chCR> crItr(bufToFill, charsDone);
749
750    size_t pos = 0;
751
752    if (likely(!fNEL))
753    {
754        // do XML 1.0 end of line normalization
755        while (unlikely(crItr.next()))
756        {
757            pos = crItr.pos();
758            bufToFill[pos++] = chLF;
759            // did this buffer end with a CR?
760            if (unlikely(pos == kCharBufSize))
761            {
762                // push back the CR just to be sure that we arent dealing with a CRLF
763                bytesEaten -= charSizes[--charsDone];
764                DEBUG_MESSAGE(" -- pushback(CR)")
765                break;
766            }
767            if (unlikely(bufToFill[pos] == chLF))
768            {
769                // if we found a {CR,LF}, then mark the LF for deletion
770                #ifdef __ARCH_64
771                deletedAnyCharacters._64[pos >> CONST_LOG_2(64)] |= (static_cast<uint64_t>(1) << (pos & 63));
772                #else
773                deletedAnyCharacters._32[pos >> CONST_LOG_2(32)] |= (static_cast<uint32_t>(1) << (pos & 31));
774                #endif
775            }
776        }
777    }
778    else // if (fScanner.getXMLVersion() == XMLReader::XMLV1_1) // can only be this; would be a decl error otherwise
779    {
780        // do XML 1.1 specific end of line normalization
781        while (unlikely(crItr.next()))
782        {
783            pos = crItr.pos();
784            bufToFill[pos++] = chLF;
785            // did this buffer end with a CR?
786            if (unlikely(pos == kCharBufSize))
787            {
788                // push back the CR just to be sure that we arent dealing with a CRLF
789                bytesEaten -= charSizes[--charsDone];
790                DEBUG_MESSAGE(" -- pushback(CR)")
791                break;
792            }
793            if (unlikely((bufToFill[pos] == chLF) || (bufToFill[pos] == chNEL)))
794            {
795                // if we found a {CR,LF} or a {CR,NEL}, then mark the LF/NEL for deletion
796                #ifdef __ARCH_64
797                deletedAnyCharacters._64[pos >> CONST_LOG_2(64)] |= (static_cast<uint64_t>(1) << (pos & 63));
798                #else
799                deletedAnyCharacters._32[pos >> CONST_LOG_2(32)] |= (static_cast<uint32_t>(1) << (pos & 31));
800                #endif
801            }
802        }
803
804        XMLChIterator2<chNEL, chLineSeparator> nonLfItr(bufToFill, charsDone);
805        while (unlikely(nonLfItr.next()))
806        {
807            bufToFill[nonLfItr.pos()] = chLF;
808        }
809    }
810
811    // remove all deleted characters from the semi-final production
812    if (unlikely(bitblock::any(deletedAnyCharacters._128)))
813    {
814        XMLStreamIterator itr(deletedAnyCharacters);
815        // get the position of the first deleted character
816        itr.next();
817        size_t outputPos = itr.pos();
818        size_t lastPos;
819        // now scan for any other deleted characters
820        for (; lastPos = itr.pos() + 1, itr.next(); )
821        {
822            const size_t len = itr.pos() - lastPos;
823            charSizes[outputPos - 1] += charSizes[lastPos - 1];
824            Array<unsigned char>::move(&charSizes[lastPos], &charSizes[outputPos], len);
825
826            Array<XMLCh>::move(&bufToFill[lastPos], &bufToFill[outputPos], len);
827
828            outputPos += len;
829        }
830        // no more deleted characters; just append the remainder of the string
831        const size_t len = charsDone - lastPos;
832        charSizes[outputPos - 1] += charSizes[lastPos - 1];
833        Array<unsigned char>::move(&charSizes[lastPos], &charSizes[outputPos], len);
834        Array<XMLCh>::move(&bufToFill[lastPos], &bufToFill[outputPos], len);               
835        charsDone = outputPos + len;
836    }
837}
838
839inline XMLSize_t XMLReader::calculateRawBufIndex()
840{
841    // See if we have any existing chars.
842    XMLSize_t spareChars = fCharsAvail - fCharIndex;
843    XMLSize_t rawBufIndex = fRawBufIndex;
844    //
845    //  If there are spare chars, then adjust the raw buffer index
846    //
847    if (spareChars)
848    {
849        XMLSize_t offset = fCharSizeBuf[fCharIndex];
850        for (XMLSize_t index = fCharIndex + 1; --spareChars; index++)
851        {
852            offset += fCharSizeBuf[index];
853        }
854        rawBufIndex -= offset;
855        DEBUG_MESSAGE(" --- fRawBufIndex'=" << fRawBufIndex)
856    }
857    return rawBufIndex;
858}
859
860// ---------------------------------------------------------------------------------------
861
862XERCES_CPP_NAMESPACE_END
863
864#include <icxmlc/XMLParser.hpp>
865
866XERCES_CPP_NAMESPACE_BEGIN
867
868// ---------------------------------------------------------------------------------------
869
870template<class XMLScannerType>
871bool
872XMLReader::scanDocument( XMLScannerType * const scanner )
873{
874    DEBUG_MESSAGE("XMLReader::scanDocument()")
875
876    // construct the actual XMLParser object to parse the ELEMENT and MISC nodes
877    XMLParserImpl<XMLScannerType> parser(scanner, fMemoryManager);
878
879    //  Scan the PROLOG part, which is everything before the root element
880        //  including the DTD subsets.
881        scanner->scanProlog();
882
883    // now construct the character set adapter
884    if (unlikely(!constructCharacterSetAdapter()))
885    {
886        scanner->emitError(XMLErrs::BadXMLEncoding, fEncodingStr);
887        return 0;
888    }
889
890    refreshRawBuffer(calculateRawBufIndex(), 0);
891
892    parser.init(fCharacterSetAdapter, fTranscoder, fCurLine, fCurCol, fXMLVersion);
893
894        fParser = (XMLParser*)&parser;
895
896        CALLGRIND_START_INSTRUMENTATION;
897
898    DEBUG_TRANSITION_MESSAGE("######################################################################");
899    DEBUG_TRANSITION_MESSAGE("SCANNING ELEMENT(S)")
900    DEBUG_TRANSITION_MESSAGE("######################################################################");
901
902    for (;;)
903    {
904        // transform the new raw data into symbol and content streams
905        parser.preScanDocumentPage(fRawByteBuf, fRawBytesAvail, fEOF, this);
906
907        parser.buildElementPage();
908
909        if (unlikely(!parser.scanElementPage())) break;
910
911        parser.prepareForNextDocumentPage();
912    }
913
914    DEBUG_TRANSITION_MESSAGE("######################################################################");
915    DEBUG_TRANSITION_MESSAGE("SCANNING MISCELLANEOUS")
916    DEBUG_TRANSITION_MESSAGE("######################################################################");
917
918    while (unlikely(parser.scanMiscellaneousPage()))
919    {
920        parser.prepareForNextDocumentPage();
921
922        // transform the new raw data into symbol and content streams
923        parser.preScanDocumentPage(fRawByteBuf, fRawBytesAvail, fEOF, this);
924    }
925
926
927        CALLGRIND_STOP_INSTRUMENTATION;
928        CALLGRIND_DUMP_STATS;
929
930        fNoMore = true;
931
932        fParser = 0;
933
934        return 1;
935}
936
937// -------------------------------------------------------------------------------------
938
939template <class XMLScannerType>
940IDISA_ALWAYS_INLINE
941void XMLReader::scanFirst( XMLScannerType * const scanner )
942{
943    DEBUG_MESSAGE("XMLReader::scanFirst()")
944
945    // construct the actual XMLParser object to parse the ELEMENT and MISC nodes
946    XMLParserImpl<XMLScannerType> * parser = new XMLParserImpl<XMLScannerType>(scanner, fMemoryManager);
947
948    //  Scan the PROLOG part, which is everything before the root element
949        //  including the DTD subsets.
950        scanner->scanProlog();
951
952        // now construct the character set adapter
953        if (unlikely(!constructCharacterSetAdapter()))
954        {
955                scanner->emitError(XMLErrs::BadXMLEncoding, fEncodingStr);
956        return;
957        }
958
959    refreshRawBuffer(calculateRawBufIndex(), 0);
960
961    parser->init(fCharacterSetAdapter, fTranscoder, fCurLine, fCurCol, fXMLVersion);
962
963    fParser = reinterpret_cast<XMLParser*>(parser);
964
965        // transform the new raw data into symbol and content streams
966    parser->preScanDocumentPage(fRawByteBuf, fRawBytesAvail, fEOF, this);
967
968    parser->buildElementPage();
969}
970
971// -------------------------------------------------------------------------------------
972
973template <class XMLScannerType>
974IDISA_ALWAYS_INLINE
975bool XMLReader::scanNext()
976{
977    XMLParserImpl<XMLScannerType> * const parser = reinterpret_cast<XMLParserImpl<XMLScannerType>*>(fParser);
978
979    if (likely(parser->scanNext()))
980    {
981        return true;
982    }
983
984    if (likely(parser->inElement()))
985    {
986        DEBUG_MESSAGE(" **** CONTINUE ELEMENT SPAN **** ")
987
988        parser->prepareForNextDocumentPage();
989
990        parser->preScanDocumentPage(fRawByteBuf, fRawBytesAvail, fEOF, this);
991
992        parser->buildElementPage();
993
994        if (likely(parser->scanNext()))
995        {
996            return true;
997        }
998    }
999
1000    DEBUG_MESSAGE(" **** BEGIN MISCELLANEOUS SPAN **** ")
1001
1002    while (unlikely(parser->scanMiscellaneousPage()))
1003    {
1004        // if there is no new data; we're done (and the file is likely in error)
1005        if (likely(fEOF)) break;
1006
1007        parser->prepareForNextDocumentPage();
1008
1009        // transform the new raw data into symbol and content streams
1010        parser->preScanDocumentPage(fRawByteBuf, fRawBytesAvail, fEOF, this);
1011    }
1012
1013    delete fParser;
1014    fParser = 0;
1015    fNoMore = true;
1016    return 0;
1017}
1018
1019inline
1020void XMLReader::getCurrentLineColumn(XMLFileLoc & line, XMLFileLoc & col) const
1021{
1022        if (likely(fParser != NULL))
1023        {
1024                fParser->getCurrentLineCol(line, col);
1025        }
1026        else // we're in Xerces prolog mode!
1027        {
1028                line = fCurLine;
1029                col = fCurCol;
1030        }
1031}
1032
1033void XMLReader::fill(XMLBuffer & toFill)
1034{
1035    DEBUG_MESSAGE("XMLReader::fill(toFill) fCharIndex=" << fCharIndex << ", fCharsAvail=" << fCharsAvail)
1036
1037    do
1038    {
1039        toFill.append(&fCharBuf[fCharIndex], fCharsAvail - fCharIndex);
1040        fCharIndex = fCharsAvail;
1041    }
1042    while (refreshCharBuffer());
1043}
1044
1045// ---------------------------------------------------------------------------
1046// DEPRECATED FUNCTIONS
1047// ---------------------------------------------------------------------------
1048inline void XMLReader::movePlainContentChars(XMLBuffer &)
1049{
1050        DEPRECATED_FEATURE_IN_ICXML;
1051}
1052
1053inline bool XMLReader::getNextCharIfNot(const XMLCh, XMLCh &)
1054{
1055        DEPRECATED_FEATURE_IN_ICXML;
1056}
1057
1058inline bool XMLReader::isPlainContentChar(const XMLCh toCheck) const
1059{
1060    DEPRECATED_FEATURE_IN_ICXML;
1061}
1062
1063inline bool XMLReader::isSpecialStartTagChar(const XMLCh toCheck) const
1064{
1065    DEPRECATED_FEATURE_IN_ICXML;
1066}
1067
1068#undef CHAR_SIZE
1069#undef SYMBOL_POSITION
1070#undef SYMBOL_ARRAY_SIZE
1071#undef CONSTRUCT_CHARACTER_SET_ADAPTER
1072
1073XERCES_CPP_NAMESPACE_END
1074
1075#endif
Note: See TracBrowser for help on using the repository browser.