source: icXML/icXML-devel/src/icxercesc/internal/XMLReader.hpp

Last change on this file was 3564, checked in by cameron, 6 years ago

Changes to icxercesc files

File size: 33.3 KB
Line 
1/*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements.  See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License.  You may obtain a copy of the License at
8 *
9 *      http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18/*
19 * $Id: XMLReader.hpp 833045 2009-11-05 13:21:27Z borisk $
20 */
21
22#if !defined(XERCESC_INCLUDE_GUARD_XMLREADER_HPP)
23#define XERCESC_INCLUDE_GUARD_XMLREADER_HPP
24
25#include <icxercesc/util/XMLChar.hpp>
26#include <xercesc/framework/XMLErrorCodes.hpp>
27#include <icxercesc/framework/XMLRecognizer.hpp>
28#include <icxercesc/framework/XMLBuffer.hpp>
29#include <xercesc/util/TranscodingException.hpp>
30#include <icxercesc/util/TransService.hpp>
31#include <icxercesc/framework/XMLElementDecl.hpp>
32#include <xercesc/util/BinInputStream.hpp>
33#include <icxercesc/util/TransService.hpp>
34#include <icxmlc/XMLConfig.hpp>
35#include <icxmlc/Array.hpp>
36#include <icxmlc/XMLScanIterator.hpp>
37#include <icxmlc/XMLStreamIterator.hpp>
38#include <icxmlc/bit_vector_t.hpp>
39#include <iostream>
40
41// WARNING: callgrind requires that valgrind is installed on your system. This include is not
42// necessary to use callgrind.
43#ifdef ENABLE_CALLGRIND_PROFILING
44#include <valgrind/callgrind.h>
45#else
46#define CALLGRIND_START_INSTRUMENTATION
47#define CALLGRIND_STOP_INSTRUMENTATION
48#define CALLGRIND_DUMP_STATS
49#endif
50
51XERCES_CPP_NAMESPACE_BEGIN
52
53class ReaderMgr;
54class InputSource;
55class XMLParser;
56class XMLReferenceTable;
57template <typename XMLScannerType> class XMLParserImpl;
58
59// -----------------------------------------------------------------------------------------------
60
61#if !defined(XERCESC_INCLUDE_GUARD_BININPUTSTREAM_HPP)
62class BinInputStream;
63#endif
64#if !defined(XERCESC_INCLUDE_GUARD_XMLSCANNER_HPP)
65class XMLScanner;
66#endif
67
68// ---------------------------------------------------------------------------
69//  Instances of this class are used to manage the content of entities. The
70//  scanner maintains a stack of these, one for each entity (this means entity
71//  in the sense of any parsed file or internal entity) currently being
72//  scanned. This class, given a binary input stream will handle reading in
73//  the data and decoding it from its external decoding into the internal
74//  Unicode format. Once internalized, this class provides the access
75//  methods to read in the data in various ways, maintains line and column
76//  information, and provides high performance character attribute checking
77//  methods.
78//
79//  This is NOT to be derived from.
80//
81// ---------------------------------------------------------------------------
82
// XMLReader: declaration of the per-entity reader. The class is non-copyable
// (copy constructor and assignment are declared private and unimplemented) and
// grants ReaderMgr, XMLReferenceTable and XMLParser access to its private state.
83class XMLPARSER_EXPORT XMLReader : public XMemory
84{
85        friend class ReaderMgr;
86    friend class XMLReferenceTable;
87    friend class XMLParser;
88
89public:
90        // -----------------------------------------------------------------------
91        //  Public types
92        // -----------------------------------------------------------------------
        // Kind of entity this reader represents (see fType below).
93        enum Types
94        {
95                Type_PE
96                , Type_General
97        };
98
        // Whether the content being spooled was already internalized (see fSource below).
99        enum Sources
100        {
101                Source_Internal
102                , Source_External
103        };
104
        // Whether the entity is being expanded inside a literal (see fRefFrom below).
105        enum RefFrom
106        {
107                RefFrom_Literal
108                , RefFrom_NonLiteral
109        };
110
        // XML version this reader conforms to (see fXMLVersion below).
111    enum XMLVersion
112    {
113        XMLV1_0
114        , XMLV1_1
115        , XMLV_Unknown
116    };
117
118        // -----------------------------------------------------------------------
119        //  Public, query methods
120        // -----------------------------------------------------------------------
121        bool isAllSpaces
122        (
123                const   XMLCh* const    toCheck
124                , const XMLSize_t       count
125        ) const;
126
127        bool containsWhiteSpace
128        (
129                const   XMLCh* const    toCheck
130                , const XMLSize_t       count
131        ) const;
132
133        bool isXMLLetter(const XMLCh toCheck) const;
134        bool isFirstNameChar(const XMLCh toCheck) const;
135        bool isNameChar(const XMLCh toCheck) const;
136        bool isPlainContentChar(const XMLCh toCheck) const;
137        bool isSpecialStartTagChar(const XMLCh toCheck) const;
138        bool isXMLChar(const XMLCh toCheck) const;
139        bool isWhitespace(const XMLCh toCheck) const;
140        bool isControlChar(const XMLCh toCheck) const;
141        bool isPublicIdChar(const XMLCh toCheck) const;
142        bool isFirstNCNameChar(const XMLCh toCheck) const;
143        bool isNCNameChar(const XMLCh toCheck) const;
144
145        // -----------------------------------------------------------------------
146        //  Constructors and Destructor
147        //
148        //  The three constructors differ only in how the encoding is supplied:
        //  not at all, as a string (encodingStr), or as an
        //  XMLRecognizer::Encodings value (encodingEnum).
        // -----------------------------------------------------------------------
148        XMLReader
149        (
150                const   XMLCh* const          pubId
151                , const XMLCh* const          sysId
152                ,       BinInputStream* const streamToAdopt
153                , const RefFrom               from
154                , const Types                 type
155                , const Sources               source
156                , const bool                  throwAtEnd
157                , const bool                  calculateSrcOfs
158                ,       XMLSize_t             lowWaterMark
159                , const XMLVersion            xmlVersion
160                ,       MemoryManager* const  manager
161        );
162
163        XMLReader
164        (
165                const   XMLCh* const          pubId
166                , const XMLCh* const          sysId
167                ,       BinInputStream* const streamToAdopt
168                , const XMLCh* const          encodingStr
169                , const RefFrom               from
170                , const Types                 type
171                , const Sources               source
172                , const bool                  throwAtEnd
173                , const bool                  calculateSrcOfs
174                ,       XMLSize_t             lowWaterMark
175                , const XMLVersion            xmlVersion
176                ,       MemoryManager* const  manager
177        );
178
179        XMLReader
180        (
181                const   XMLCh* const          pubId
182                , const XMLCh* const          sysId
183                ,       BinInputStream* const streamToAdopt
184                , XMLRecognizer::Encodings    encodingEnum
185                , const RefFrom               from
186                , const Types                 type
187                , const Sources               source
188                , const bool                  throwAtEnd
189                , const bool                  calculateSrcOfs
190                ,       XMLSize_t             lowWaterMark
191                , const XMLVersion            xmlVersion
192                ,       MemoryManager* const  manager
193        );
194
195        ~XMLReader();
196
197
198        // -----------------------------------------------------------------------
199        //  Character buffer management methods
200        // -----------------------------------------------------------------------
201        XMLSize_t charsLeftInBuffer() const;
202        bool refreshCharBuffer();
203
204        // -----------------------------------------------------------------------
205        //  Scanning methods
206        // -----------------------------------------------------------------------
207        bool getName(XMLBuffer& toFill, const bool token);
208        bool getQName(XMLBuffer& toFill, int* colonPosition);
209        bool getNCName(XMLBuffer& toFill);
210        bool getNextChar(XMLCh& chGotten);
211        bool getNextCharIfNot(const XMLCh chNotToGet, XMLCh& chGotten);
212        void movePlainContentChars(XMLBuffer &dest);
213        bool getSpaces(XMLBuffer& toFill);
214        bool getUpToCharOrWS(XMLBuffer& toFill, const XMLCh toCheck);
215        bool peekNextChar(XMLCh& chGotten);
216        bool skipIfQuote(XMLCh& chGotten);
217        bool skipSpaces(bool& skippedSomething, bool inDecl = false);
218        bool skippedChar(const XMLCh toSkip);
219    void skipChars(const XMLSize_t charsToSkip);
220        bool skippedSpace();
221        bool skippedString(const XMLCh* const toSkip);   
222        bool skippedStringLong(const XMLCh* toSkip);
223    const XMLCh * peekString(XMLSize_t length);
224        bool peekString(const XMLCh* const toPeek);
225    bool peekString(const XMLCh* const toPeek, XMLSize_t length, bool mustBeFollowedByWhitespace = false);
226
227
228        // -----------------------------------------------------------------------
229        //  Getter methods
230        // -----------------------------------------------------------------------
231        XMLFileLoc getColumnNumber() const;
232        const XMLCh* getEncodingStr() const;
233        XMLFileLoc getLineNumber() const;
234    XMLVersion getXMLVersion() const;
235        bool getNoMoreFlag() const;
236        const XMLCh* getPublicId() const;
237        XMLSize_t getReaderNum() const;
238        RefFrom getRefFrom() const;
239        Sources getSource() const;
240        XMLFilePos getSrcOffset() const;
241        const XMLCh* getSystemId() const;
242        bool getThrowAtEnd() const;
243        Types getType() const;
244
245
246        // -----------------------------------------------------------------------
247        //  Setter methods
248        // -----------------------------------------------------------------------
249        bool setEncoding
250        (
251                const   XMLCh* const    newEncoding
252        );
253        void setReaderNum(const XMLSize_t newNum);
254        void setThrowAtEnd(const bool newValue);
255        void setXMLVersion(const XMLVersion version);
256
257        // ---------------------------------------------------------------------------
258        //  Class Constants
259        //
260        //  kCharBufSize
261        //      The size of the character spool buffer that we use. It's not terribly
262        //      large because it's just getting filled with data from a raw byte
263        //      buffer as we go along. We don't want to decode all the text at
264        //      once before we find out that there is an error.
265        //
266        //      NOTE: This is a size in characters, not bytes.
267        //
268        //  kRawBufSize
269        //      The size of the raw buffer from which raw bytes are spooled out
270        //      as we transcode chunks of data. As it is emptied, it is filled back
271        //      in again from the source stream. Defined as BUFFER_BLOCKS * BLOCK_SIZE,
        //      both of which come from outside this header.
272        // ---------------------------------------------------------------------------
273        enum Constants
274        {
275                kCharBufSize        = 128
276                , kRawBufSize       = BUFFER_BLOCKS * BLOCK_SIZE
277        };
278
279        // -----------------------------------------------------------------------
280        // PARABIX SCANNING FUNCTIONS
281        //
        // The scan* templates are parameterized on the scanner type; their
        // definitions are not part of this class body.
        // -----------------------------------------------------------------------
282
283        template <class XMLScannerType>
284        bool scanDocument( XMLScannerType * const scanner );
285
286        template <class XMLScannerType>
287    void scanFirst(XMLScannerType * const scanner);
288
289        template <class XMLScannerType>
290    bool scanNext();
291
292        void refreshRawBuffer(const XMLSize_t offset);
293
294        void refreshRawBuffer(const XMLSize_t from, const XMLSize_t to);
295
        // Copies the current line and column (fCurLine, fCurCol) into the out-parameters.
296        void getCurrentLineColumn(XMLFileLoc & line, XMLFileLoc & col) const;
297
298    inline void fill(XMLBuffer & toFill);
299
300private:
301        // -----------------------------------------------------------------------
302        //  Unimplemented constructors and operators
303        // -----------------------------------------------------------------------
304        XMLReader(const XMLReader&);
305        XMLReader& operator=(const XMLReader&);
306
307        // -----------------------------------------------------------------------
308        //  Private helper methods
309        // -----------------------------------------------------------------------
310
311        void doInitCharSizeChecks();
312
313        void doInitDecode();
314
315        bool constructTranscoder();
316
317        bool constructCharacterSetAdapter();
318
319        XMLByte getNextRawByte
320        (
321                const   bool            eoiOk
322        );
323
324        void setTranscoder
325        (
326                const   XMLCh* const    newEncoding
327        );
328
329        XMLSize_t xcodeMoreChars
330        (
331                                XMLCh* const            bufToFill
332                ,       unsigned char* const    charSizes
333                , const XMLSize_t               maxChars
334        );
335
336    void handleEOL(XMLCh & curCh, bool inDecl = false);
337
338        void refreshRawBuffer();
339
340    XMLSize_t calculateRawBufIndex();
341
342        void checkForSwapped();
343
344    void normalizeEOL(XMLCh* const bufToFill, unsigned char* const charSizes, XMLSize_t & charsDone, XMLSize_t & bytesEaten);
345
346        // -----------------------------------------------------------------------
347        //  Data members
348        //
        //  NOTE(review): this list documents fCharOfsBuf, fSpareChar and
        //  fSrcOfsSupported, but no members with those names are declared below;
        //  those entries look stale and should be confirmed or removed.
        //
349        //  fCharIndex
350        //      The index into the character buffer. When this hits fCharsAvail
351        //      then it's time to refill.
352        //
353        //  fCharBuf
354        //      A buffer that the reader manager fills up with transcoded
355        //      characters a small amount at a time.
356        //
357        //  fCharsAvail
358        //      The characters currently available in the character buffer.
359        //
360        //  fCharSizeBuf
361        //      This buffer is an array that contains the number of source chars
362        //      eaten to create each char in the fCharBuf buffer. So the entry
363        //      fCharSizeBuf[x] is the number of source chars that were eaten
364        //      to make the internalized char fCharBuf[x]. This only contains
365        //      useful data if fSrcOfsSupported is true.
366        //
367        //  fCharOfsBuf
368        //      This buffer is an array that contains the offset in the
369        //      fRawByteBuf buffer of each char in the fCharBuf buffer. It
370        //      only contains useful data if fSrcOfsSupported is true.
371        //
372        //  fCurCol
373        //  fCurLine
374        //      The current line and column that we are in within this reader's
375        //      text.
376        //
377        //  fEncoding
378        //      This is the rough encoding setting. This enum is set during
379        //      construction and just tells us the rough family of encoding that
380        //      we are doing.
381        //
382        //  fEncodingStr
383        //      This is the name of the encoding we are using. It will be
384        //      provisionally set during construction, from the auto-sensed
385        //      encoding. But it might be overridden when the XMLDecl is finally
386        //      seen by the scanner. It can also be forced to a particular
387        //      encoding, in which case fForcedEncoding is set.
388        //
389        //  fForcedEncoding
390        //      If the encoding is forced then this is set and all other
391        //      information will be ignored. This encoding will be taken as
392        //      gospel. This is done by calling an alternate constructor.
393        //
394        //  fNoMore
395    //      This is set when the transcoded text is exhausted and no more can
396    //      be obtained from the fStream.
397    //
398    //  fEOF
399    //      This is set when the source input is exhausted.
400        //
401        //  fRawBufIndex
402        //      The current index into the raw byte buffer. When it's equal to
403        //      fRawBytesAvail then we need to read another buffer.
404        //
405        //  fRawByteBuf
406        //      This is the raw byte buffer that is used to spool out bytes
407        //      from into the fCharBuf buffer, as we transcode in blocks. It is
        //      declared sizeof(BytePack) bytes larger than kRawBufSize.
408        //
409        //  fRawBytesAvail
410        //      The number of bytes currently available in the raw buffer. This
411        //      helps deal with the last buffer's worth, which will usually not
412        //      be a full one.
413        //
        //  fRawBytesRead
        //      NOTE(review): undocumented in the original; presumably a count of
        //      raw bytes read from fStream -- confirm against the implementation.
        //
414        //  fReaderNum
415        //      Each reader from a particular reader manager (which means from a
416        //      particular document) is given a unique number. The reader manager
417        //      sets these numbers. They are used to catch things like partial
418        //      markup errors.
419        //
420        //  fRefFrom
421        //      This flag is provided in the ctor, and tells us if we represent
422        //      some entity being expanded inside a literal. Sometimes things
423        //      happen differently inside and outside literals.
424        //
425        //  fPublicId
426        //  fSystemId
427        //      These are the system and public ids of the source that this
428        //      reader is reading.
429        //
430        //  fSource
431        //      Indicates whether the content this reader is spooling has already
432        //      been internalized. This will prevent multiple processing of
433        //      whitespace when an already internalized entity is being spooled
434        //      out.
435        //
436        //  fSpareChar
437        //      Some encodings can create two chars in an atomic way, e.g.
438        //      surrogate pairs. We might not be able to store both, so we store
439        //      it here until the next buffer transcoding operation.
440        //
441        //  fSrcOfsBase
442        //      This is the base offset within the source of this entity. Values
443        //      in the current fCharSizeBuf array are relative to this value.
444        //
445        //  fSrcOfsSupported
446        //      This flag is set to indicate whether source byte offset info
447        //      is supported. For intrinsic encodings, it's always set since we
448        //      can always support it. For transcoder based encodings, we ask
449        //      the transcoder if it supports it or not.
450        //
451        //  fStream
452        //      This is the input stream that provides the data for the reader.
453        //      It's always treated as a raw byte stream. The derived class will
454        //      ask for buffers of text from it and will handle making some
455        //      sense of it.
456        //
457        //  fSwapped
458        //      If the encoding is one of the ones we do intrinsically, and it's
459        //      in a different byte order from our native order, then this is
460        //      set to remind us to byte swap it during transcoding.
461        //
462        //  fThrowAtEnd
463        //      Indicates whether the reader manager should throw an end of entity
464        //      exception at the end of this reader instance. This is usually
465        //      set for top level external entity references. It overrides the
466        //      reader manager's global flag that controls throwing at the end
467        //      of entities. Defaults to false.
468        //
469        //  fTranscoder
470        //      If the encoding is not one that we handle intrinsically, then
471        //      we use an external transcoder to do it. This class is an
472        //      abstraction that allows us to use pluggable external transcoding
473        //      services (via XMLTransService in util.)
474        //
        //  fCharacterSetAdapter
        //      NOTE(review): undocumented in the original; presumably built by
        //      constructCharacterSetAdapter() -- confirm its role and ownership.
        //
475        //  fType
476        //      Indicates whether this reader represents a PE or not. If this
477        //      flag is true and the fInLiteral flag is false, then we will put
478        //      out an extra space at the end.
        //      NOTE(review): there is no fInLiteral member here; this presumably
        //      refers to fRefFrom -- confirm.
479        //
480        //
481        //  fNEL
482    //      Boolean indicates if NELs and LSEPs should be recognized as EOLs
483        //
484        //  fXMLVersion
485        //      Enum to indicate if this Reader is conforming to XML 1.0 or XML 1.1
        //
        //  fMemoryManager
        //      NOTE(review): undocumented in the original; presumably the
        //      MemoryManager passed to the constructors -- confirm.
        //
        //  fParser
        //      Pointer to the parser associated with this reader (XMLParser is
        //      forward-declared above and is a friend of this class).
486        // -----------------------------------------------------------------------
487
488        XMLSize_t                   fCharIndex;
489    XMLCh                       fCharBuf[kCharBufSize + 1];
490        XMLSize_t                   fCharsAvail;
491    unsigned char               fCharSizeBuf[kCharBufSize + 1];
492        XMLFileLoc                  fCurCol;
493        XMLFileLoc                  fCurLine;
494        XMLRecognizer::Encodings    fEncoding;
495        const XMLCh *               fEncodingStr;
496        bool                        fForcedEncoding;
497        bool                        fNoMore;
498    bool                        fEOF;
499        XMLCh*                      fPublicId;
500        XMLSize_t                   fRawBufIndex;
501        XMLByte                     fRawByteBuf[kRawBufSize + sizeof(BytePack)];
502        XMLSize_t                   fRawBytesAvail;
503        XMLSize_t                   fRawBytesRead;
504        XMLSize_t                   fReaderNum;
505        RefFrom                     fRefFrom;
506        Sources                     fSource;
507        XMLFilePos                  fSrcOfsBase;
508        XMLCh*                      fSystemId;
509        BinInputStream*             fStream;
510        bool                        fSwapped;
511        bool                        fThrowAtEnd;
512        XMLTranscoder*              fTranscoder;
513        XMLCharacterSetAdapter*     fCharacterSetAdapter;
514        Types                       fType;
515        bool                        fNEL;
516        XMLVersion                  fXMLVersion;
517        MemoryManager*              fMemoryManager;
518        // NOTE(review): this comment used to say the parser is "stored as void", but the
        // member is declared as XMLParser* (a non-template forward-declared class).
519        XMLParser*                  fParser;
520};
521
522// ---------------------------------------------------------------------------
523//  XMLReader: Public, query methods
524// ---------------------------------------------------------------------------
525inline bool XMLReader::isNameChar(const XMLCh toCheck) const
526{
527    return XMLNameChar::isNameChar(toCheck);
528}
529
530inline bool XMLReader::isNCNameChar(const XMLCh toCheck) const
531{
532    return XMLNameChar::isNameChar(toCheck) && toCheck != chColon;
533}
534
535inline bool XMLReader::isFirstNameChar(const XMLCh toCheck) const
536{
537    return XMLNameChar::isNameStart(toCheck);
538}
539
540inline bool XMLReader::isFirstNCNameChar(const XMLCh toCheck) const
541{
542    return XMLNameChar::isNameStart(toCheck) && toCheck != chColon;
543}
544
545inline bool XMLReader::isXMLChar(const XMLCh toCheck) const
546{
547    return XMLNameChar::isXMLChar(toCheck, fXMLVersion == XMLReader::XMLV1_0);
548}
549
550inline bool XMLReader::isXMLLetter(const XMLCh toCheck) const
551{
552    return isNCNameChar(toCheck) && (toCheck != chUnderscore);
553}
554
555inline bool XMLReader::isWhitespace(const XMLCh toCheck) const
556{
557    return XMLNameChar::isWhitespace(toCheck, !fNEL);
558}
559
560inline bool XMLReader::isControlChar(const XMLCh toCheck) const
561{
562    return (toCheck < 0x20) && XMLNameChar::isXMLChar(toCheck, fXMLVersion == XMLReader::XMLV1_0);
563}
564
565inline void XMLReader::getCurrentLineColumn(XMLFileLoc & line, XMLFileLoc & col) const
566{
567    line = fCurLine;
568    col = fCurCol;
569}
570
571// ---------------------------------------------------------------------------
572//  XMLReader: Buffer management methods
573// ---------------------------------------------------------------------------
574inline XMLSize_t XMLReader::charsLeftInBuffer() const
575{
576        return fCharsAvail - fCharIndex;
577}
578
579// ---------------------------------------------------------------------------
580//  XMLReader: Getter methods
581// ---------------------------------------------------------------------------
582inline XMLFileLoc XMLReader::getColumnNumber() const
583{
584    return fCurCol;
585}
586
587inline const XMLCh* XMLReader::getEncodingStr() const
588{
589        return fEncodingStr;
590}
591
592inline XMLFileLoc XMLReader::getLineNumber() const
593{
594    return fCurLine;
595}
596
597inline XMLReader::XMLVersion XMLReader::getXMLVersion() const
598{
599    return fXMLVersion;
600}
601
602inline bool XMLReader::getNoMoreFlag() const
603{
604    return fNoMore;
605}
606
607inline const XMLCh* XMLReader::getPublicId() const
608{
609        return fPublicId;
610}
611
612inline XMLSize_t XMLReader::getReaderNum() const
613{
614        return fReaderNum;
615}
616
617inline XMLReader::RefFrom XMLReader::getRefFrom() const
618{
619        return fRefFrom;
620}
621
622inline XMLReader::Sources XMLReader::getSource() const
623{
624        return fSource;
625}
626
627inline const XMLCh* XMLReader::getSystemId() const
628{
629        return fSystemId;
630}
631
632inline bool XMLReader::getThrowAtEnd() const
633{
634        return fThrowAtEnd;
635}
636
637inline XMLReader::Types XMLReader::getType() const
638{
639        return fType;
640}
641
642// ---------------------------------------------------------------------------
643//  XMLReader: Setter methods
644// ---------------------------------------------------------------------------
645inline void XMLReader::setReaderNum(const XMLSize_t newNum)
646{
647        fReaderNum = newNum;
648}
649
650inline void XMLReader::setThrowAtEnd(const bool newValue)
651{
652        fThrowAtEnd = newValue;
653}
654
655inline void XMLReader::setXMLVersion(const XMLVersion version)
656{
657        fXMLVersion = version;
658    const bool NEL = (version == XMLV1_1) ? true : XMLChar1_0::enableNEL;
659    if (unlikely(NEL & !fNEL))
660    {
661        XMLSize_t avail = (fCharsAvail - fCharIndex);
662        XMLSize_t bytesEaten;
663        normalizeEOL(&fCharBuf[fCharIndex], &fCharSizeBuf[fCharIndex], avail, bytesEaten);
664        fCharsAvail = fCharIndex + avail;
665    }
666    fNEL = NEL;
667}
668
669// ---------------------------------------------------------------------------
670//  XMLReader: getNextChar() method inlined for speed
671// ---------------------------------------------------------------------------
672inline bool XMLReader::getNextChar(XMLCh& chGotten)
673{
674        //
675        //  See if there is at least a char in the buffer. Else, do the buffer
676        //  reload logic.
677        //
678    if (unlikely(fCharIndex == fCharsAvail))
679        {
680                if (unlikely(!refreshCharBuffer()))
681                        return false;
682        }
683
684        chGotten = fCharBuf[fCharIndex++];
685    handleEOL(chGotten, false);
686        return true;
687}
688
689inline void XMLReader::skipChars(const XMLSize_t charsToSkip)
690{
691    XMLCh temp;
692    for (XMLSize_t c = charsToSkip; c-- && getNextChar(temp); );
693}
694
695// ---------------------------------------------------------------------------
696//  XMLReader: peekNextChar() method inlined for speed
697// ---------------------------------------------------------------------------
698inline bool XMLReader::peekNextChar(XMLCh& chGotten)
699{
700        //  If there is something still in the buffer, get it. Else do the reload
701        //  scenario.
702        //
703    if (unlikely(fCharIndex == fCharsAvail))
704        {
705                // Try to refresh the buffer
706                if (unlikely(!refreshCharBuffer()))
707                {
708                        chGotten = chNull;
709                        return false;
710                }
711        }
712
713        chGotten = fCharBuf[fCharIndex];
714
715        //
716        //  Even though we are only peeking, we have to act the same as the
717        //  normal char get method in regards to newline normalization, though
718        //  its not as complicated as the actual character getting method's.
719        //
720        if ((chGotten == chCR || (fNEL && (chGotten == chNEL || chGotten == chLineSeparator)))
721                && (fSource == Source_External))
722                chGotten = chLF;
723
724        return true;
725}
726
727/***
728 *
729 * XML1.1
730 *
731 * 2.11 End-of-Line Handling
732 *
733 *    XML parsed entities are often stored in computer files which, for editing
734 *    convenience, are organized into lines. These lines are typically separated
735 *    by some combination of the characters CARRIAGE RETURN (#xD) and LINE FEED (#xA).
736 *
737 *    To simplify the tasks of applications, the XML processor MUST behave as if
738 *    it normalized all line breaks in external parsed entities (including the document
739 *    entity) on input, before parsing, by translating all of the following to a single
740 *    #xA character:
741 *
742 *  1. the two-character sequence #xD #xA
743 *  2. the two-character sequence #xD #x85
744 *  3. the single character #x85
745 *  4. the single character #x2028
746 *  5. any #xD character that is not immediately followed by #xA or #x85.
747 *
748 *
749 ***/
750
// Performs end-of-line normalization in place on a run of transcoded characters.
//
//  bufToFill   The characters to normalize; modified in place.
//  charSizes   Parallel array of per-character sizes; when a character is
//              removed, its size is added to the preceding entry.
//  charsDone   In/out: number of characters in bufToFill. Reduced when a
//              trailing CR is pushed back or when characters are removed.
//  bytesEaten  In/out: reduced by the size of a pushed-back trailing CR.
//
// Which rule set is applied depends on fNEL: when it is false only CR and
// CR LF are handled; when it is true CR NEL, NEL and the line separator are
// handled as well.
751inline void XMLReader::normalizeEOL(XMLCh* const bufToFill, unsigned char* const charSizes, XMLSize_t & charsDone, XMLSize_t & bytesEaten)
752{
    // collects the positions of characters that must be removed afterwards
753    IntAlignedBitVector deletedAnyCharacters;
754    // do end-of-line normalization on the transcoded input data
    // (crItr is presumably positioned on each chCR in the first charsDone chars -- confirm in XMLChIterator)
755    XMLChIterator<chCR> crItr(bufToFill, charsDone);
756
757    size_t pos = 0;
758
759    if (likely(!fNEL))
760    {
761        // do XML 1.0 end of line normalization
762        while (unlikely(crItr.next()))
763        {
764            pos = crItr.pos();
            // overwrite the CR with LF; pos now refers to the following character
765            bufToFill[pos++] = chLF;
766            // did this buffer end with a CR?
            // NOTE(review): the end is tested against kCharBufSize rather than charsDone;
            // confirm this is intended when the run is shorter than a full buffer.
767            if (unlikely(pos == kCharBufSize))
768            {
769                // push back the CR just to be sure that we arent dealing with a CRLF
770                bytesEaten -= charSizes[--charsDone];
771                DEBUG_MESSAGE(" -- pushback(CR)")
772                break;
773            }
774            if (unlikely(bufToFill[pos] == chLF))
775            {
776                // if we found a {CR,LF}, then mark the LF for deletion
777                deletedAnyCharacters.insert(pos);
778            }
779        }
780    }
781    else // if (fScanner.getXMLVersion() == XMLReader::XMLV1_1) // can only be this; would be a decl error otherwise
782    {
783        // do XML 1.1 specific end of line normalization
784        while (unlikely(crItr.next()))
785        {
786            pos = crItr.pos();
            // overwrite the CR with LF; pos now refers to the following character
787            bufToFill[pos++] = chLF;
788            // did this buffer end with a CR?
789            if (unlikely(pos == kCharBufSize))
790            {
791                // push back the CR just to be sure that we arent dealing with a CRLF
792                bytesEaten -= charSizes[--charsDone];
793                DEBUG_MESSAGE(" -- pushback(CR)")
794                break;
795            }
796            if (unlikely((bufToFill[pos] == chLF) || (bufToFill[pos] == chNEL)))
797            {
798                // if we found a {CR,LF} or a {CR,NEL}, then mark the LF/NEL for deletion
799                deletedAnyCharacters.insert(pos);
800            }
801        }
802
        // every remaining NEL or line separator becomes a plain LF
803        XMLChIterator2<chNEL, chLineSeparator> nonLfItr(bufToFill, charsDone);
804        while (unlikely(nonLfItr.next()))
805        {
806            bufToFill[nonLfItr.pos()] = chLF;
807        }
808    }
809
810    // remove all deleted characters from the semi-final production
811    if (unlikely(bitblock::any(deletedAnyCharacters._b)))
812    {
813        XMLStreamIterator itr(deletedAnyCharacters._b);
814        // get the position of the first deleted character
815        itr.next();
        // outputPos is the write cursor; lastPos is the first kept character after a deletion
816        size_t outputPos = itr.pos();
817        size_t lastPos;
818        // now scan for any other deleted characters
        // (lastPos is assigned in the loop condition before itr advances)
819        for (; lastPos = itr.pos() + 1, itr.next(); )
820        {
            // number of kept characters between the previous deletion and this one
821            const size_t len = itr.pos() - lastPos;
            // fold the deleted character's size into the character before it
822            charSizes[outputPos - 1] += charSizes[lastPos - 1];
823            Array<unsigned char>::move(&charSizes[lastPos], &charSizes[outputPos], len);
824
825            Array<XMLCh>::move(&bufToFill[lastPos], &bufToFill[outputPos], len);
826
827            outputPos += len;
828        }
829        // no more deleted characters; just append the remainder of the string
830        const size_t len = charsDone - lastPos;
831        charSizes[outputPos - 1] += charSizes[lastPos - 1];
832        Array<unsigned char>::move(&charSizes[lastPos], &charSizes[outputPos], len);
833        Array<XMLCh>::move(&bufToFill[lastPos], &bufToFill[outputPos], len);               
        // report the shortened length back to the caller
834        charsDone = outputPos + len;
835    }
836}
837
838inline XMLSize_t XMLReader::calculateRawBufIndex()
839{
840    // See if we have any existing chars.
841    XMLSize_t spareChars = fCharsAvail - fCharIndex;
842    XMLSize_t rawBufIndex = fRawBufIndex;
843    //
844    //  If there are spare chars, then adjust the raw buffer index
845    //
846    if (spareChars)
847    {
848        XMLSize_t offset = fCharSizeBuf[fCharIndex];
849        for (XMLSize_t index = fCharIndex + 1; --spareChars; index++)
850        {
851            offset += fCharSizeBuf[index];
852        }
853        rawBufIndex -= offset;
854    }
855    return rawBufIndex;
856}
857
858// ---------------------------------------------------------------------------------------
859
860XERCES_CPP_NAMESPACE_END
861
862#include <icxmlc/XMLParserImpl.hpp>
863
864XERCES_CPP_NAMESPACE_BEGIN
865
866// ---------------------------------------------------------------------------------------
867
868template<class XMLScannerType>
869bool
870XMLReader::scanDocument( XMLScannerType * const scanner )
871{
872    DEBUG_MESSAGE("XMLReader::scanDocument()")
873
874    // construct the actual XMLParser object to parse the ELEMENT and MISC nodes
875    XMLParserImpl<XMLScannerType> parser(*this, *scanner, fMemoryManager);
876
877    //  Scan the PROLOG part, which is everything before the root element
878        //  including the DTD subsets.
879        scanner->scanProlog();
880
881    // now construct the character set adapter
882    if (unlikely(!constructCharacterSetAdapter()))
883    {
884        scanner->emitError(XMLErrs::BadXMLEncoding, fEncodingStr);
885        return 0;
886    }
887
888    refreshRawBuffer(calculateRawBufIndex(), 0);
889
890    parser.init(fCharacterSetAdapter, fTranscoder, fCurLine, fCurCol, fXMLVersion);
891
892        fParser = (XMLParser*)&parser;
893
894        CALLGRIND_START_INSTRUMENTATION;
895
896    DEBUG_TRANSITION_MESSAGE("######################################################################");
897    DEBUG_TRANSITION_MESSAGE("SCANNING ELEMENT(S)")
898    DEBUG_TRANSITION_MESSAGE("######################################################################");
899
900    for (;;)
901    {
902        // transform the new raw data into symbol and content streams
903        parser.preScanDocumentPage(fRawByteBuf, fRawBytesAvail, fEOF, this);
904
905        parser.buildElementPage();
906
907        if (unlikely(!parser.scanElementPage())) break;
908
909        parser.prepareForNextDocumentPage();
910    }
911
912    DEBUG_TRANSITION_MESSAGE("######################################################################");
913    DEBUG_TRANSITION_MESSAGE("SCANNING MISCELLANEOUS")
914    DEBUG_TRANSITION_MESSAGE("######################################################################");
915
916    while (unlikely(parser.scanMiscellaneousPage()))
917    {
918        parser.prepareForNextDocumentPage();
919
920        // transform the new raw data into symbol and content streams
921        parser.preScanDocumentPage(fRawByteBuf, fRawBytesAvail, fEOF, this);
922    }
923
924
925        CALLGRIND_STOP_INSTRUMENTATION;
926        CALLGRIND_DUMP_STATS;
927
928        fNoMore = true;
929
930        fParser = 0;
931
932        return 1;
933}
934
935// -------------------------------------------------------------------------------------
936
937template <class XMLScannerType>
938IDISA_ALWAYS_INLINE
939void XMLReader::scanFirst( XMLScannerType * const scanner )
940{
941    DEBUG_MESSAGE("XMLReader::scanFirst()")
942
943    // construct the actual XMLParser object to parse the ELEMENT and MISC nodes
944    XMLParserImpl<XMLScannerType> * parser = new XMLParserImpl<XMLScannerType>(*this, *scanner, fMemoryManager);
945
946    //  Scan the PROLOG part, which is everything before the root element
947        //  including the DTD subsets.
948        scanner->scanProlog();
949
950        // now construct the character set adapter
951        if (unlikely(!constructCharacterSetAdapter()))
952        {
953                scanner->emitError(XMLErrs::BadXMLEncoding, fEncodingStr);
954        return;
955        }
956
957    refreshRawBuffer(calculateRawBufIndex(), 0);
958
959    parser->init(fCharacterSetAdapter, fTranscoder, fCurLine, fCurCol, fXMLVersion);
960
961    fParser = reinterpret_cast<XMLParser*>(parser);
962
963        // transform the new raw data into symbol and content streams
964    parser->preScanDocumentPage(fRawByteBuf, fRawBytesAvail, fEOF, this);
965
966    parser->buildElementPage();
967}
968
969// -------------------------------------------------------------------------------------
970
971template <class XMLScannerType>
972IDISA_ALWAYS_INLINE
973bool XMLReader::scanNext()
974{
975    XMLParserImpl<XMLScannerType> * const parser = reinterpret_cast<XMLParserImpl<XMLScannerType>*>(fParser);
976
977    if (likely(parser->scanNext()))
978    {
979        return true;
980    }
981
982    if (likely(parser->inElement()))
983    {
984        DEBUG_MESSAGE(" **** CONTINUE ELEMENT SPAN **** ")
985
986        parser->prepareForNextDocumentPage();
987
988        parser->preScanDocumentPage(fRawByteBuf, fRawBytesAvail, fEOF, this);
989
990        parser->buildElementPage();
991
992        if (likely(parser->scanNext()))
993        {
994            return true;
995        }
996    }
997
998    DEBUG_MESSAGE(" **** BEGIN MISCELLANEOUS SPAN **** ")
999
1000    while (unlikely(parser->scanMiscellaneousPage()))
1001    {
1002        // if there is no new data; we're done (and the file is likely in error)
1003        if (likely(fEOF)) break;
1004
1005        parser->prepareForNextDocumentPage();
1006
1007        // transform the new raw data into symbol and content streams
1008        parser->preScanDocumentPage(fRawByteBuf, fRawBytesAvail, fEOF, this);
1009    }
1010
1011    delete fParser;
1012    fParser = 0;
1013    fNoMore = true;
1014    return 0;
1015}
1016
1017void XMLReader::fill(XMLBuffer & toFill)
1018{
1019    DEBUG_MESSAGE("XMLReader::fill(toFill) fCharIndex=" << fCharIndex << ", fCharsAvail=" << fCharsAvail)
1020
1021    do
1022    {
1023        toFill.append(&fCharBuf[fCharIndex], fCharsAvail - fCharIndex);
1024        fCharIndex = fCharsAvail;
1025    }
1026    while (refreshCharBuffer());
1027}
1028
1029// ---------------------------------------------------------------------------
1030// DEPRECATED FUNCTIONS
1031// ---------------------------------------------------------------------------
1032inline void XMLReader::movePlainContentChars(XMLBuffer &)
1033{
1034        DEPRECATED_FEATURE_IN_ICXML;
1035}
1036
1037inline bool XMLReader::getNextCharIfNot(const XMLCh, XMLCh &)
1038{
1039        DEPRECATED_FEATURE_IN_ICXML;
1040}
1041
1042inline bool XMLReader::isPlainContentChar(const XMLCh toCheck) const
1043{
1044    DEPRECATED_FEATURE_IN_ICXML;
1045}
1046
1047inline bool XMLReader::isSpecialStartTagChar(const XMLCh toCheck) const
1048{
1049    DEPRECATED_FEATURE_IN_ICXML;
1050}
1051
1052#undef CHAR_SIZE
1053#undef SYMBOL_POSITION
1054#undef SYMBOL_ARRAY_SIZE
1055#undef CONSTRUCT_CHARACTER_SET_ADAPTER
1056
1057XERCES_CPP_NAMESPACE_END
1058
1059#endif
Note: See TracBrowser for help on using the repository browser.