source: icXML/icXML-devel/src/icxercesc/internal/XMLReader.hpp @ 3150

Last change on this file since 3150 was 3150, checked in by cameron, 6 years ago

Updates for various icxercesc modified files.

File size: 35.0 KB
Line 
1/*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements.  See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License.  You may obtain a copy of the License at
8 *
9 *      http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18/*
19 * $Id: XMLReader.hpp 833045 2009-11-05 13:21:27Z borisk $
20 */
21
22#if !defined(XERCESC_INCLUDE_GUARD_XMLREADER_HPP)
23#define XERCESC_INCLUDE_GUARD_XMLREADER_HPP
24
25#include <icxercesc/util/XMLChar.hpp>
26#include <xercesc/framework/XMLErrorCodes.hpp>
27#include <icxercesc/framework/XMLRecognizer.hpp>
28#include <icxercesc/framework/XMLBuffer.hpp>
29#include <xercesc/util/TranscodingException.hpp>
30#include <icxercesc/util/TransService.hpp>
31#include <icxercesc/framework/XMLElementDecl.hpp>
32#include <xercesc/util/BinInputStream.hpp>
33#include <icxercesc/util/TransService.hpp>
34#include <icxmlc/XMLConfig.hpp>
35#include <icxmlc/Array.hpp>
36#include <icxmlc/XMLScanIterator.hpp>
37#include <icxmlc/XMLStreamIterator.hpp>
38#include <iostream>
39
40// WARNING: callgrind requires that valgrind is installed on your system. This include is not
41// necessary to use callgrind.
42#ifdef ENABLE_CALLGRIND_PROFILING
43#include <valgrind/callgrind.h>
44#else
45#define CALLGRIND_START_INSTRUMENTATION
46#define CALLGRIND_STOP_INSTRUMENTATION
47#define CALLGRIND_DUMP_STATS
48#endif
49
50XERCES_CPP_NAMESPACE_BEGIN
51
52class ReaderMgr;
53class InputSource;
54class XMLParser;
55class XMLReferenceTable;
56template <typename XMLScannerType> class XMLParserImpl;
57
58// -----------------------------------------------------------------------------------------------
59
60#if !defined(XERCESC_INCLUDE_GUARD_BININPUTSTREAM_HPP)
61class BinInputStream;
62#endif
63#if !defined(XERCESC_INCLUDE_GUARD_XMLSCANNER_HPP)
64class XMLScanner;
65#endif
66
67// ---------------------------------------------------------------------------
68//  Instances of this class are used to manage the content of entities. The
69//  scanner maintains a stack of these, one for each entity (this means entity
70//  in the sense of any parsed file or internal entity) currently being
71//  scanned. This class, given a binary input stream will handle reading in
72//  the data and decoding it from its external encoding into the internal
73//  Unicode format. Once internalized, this class provides the access
74//  methods to read in the data in various ways, maintains line and column
75//  information, and provides high performance character attribute checking
76//  methods.
77//
78//  This is NOT to be derived from.
79//
80// ---------------------------------------------------------------------------
81
82class XMLPARSER_EXPORT XMLReader : public XMemory
83{
84        friend class ReaderMgr;
85    friend class XMLReferenceTable;
86    friend class XMLParser;
87
88public:
89        // -----------------------------------------------------------------------
90        //  Public types
91        // -----------------------------------------------------------------------
92        enum Types
93        {
94                Type_PE
95                , Type_General
96        };
97
98        enum Sources
99        {
100                Source_Internal
101                , Source_External
102        };
103
104        enum RefFrom
105        {
106                RefFrom_Literal
107                , RefFrom_NonLiteral
108        };
109
110    enum XMLVersion
111    {
112        XMLV1_0
113        , XMLV1_1
114        , XMLV_Unknown
115    };
116
117        // -----------------------------------------------------------------------
118        //  Public, query methods
119        // -----------------------------------------------------------------------
120        bool isAllSpaces
121        (
122                const   XMLCh* const    toCheck
123                , const XMLSize_t       count
124        ) const;
125
126        bool containsWhiteSpace
127        (
128                const   XMLCh* const    toCheck
129                , const XMLSize_t       count
130        ) const;
131
132        bool isXMLLetter(const XMLCh toCheck) const;
133        bool isFirstNameChar(const XMLCh toCheck) const;
134        bool isNameChar(const XMLCh toCheck) const;
135        bool isPlainContentChar(const XMLCh toCheck) const;
136        bool isSpecialStartTagChar(const XMLCh toCheck) const;
137        bool isXMLChar(const XMLCh toCheck) const;
138        bool isWhitespace(const XMLCh toCheck) const;
139        bool isControlChar(const XMLCh toCheck) const;
140        bool isPublicIdChar(const XMLCh toCheck) const;
141        bool isFirstNCNameChar(const XMLCh toCheck) const;
142        bool isNCNameChar(const XMLCh toCheck) const;
143
144        // -----------------------------------------------------------------------
145        //  Constructors and Destructor
146        // -----------------------------------------------------------------------
147        XMLReader
148        (
149                const   XMLCh* const          pubId
150                , const XMLCh* const          sysId
151                ,       BinInputStream* const streamToAdopt
152                , const RefFrom               from
153                , const Types                 type
154                , const Sources               source
155                , const bool                  throwAtEnd
156                , const bool                  calculateSrcOfs
157                ,       XMLSize_t             lowWaterMark
158                , const XMLVersion            xmlVersion
159                ,       MemoryManager* const  manager
160        );
161
162        XMLReader
163        (
164                const   XMLCh* const          pubId
165                , const XMLCh* const          sysId
166                ,       BinInputStream* const streamToAdopt
167                , const XMLCh* const          encodingStr
168                , const RefFrom               from
169                , const Types                 type
170                , const Sources               source
171                , const bool                  throwAtEnd
172                , const bool                  calculateSrcOfs
173                ,       XMLSize_t             lowWaterMark
174                , const XMLVersion            xmlVersion
175                ,       MemoryManager* const  manager
176        );
177
178        XMLReader
179        (
180                const   XMLCh* const          pubId
181                , const XMLCh* const          sysId
182                ,       BinInputStream* const streamToAdopt
183                , XMLRecognizer::Encodings    encodingEnum
184                , const RefFrom               from
185                , const Types                 type
186                , const Sources               source
187                , const bool                  throwAtEnd
188                , const bool                  calculateSrcOfs
189                ,       XMLSize_t             lowWaterMark
190                , const XMLVersion            xmlVersion
191                ,       MemoryManager* const  manager
192        );
193
194        ~XMLReader();
195
196
197        // -----------------------------------------------------------------------
198        //  Character buffer management methods
199        // -----------------------------------------------------------------------
200        XMLSize_t charsLeftInBuffer() const;
201        bool refreshCharBuffer();
202
203        // -----------------------------------------------------------------------
204        //  Scanning methods
205        // -----------------------------------------------------------------------
206        bool getName(XMLBuffer& toFill, const bool token);
207        bool getQName(XMLBuffer& toFill, int* colonPosition);
208        bool getNCName(XMLBuffer& toFill);
209        bool getNextChar(XMLCh& chGotten);
210        bool getNextCharIfNot(const XMLCh chNotToGet, XMLCh& chGotten);
211        void movePlainContentChars(XMLBuffer &dest);
212        bool getSpaces(XMLBuffer& toFill);
213        bool getUpToCharOrWS(XMLBuffer& toFill, const XMLCh toCheck);
214        bool peekNextChar(XMLCh& chGotten);
215        bool skipIfQuote(XMLCh& chGotten);
216        bool skipSpaces(bool& skippedSomething, bool inDecl = false);
217        bool skippedChar(const XMLCh toSkip);
218    void skipChars(const XMLSize_t charsToSkip);
219        bool skippedSpace();
220        bool skippedString(const XMLCh* const toSkip);   
221        bool skippedStringLong(const XMLCh* toSkip);
222    const XMLCh * peekString(XMLSize_t length);
223        bool peekString(const XMLCh* const toPeek);
224    bool peekString(const XMLCh* const toPeek, XMLSize_t length, bool mustBeFollowedByWhitespace = false);
225
226
227        // -----------------------------------------------------------------------
228        //  Getter methods
229        // -----------------------------------------------------------------------
230        XMLFileLoc getColumnNumber() const;
231        const XMLCh* getEncodingStr() const;
232        XMLFileLoc getLineNumber() const;
233    XMLVersion getXMLVersion() const;
234        bool getNoMoreFlag() const;
235        const XMLCh* getPublicId() const;
236        XMLSize_t getReaderNum() const;
237        RefFrom getRefFrom() const;
238        Sources getSource() const;
239        XMLFilePos getSrcOffset() const;
240        const XMLCh* getSystemId() const;
241        bool getThrowAtEnd() const;
242        Types getType() const;
243
244
245        // -----------------------------------------------------------------------
246        //  Setter methods
247        // -----------------------------------------------------------------------
248        bool setEncoding
249        (
250                const   XMLCh* const    newEncoding
251        );
252        void setReaderNum(const XMLSize_t newNum);
253        void setThrowAtEnd(const bool newValue);
254        void setXMLVersion(const XMLVersion version);
255
256        // ---------------------------------------------------------------------------
257        //  Class Constants
258        //
259        //  kCharBufSize
260        //      The size of the character spool buffer that we use. Its not terribly
261        //      large because its just getting filled with data from a raw byte
262        //      buffer as we go along. We don't want to decode all the text at
263        //      once before we find out that there is an error.
264        //
265        //      NOTE: This is a size in characters, not bytes.
266        //
267        //  kRawBufSize
268        //      The size of the raw buffer from which raw bytes are spooled out
269        //      as we transcode chunks of data. As it is emptied, it is filled back
270        //      in again from the source stream.
271        // ---------------------------------------------------------------------------
272        enum Constants
273        {
274                kCharBufSize        = 128
275                , kRawBufSize       = BUFFER_BLOCKS * BLOCK_SIZE
276        };
277
278        // -----------------------------------------------------------------------
279        // PARABIX SCANNING FUNCTIONS
280        // -----------------------------------------------------------------------
281
282        template <class XMLScannerType>
283        bool scanDocument( XMLScannerType * const scanner );
284
285        template <class XMLScannerType>
286    void scanFirst(XMLScannerType * const scanner);
287
288        template <class XMLScannerType>
289    bool scanNext();
290
291        void refreshRawBuffer(const XMLSize_t offset);
292
293        void refreshRawBuffer(const XMLSize_t from, const XMLSize_t to);
294
295        void getCurrentLineColumn(XMLFileLoc & line, XMLFileLoc & col) const;
296
297    inline void fill(XMLBuffer & toFill);
298
299private:
300        // -----------------------------------------------------------------------
301        //  Unimplemented constructors and operators
302        // -----------------------------------------------------------------------
303        XMLReader(const XMLReader&);
304        XMLReader& operator=(const XMLReader&);
305
306        // -----------------------------------------------------------------------
307        //  Private helper methods
308        // -----------------------------------------------------------------------
309
310        void doInitCharSizeChecks();
311
312        void doInitDecode();
313
314        bool constructTranscoder();
315
316        bool constructCharacterSetAdapter();
317
318        XMLByte getNextRawByte
319        (
320                const   bool            eoiOk
321        );
322
323        void setTranscoder
324        (
325                const   XMLCh* const    newEncoding
326        );
327
328        XMLSize_t xcodeMoreChars
329        (
330                                XMLCh* const            bufToFill
331                ,       unsigned char* const    charSizes
332                , const XMLSize_t               maxChars
333        );
334
335    void handleEOL(XMLCh & curCh, bool inDecl = false);
336
337        void refreshRawBuffer();
338
339    XMLSize_t calculateRawBufIndex();
340
341        void checkForSwapped();
342
343    void normalizeEOL(XMLCh* const bufToFill, unsigned char* const charSizes, XMLSize_t & charsDone, XMLSize_t & bytesEaten);
344
345        // -----------------------------------------------------------------------
346        //  Data members
347        //
348        //  fCharIndex
349        //      The index into the character buffer. When this hits fCharsAvail
350        //      then its time to refill.
351        //
352        //  fCharBuf
353        //      A buffer that the reader manager fills up with transcoded
354        //      characters a small amount at a time.
355        //
356        //  fCharsAvail
357        //      The characters currently available in the character buffer.
358        //
359        //  fCharSizeBuf
360        //      This buffer is an array that contains the number of source chars
361        //      eaten to create each char in the fCharBuf buffer. So the entry
362        //      fCharSizeBuf[x] is the number of source chars that were eaten
363        //      to make the internalized char fCharBuf[x]. This only contains
364        //      useful data if fSrcOfsSupported is true.
365        //
366        //  fCharOfsBuf
367        //      This buffer is an array that contains the offset in the
368        //      fRawByteBuf buffer of each char in the fCharBuf buffer. It
369        //      only contains useful data if fSrcOfsSupported is true.
370        //
371        //  fCurCol
372        //  fCurLine
373        //      The current line and column that we are in within this reader's
374        //      text.
375        //
376        //  fEncoding
377        //      This is the rough encoding setting. This enum is set during
378        //      construction and just tells us the rough family of encoding that
379        //      we are doing.
380        //
381        //  fEncodingStr
382        //      This is the name of the encoding we are using. It will be
383        //      provisionally set during construction, from the auto-sensed
384        //      encoding. But it might be overridden when the XMLDecl is finally
385        //      seen by the scanner. It can also be forced to a particular
386        //      encoding, in which case fForcedEncoding is set.
387        //
388        //  fForcedEncoding
389        //      If the encoding if forced then this is set and all other
390        //      information will be ignored. This encoding will be taken as
391        //      gospel. This is done by calling an alternate constructor.
392        //
393        //  fNoMore
394    //      This is set when the transcoded text is exhausted and no more can
395    //      be obtained from the fStream.
396    //
397    //  fEOF
398    //      This is set when the source input is exhausted.
399        //
400        //  fRawBufIndex
401        //      The current index into the raw byte buffer. When its equal to
402        //      fRawBytesAvail then we need to read another buffer.
403        //
404        //  fRawByteBuf
405        //      This is the raw byte buffer that is used to spool out bytes
406        //      from into the fCharBuf buffer, as we transcode in blocks.
407        //
408        //  fRawBytesAvail
409        //      The number of bytes currently available in the raw buffer. This
410        //      helps deal with the last buffer's worth, which will usually not
411        //      be a full one.
412        //
413        //  fReaderNum
414        //      Each reader from a particular reader manager (which means from a
415        //      particular document) is given a unique number. The reader manager
416        //      sets these numbers. They are used to catch things like partial
417        //      markup errors.
418        //
419        //  fRefFrom
420        //      This flag is provided in the ctor, and tells us if we represent
421        //      some entity being expanded inside a literal. Sometimes things
422        //      happen differently inside and outside literals.
423        //
424        //  fPublicId
425        //  fSystemId
426        //      These are the system and public ids of the source that this
427        //      reader is reading.
428        //
429        //  fSource
430        //      Indicates whether the content this reader is spooling as already
431        //      been internalized. This will prevent multiple processing of
432        //      whitespace when an already internalized entity is being spooled
433        //      out.
434        //
435        //  fSpareChar
436        //      Some encodings can create two chars in an atomic way, e.g.
437        //      surrogate pairs. We might not be able to store both, so we store
438        //      it here until the next buffer transcoding operation.
439        //
440        //  fSrcOfsBase
441        //      This is the base offset within the source of this entity. Values
442        //      in the curent fCharSizeBuf array are relative to this value.
443        //
444        //  fSrcOfsSupported
445        //      This flag is set to indicate whether source byte offset info
446        //      is supported. For intrinsic encodings, its always set since we
447        //      can always support it. For transcoder based encodings, we ask
448        //      the transcoder if it supports it or not.
449        //
450        //  fStream
451        //      This is the input stream that provides the data for the reader.
452        //      Its always treated as a raw byte stream. The derived class will
453        //      ask for buffers of text from it and will handle making some
454        //      sense of it.
455        //
456        //  fSwapped
457        //      If the encoding is one of the ones we do intrinsically, and its
458        //      in a different byte order from our native order, then this is
459        //      set to remind us to byte swap it during transcoding.
460        //
461        //  fThrowAtEnd
462        //      Indicates whether the reader manager should throw an end of entity
463        //      exception at the end of this reader instance. This is usually
464        //      set for top level external entity references. It overrides the
465        //      reader manager's global flag that controls throwing at the end
466        //      of entities. Defaults to false.
467        //
468        //  fTranscoder
469        //      If the encoding is not one that we handle intrinsically, then
470        //      we use an an external transcoder to do it. This class is an
471        //      abstraction that allows us to use pluggable external transcoding
472        //      services (via XMLTransService in util.)
473        //
474        //  fType
475        //      Indicates whether this reader represents a PE or not. If this
476        //      flag is true and the fInLiteral flag is false, then we will put
477        //      out an extra space at the end.
478        //
479        //
480        //  fNEL
481    //      Boolean indicates if NELs and LSEPs should be recognized as EOLs
482        //
483        //  fXMLVersion
484        //      Enum to indicate if this Reader is conforming to XML 1.0 or XML 1.1
485        // -----------------------------------------------------------------------
486
487        XMLSize_t                   fCharIndex;
488    XMLCh                       fCharBuf[kCharBufSize + 1];
489        XMLSize_t                   fCharsAvail;
490    unsigned char               fCharSizeBuf[kCharBufSize + 1];
491        XMLFileLoc                  fCurCol;
492        XMLFileLoc                  fCurLine;
493        XMLRecognizer::Encodings    fEncoding;
494        const XMLCh *               fEncodingStr;
495        bool                        fForcedEncoding;
496        bool                        fNoMore;
497    bool                        fEOF;
498        XMLCh*                      fPublicId;
499        XMLSize_t                   fRawBufIndex;
500        XMLByte                     fRawByteBuf[kRawBufSize + sizeof(BytePack)];
501        XMLSize_t                   fRawBytesAvail;
502        XMLSize_t                   fRawBytesRead;
503        XMLSize_t                   fReaderNum;
504        RefFrom                     fRefFrom;
505        Sources                     fSource;
506        XMLFilePos                  fSrcOfsBase;
507        XMLCh*                      fSystemId;
508        BinInputStream*             fStream;
509        bool                        fSwapped;
510        bool                        fThrowAtEnd;
511        XMLTranscoder*              fTranscoder;
512        XMLCharacterSetAdapter*     fCharacterSetAdapter;
513        Types                       fType;
514        bool                        fNEL;
515        XMLVersion                  fXMLVersion;
516        MemoryManager*              fMemoryManager;
517        // the internal parser is stored as void to bypass the fact the parser is templated
518        XMLParser*                  fParser;
519};
520
521// ---------------------------------------------------------------------------
522//  XMLReader: Public, query methods
523// ---------------------------------------------------------------------------
524inline bool XMLReader::isNameChar(const XMLCh toCheck) const
525{
526    return XMLNameChar::isNameChar(toCheck);
527}
528
529inline bool XMLReader::isNCNameChar(const XMLCh toCheck) const
530{
531    return XMLNameChar::isNameChar(toCheck) && toCheck != chColon;
532}
533
534inline bool XMLReader::isFirstNameChar(const XMLCh toCheck) const
535{
536    return XMLNameChar::isNameStart(toCheck);
537}
538
539inline bool XMLReader::isFirstNCNameChar(const XMLCh toCheck) const
540{
541    return XMLNameChar::isNameStart(toCheck) && toCheck != chColon;
542}
543
544inline bool XMLReader::isXMLChar(const XMLCh toCheck) const
545{
546    return XMLNameChar::isXMLChar(toCheck, fXMLVersion == XMLReader::XMLV1_0);
547}
548
549inline bool XMLReader::isXMLLetter(const XMLCh toCheck) const
550{
551    return isNCNameChar(toCheck) && (toCheck != chUnderscore);
552}
553
554inline bool XMLReader::isWhitespace(const XMLCh toCheck) const
555{
556    return XMLNameChar::isWhitespace(toCheck, !fNEL);
557}
558
559inline bool XMLReader::isControlChar(const XMLCh toCheck) const
560{
561    return (toCheck < 0x20) && XMLNameChar::isXMLChar(toCheck, fXMLVersion == XMLReader::XMLV1_0);
562}
563
564// ---------------------------------------------------------------------------
565//  XMLReader: Buffer management methods
566// ---------------------------------------------------------------------------
567inline XMLSize_t XMLReader::charsLeftInBuffer() const
568{
569        return fCharsAvail - fCharIndex;
570}
571
572// ---------------------------------------------------------------------------
573//  XMLReader: Getter methods
574// ---------------------------------------------------------------------------
575inline XMLFileLoc XMLReader::getColumnNumber() const
576{
577    return fCurCol;
578}
579
580inline const XMLCh* XMLReader::getEncodingStr() const
581{
582        return fEncodingStr;
583}
584
585inline XMLFileLoc XMLReader::getLineNumber() const
586{
587    return fCurLine;
588}
589
590inline XMLReader::XMLVersion XMLReader::getXMLVersion() const
591{
592    return fXMLVersion;
593}
594
595inline bool XMLReader::getNoMoreFlag() const
596{
597    return fNoMore;
598}
599
600inline const XMLCh* XMLReader::getPublicId() const
601{
602        return fPublicId;
603}
604
605inline XMLSize_t XMLReader::getReaderNum() const
606{
607        return fReaderNum;
608}
609
610inline XMLReader::RefFrom XMLReader::getRefFrom() const
611{
612        return fRefFrom;
613}
614
615inline XMLReader::Sources XMLReader::getSource() const
616{
617        return fSource;
618}
619
620inline const XMLCh* XMLReader::getSystemId() const
621{
622        return fSystemId;
623}
624
625inline bool XMLReader::getThrowAtEnd() const
626{
627        return fThrowAtEnd;
628}
629
630inline XMLReader::Types XMLReader::getType() const
631{
632        return fType;
633}
634
635// ---------------------------------------------------------------------------
636//  XMLReader: Setter methods
637// ---------------------------------------------------------------------------
638inline void XMLReader::setReaderNum(const XMLSize_t newNum)
639{
640        fReaderNum = newNum;
641}
642
643inline void XMLReader::setThrowAtEnd(const bool newValue)
644{
645        fThrowAtEnd = newValue;
646}
647
648inline void XMLReader::setXMLVersion(const XMLVersion version)
649{
650        fXMLVersion = version;
651    const bool NEL = (version == XMLV1_1) ? true : XMLChar1_0::enableNEL;
652    if (unlikely(NEL & !fNEL))
653    {
654        XMLSize_t avail = (fCharsAvail - fCharIndex);
655        XMLSize_t bytesEaten;
656        normalizeEOL(&fCharBuf[fCharIndex], &fCharSizeBuf[fCharIndex], avail, bytesEaten);
657        fCharsAvail = fCharIndex + avail;
658    }
659    fNEL = NEL;
660}
661
662// ---------------------------------------------------------------------------
663//  XMLReader: getNextChar() method inlined for speed
664// ---------------------------------------------------------------------------
665inline bool XMLReader::getNextChar(XMLCh& chGotten)
666{
667        //
668        //  See if there is at least a char in the buffer. Else, do the buffer
669        //  reload logic.
670        //
671    if (unlikely(fCharIndex == fCharsAvail))
672        {
673                if (unlikely(!refreshCharBuffer()))
674                        return false;
675        }
676
677        chGotten = fCharBuf[fCharIndex++];
678    handleEOL(chGotten, false);
679        return true;
680}
681
682inline void XMLReader::skipChars(const XMLSize_t charsToSkip)
683{
684    XMLCh temp;
685    for (XMLSize_t c = charsToSkip; c-- && getNextChar(temp); );
686}
687
688// ---------------------------------------------------------------------------
689//  XMLReader: peekNextChar() method inlined for speed
690// ---------------------------------------------------------------------------
691inline bool XMLReader::peekNextChar(XMLCh& chGotten)
692{
693        //  If there is something still in the buffer, get it. Else do the reload
694        //  scenario.
695        //
696    if (unlikely(fCharIndex == fCharsAvail))
697        {
698                // Try to refresh the buffer
699                if (unlikely(!refreshCharBuffer()))
700                {
701                        chGotten = chNull;
702                        return false;
703                }
704        }
705
706        chGotten = fCharBuf[fCharIndex];
707
708        //
709        //  Even though we are only peeking, we have to act the same as the
710        //  normal char get method in regards to newline normalization, though
711        //  its not as complicated as the actual character getting method's.
712        //
713        if ((chGotten == chCR || (fNEL && (chGotten == chNEL || chGotten == chLineSeparator)))
714                && (fSource == Source_External))
715                chGotten = chLF;
716
717        return true;
718}
719
720/***
721 *
722 * XML1.1
723 *
724 * 2.11 End-of-Line Handling
725 *
726 *    XML parsed entities are often stored in computer files which, for editing
727 *    convenience, are organized into lines. These lines are typically separated
728 *    by some combination of the characters CARRIAGE RETURN (#xD) and LINE FEED (#xA).
729 *
730 *    To simplify the tasks of applications, the XML processor MUST behave as if
731 *    it normalized all line breaks in external parsed entities (including the document
732 *    entity) on input, before parsing, by translating all of the following to a single
733 *    #xA character:
734 *
735 *  1. the two-character sequence #xD #xA
736 *  2. the two-character sequence #xD #x85
737 *  3. the single character #x85
738 *  4. the single character #x2028
739 *  5. any #xD character that is not immediately followed by #xA or #x85.
740 *
741 *
742 ***/
743
744inline void XMLReader::normalizeEOL(XMLCh* const bufToFill, unsigned char* const charSizes, XMLSize_t & charsDone, XMLSize_t & bytesEaten)
745{
746    ubitblock deletedAnyCharacters = {0};
747    // do end-of-line normalization on the transcoded input data
748    XMLChIterator<chCR> crItr(bufToFill, charsDone);
749
750    size_t pos = 0;
751
752    if (likely(!fNEL))
753    {
754        // do XML 1.0 end of line normalization
755        while (unlikely(crItr.next()))
756        {
757            pos = crItr.pos();
758            bufToFill[pos++] = chLF;
759            // did this buffer end with a CR?
760            if (unlikely(pos == kCharBufSize))
761            {
762                // push back the CR just to be sure that we arent dealing with a CRLF
763                bytesEaten -= charSizes[--charsDone];
764                DEBUG_MESSAGE(" -- pushback(CR)")
765                break;
766            }
767            if (unlikely(bufToFill[pos] == chLF))
768            {
769                // if we found a {CR,LF}, then mark the LF for deletion
770                #ifdef __ARCH_64
771                deletedAnyCharacters._64[pos >> CONST_LOG_2(64)] |= (static_cast<uint64_t>(1) << (pos & 63));
772                #else
773                deletedAnyCharacters._32[pos >> CONST_LOG_2(32)] |= (static_cast<uint32_t>(1) << (pos & 31));
774                #endif
775            }
776        }
777    }
778    else // if (fScanner.getXMLVersion() == XMLReader::XMLV1_1) // can only be this; would be a decl error otherwise
779    {
780        // do XML 1.1 specific end of line normalization
781        while (unlikely(crItr.next()))
782        {
783            pos = crItr.pos();
784            bufToFill[pos++] = chLF;
785            // did this buffer end with a CR?
786            if (unlikely(pos == kCharBufSize))
787            {
788                // push back the CR just to be sure that we arent dealing with a CRLF
789                bytesEaten -= charSizes[--charsDone];
790                DEBUG_MESSAGE(" -- pushback(CR)")
791                break;
792            }
793            if (unlikely((bufToFill[pos] == chLF) || (bufToFill[pos] == chNEL)))
794            {
795                // if we found a {CR,LF} or a {CR,NEL}, then mark the LF/NEL for deletion
796                #ifdef __ARCH_64
797                deletedAnyCharacters._64[pos >> CONST_LOG_2(64)] |= (static_cast<uint64_t>(1) << (pos & 63));
798                #else
799                deletedAnyCharacters._32[pos >> CONST_LOG_2(32)] |= (static_cast<uint32_t>(1) << (pos & 31));
800                #endif
801            }
802        }
803
804        XMLChIterator2<chNEL, chLineSeparator> nonLfItr(bufToFill, charsDone);
805        while (unlikely(nonLfItr.next()))
806        {
807            bufToFill[nonLfItr.pos()] = chLF;
808        }
809    }
810
811    // remove all deleted characters from the semi-final production
812    if (unlikely(bitblock::any(deletedAnyCharacters._128)))
813    {
814        XMLStreamIterator itr(deletedAnyCharacters);
815        // get the position of the first deleted character
816        itr.next();
817        size_t outputPos = itr.pos();
818        size_t lastPos;
819        // now scan for any other deleted characters
820        for (; lastPos = itr.pos() + 1, itr.next(); )
821        {
822            const size_t len = itr.pos() - lastPos;
823            charSizes[outputPos - 1] += charSizes[lastPos - 1];
824            Array<unsigned char>::move(&charSizes[lastPos], &charSizes[outputPos], len);
825
826            Array<XMLCh>::move(&bufToFill[lastPos], &bufToFill[outputPos], len);
827
828            outputPos += len;
829        }
830        // no more deleted characters; just append the remainder of the string
831        const size_t len = charsDone - lastPos;
832        charSizes[outputPos - 1] += charSizes[lastPos - 1];
833        Array<unsigned char>::move(&charSizes[lastPos], &charSizes[outputPos], len);
834        Array<XMLCh>::move(&bufToFill[lastPos], &bufToFill[outputPos], len);               
835        charsDone = outputPos + len;
836    }
837}
838
839inline void XMLReader::handleEOL(XMLCh & curCh, bool inDecl)
840{
841    // Handle line/col tracking for any character-at-a-time parsing modes.
842    if (unlikely(inDecl))
843    {
844        if (curCh == chNEL || curCh == chLineSeparator)
845        {
846            /***
847             * XML1.1
848             *
849             * 2.11 End-of-Line Handling
850             *  ...
851             *   The characters #x85 and #x2028 cannot be reliably recognized and translated
852             *   until an entity's encoding declaration (if present) has been read.
853             *   Therefore, it is a fatal error to use them within the XML declaration or
854             *   text declaration.
855             *
856             ***/
857            ThrowXMLwithMemMgr1
858            (
859                TranscodingException
860                , XMLExcepts::Reader_NelLsepinDecl
861                , fSystemId
862                , fMemoryManager
863            );
864        }
865    }
866
867    if (unlikely(curCh == chLF))
868    {
869        fCurCol = 1;
870        fCurLine++;
871    }
872    else
873    {
874        fCurCol++;
875    }
876}
877
878inline XMLSize_t XMLReader::calculateRawBufIndex()
879{
880    // See if we have any existing chars.
881    XMLSize_t spareChars = fCharsAvail - fCharIndex;
882    XMLSize_t rawBufIndex = fRawBufIndex;
883    //
884    //  If there are spare chars, then adjust the raw buffer index
885    //
886    if (spareChars)
887    {
888        XMLSize_t offset = fCharSizeBuf[fCharIndex];
889        for (XMLSize_t index = fCharIndex + 1; --spareChars; index++)
890        {
891            offset += fCharSizeBuf[index];
892        }
893        rawBufIndex -= offset;
894        DEBUG_MESSAGE(" --- fRawBufIndex'=" << fRawBufIndex)
895    }
896    return rawBufIndex;
897}
898
899// ---------------------------------------------------------------------------------------
900
901XERCES_CPP_NAMESPACE_END
902
903#include <icxmlc/XMLParser.hpp>
904
905XERCES_CPP_NAMESPACE_BEGIN
906
907// ---------------------------------------------------------------------------------------
908
909template<class XMLScannerType>
910bool
911XMLReader::scanDocument( XMLScannerType * const scanner )
912{
913    DEBUG_MESSAGE("XMLReader::scanDocument()")
914
915    // construct the actual XMLParser object to parse the ELEMENT and MISC nodes
916    XMLParserImpl<XMLScannerType> parser(scanner, fMemoryManager);
917
918    //  Scan the PROLOG part, which is everything before the root element
919        //  including the DTD subsets.
920        scanner->scanProlog();
921
922    // now construct the character set adapter
923    if (unlikely(!constructCharacterSetAdapter()))
924    {
925        scanner->emitError(XMLErrs::BadXMLEncoding, fEncodingStr);
926        return 0;
927    }
928
929    refreshRawBuffer(calculateRawBufIndex(), 0);
930
931    parser.init(fCharacterSetAdapter, fTranscoder, fCurLine, fCurCol, fXMLVersion);
932
933        fParser = (XMLParser*)&parser;
934
935        CALLGRIND_START_INSTRUMENTATION;
936
937    DEBUG_TRANSITION_MESSAGE("######################################################################");
938    DEBUG_TRANSITION_MESSAGE("SCANNING ELEMENT(S)")
939    DEBUG_TRANSITION_MESSAGE("######################################################################");
940
941    for (;;)
942    {
943        // transform the new raw data into symbol and content streams
944        parser.preScanDocumentPage(fRawByteBuf, fRawBytesAvail, fEOF, this);
945
946        parser.buildElementPage();
947
948        if (unlikely(!parser.scanElementPage())) break;
949
950        parser.prepareForNextDocumentPage();
951    }
952
953    DEBUG_TRANSITION_MESSAGE("######################################################################");
954    DEBUG_TRANSITION_MESSAGE("SCANNING MISCELLANEOUS")
955    DEBUG_TRANSITION_MESSAGE("######################################################################");
956
957    while (unlikely(parser.scanMiscellaneousPage()))
958    {
959        parser.prepareForNextDocumentPage();
960
961        // transform the new raw data into symbol and content streams
962        parser.preScanDocumentPage(fRawByteBuf, fRawBytesAvail, fEOF, this);
963    }
964
965
966        CALLGRIND_STOP_INSTRUMENTATION;
967        CALLGRIND_DUMP_STATS;
968
969        fNoMore = true;
970
971        fParser = 0;
972
973        return 1;
974}
975
976// -------------------------------------------------------------------------------------
977
978template <class XMLScannerType>
979IDISA_ALWAYS_INLINE
980void XMLReader::scanFirst( XMLScannerType * const scanner )
981{
982    DEBUG_MESSAGE("XMLReader::scanFirst()")
983
984    // construct the actual XMLParser object to parse the ELEMENT and MISC nodes
985    XMLParserImpl<XMLScannerType> * parser = new XMLParserImpl<XMLScannerType>(scanner, fMemoryManager);
986
987    //  Scan the PROLOG part, which is everything before the root element
988        //  including the DTD subsets.
989        scanner->scanProlog();
990
991        // now construct the character set adapter
992        if (unlikely(!constructCharacterSetAdapter()))
993        {
994                scanner->emitError(XMLErrs::BadXMLEncoding, fEncodingStr);
995        return;
996        }
997
998    refreshRawBuffer(calculateRawBufIndex(), 0);
999
1000    parser->init(fCharacterSetAdapter, fTranscoder, fCurLine, fCurCol, fXMLVersion);
1001
1002    fParser = reinterpret_cast<XMLParser*>(parser);
1003
1004        // transform the new raw data into symbol and content streams
1005    parser->preScanDocumentPage(fRawByteBuf, fRawBytesAvail, fEOF, this);
1006
1007    parser->buildElementPage();
1008}
1009
1010// -------------------------------------------------------------------------------------
1011
1012template <class XMLScannerType>
1013IDISA_ALWAYS_INLINE
1014bool XMLReader::scanNext()
1015{
1016    XMLParserImpl<XMLScannerType> * const parser = reinterpret_cast<XMLParserImpl<XMLScannerType>*>(fParser);
1017
1018    if (likely(parser->scanNext()))
1019    {
1020        return true;
1021    }
1022
1023    if (likely(parser->inElement()))
1024    {
1025        DEBUG_MESSAGE(" **** CONTINUE ELEMENT SPAN **** ")
1026
1027        parser->prepareForNextDocumentPage();
1028
1029        parser->preScanDocumentPage(fRawByteBuf, fRawBytesAvail, fEOF, this);
1030
1031        parser->buildElementPage();
1032
1033        if (likely(parser->scanNext()))
1034        {
1035            return true;
1036        }
1037    }
1038
1039    DEBUG_MESSAGE(" **** BEGIN MISCELLANEOUS SPAN **** ")
1040
1041    while (unlikely(parser->scanMiscellaneousPage()))
1042    {
1043        // if there is no new data; we're done (and the file is likely in error)
1044        if (likely(fEOF)) break;
1045
1046        parser->prepareForNextDocumentPage();
1047
1048        // transform the new raw data into symbol and content streams
1049        parser->preScanDocumentPage(fRawByteBuf, fRawBytesAvail, fEOF, this);
1050    }
1051
1052    delete fParser;
1053    fParser = 0;
1054    fNoMore = true;
1055    return 0;
1056}
1057
1058inline
1059void XMLReader::getCurrentLineColumn(XMLFileLoc & line, XMLFileLoc & col) const
1060{
1061        if (likely(fParser != NULL))
1062        {
1063                fParser->getCurrentLineCol(line, col);
1064        }
1065        else // we're in Xerces prolog mode!
1066        {
1067                line = fCurLine;
1068                col = fCurCol;
1069        }
1070}
1071
1072void XMLReader::fill(XMLBuffer & toFill)
1073{
1074    DEBUG_MESSAGE("XMLReader::fill(toFill) fCharIndex=" << fCharIndex << ", fCharsAvail=" << fCharsAvail)
1075
1076    do
1077    {
1078        toFill.append(&fCharBuf[fCharIndex], fCharsAvail - fCharIndex);
1079        fCharIndex = fCharsAvail;
1080    }
1081    while (refreshCharBuffer());
1082}
1083
1084// ---------------------------------------------------------------------------
1085// DEPRECATED FUNCTIONS
1086// ---------------------------------------------------------------------------
1087inline void XMLReader::movePlainContentChars(XMLBuffer &)
1088{
1089        DEPRECATED_FEATURE_IN_ICXML;
1090}
1091
1092inline bool XMLReader::getNextCharIfNot(const XMLCh, XMLCh &)
1093{
1094        DEPRECATED_FEATURE_IN_ICXML;
1095}
1096
1097inline bool XMLReader::isPlainContentChar(const XMLCh toCheck) const
1098{
1099    DEPRECATED_FEATURE_IN_ICXML;
1100}
1101
1102inline bool XMLReader::isSpecialStartTagChar(const XMLCh toCheck) const
1103{
1104    DEPRECATED_FEATURE_IN_ICXML;
1105}
1106
1107#undef CHAR_SIZE
1108#undef SYMBOL_POSITION
1109#undef SYMBOL_ARRAY_SIZE
1110#undef CONSTRUCT_CHARACTER_SET_ADAPTER
1111
1112XERCES_CPP_NAMESPACE_END
1113
1114#endif
Note: See TracBrowser for help on using the repository browser.