source: icXML/icXML-devel/src/icxmlc/XMLParserImpl.c @ 3563

Last change on this file since 3563 was 3563, checked in by cameron, 5 years ago

Update icxmlc files

File size: 17.8 KB
Line 
1/*
2 *  Copyright © 2012 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icXML is a trademark of International Characters.
5 */
6
7/*
8 * @author Nigel Medforth, nigelm -at- international-characters.com
9 * @version $Id: XMLParserImpl.c 377 2013-09-23 23:13:04Z nigelm $
10 *
11 */
12
13#include <icxmlc/XMLParserImpl.hpp>
14#include <icxercesc/util/TransService.hpp>
15#include <icxmlc/XMLUTF16CharacterSetAdapter.hpp>
16#include <icxmlc/parsers/XMLSchemaLoader.hpp>
17#include <icxmlc/parsers/XMLNamespaceParser.hpp>
18#include <icxmlc/parsers/XMLWellFormednessParser.hpp>
19#include <icxmlc/parsers/XMLGrammarValidator.hpp>
20#include <icxmlc/parsers/XMLIdentityConstraintValidator.hpp>
21
22XERCES_CPP_NAMESPACE_BEGIN
23
24// ---------------------------------------------------------------------------------------------------------
25
26template<class XMLScannerType>
27void
28XMLParserImpl<XMLScannerType>::
29init
30(
31        XMLCharacterSetAdapter *        adapter
32        , XMLTranscoder *                       transcoder
33        , const XMLFileLoc                      line
34        , const XMLFileLoc                      column
35    , const XMLVersion          /* version */
36)
37{
38        // reset the curr and next line col trackers to the actual position
39        fCharacterSetAdapter = adapter;
40        fSymbolTable.setTranscoder(transcoder);
41    fReferenceTable.setTranscoder(transcoder);
42    adapter->init(&fScanner, fSymbolTable, fReferenceTable, line, column);
43}
44
45// ---------------------------------------------------------------------------------------------------------
46
47template<class XMLScannerType>
48void
49XMLParserImpl<XMLScannerType>::
50preScanDocumentPage
51(
52    XMLByte                                     *       input
53    , const XMLSize_t                   avail
54        , const bool                            noMore
55        , XMLReader                             *       reader
56)
57{
58        /** ---------------------------------- BEGIN PARALLEL XML PARSING ---------------------------------- **/
59
60    DEBUG_MESSAGE("######################################################################");
61    DEBUG_MESSAGE("BEGIN XML PARSING SCAN: avail=" << avail << " noMore=" << noMore);
62    DEBUG_MESSAGE("######################################################################");
63
64    fNoMore = noMore;
65
66    size_t bytesEaten;
67
68    fCursorEndPtr = &fContentStream[fContentIdx];
69
70    fCharacterSetAdapter->parse
71    (
72        input
73        , avail
74        , noMore
75        , *this
76        , bytesEaten
77    );
78
79        fSymbolIdx = 0;
80    fReferenceIdx = 0;
81    fStringEndPtr = &fStringEndStream[0];
82    fContentIdx = 0;
83
84    if (unlikely(fReferenceCount > 0 && !fEntityContentBuffer))
85    {
86        fEntityContentBuffer = new (fScanner.getMemoryManager()) XMLBuffer(1023, fScanner.getMemoryManager());
87    }
88
89        /** ----------------------------------- BUFFER SYMBOL STREAM ----------------------------------- **/
90
91    // this could be threaded; would need a lock above the parallel xml parsing section.
92
93        if (likely(!noMore))
94        {
95        // what's the earliest symbol/ref position that we need to copy back? all data after this
96        // point must be copied back and will not be modified or inspected when
97                // parsing the next document page.
98        reader->refreshRawBuffer(bytesEaten, 0);
99        }
100
101}
102
103// -------------------------------------------------------------------------------------------
104
105template<class XMLScannerType>
106void
107XMLParserImpl<XMLScannerType>::
108scanInternalDocumentPage
109(
110    const XMLCh *                       input
111    , const XMLSize_t                                   length
112    , XMLReplacementText &              toFill
113)
114{
115    enum { UTF16_SEGMENT_SIZE = ((SEGMENT_SIZE) / sizeof(XMLCh)) };
116
117    size_t offset = 0;
118
119    fSymbolCount = 0;
120    fReferenceCount = 0;
121    fStringCount = 0;
122    fMarkupCount = 0;
123    fCursorEndPtr = &fContentStream[0];
124
125    XMLUTF16CharacterSetAdapter * adapter =
126        reinterpret_cast<XMLUTF16CharacterSetAdapter*>(fCharacterSetAdapter);
127
128
129    // TODO: potential line column tracking issue: will this handle extremely long sequences of replacement text?
130
131    for (;;)
132    {
133        size_t avail = length - offset;
134        fNoMore = 1;
135        if (unlikely(avail > UTF16_SEGMENT_SIZE))
136        {
137            avail = UTF16_SEGMENT_SIZE;
138            fNoMore = 0;
139        }
140
141        DEBUG_MESSAGE(" **** SCAN INTERNAL PAGE @ " << offset << " -> avail=" << avail << " noMore=" << fNoMore)
142
143        size_t bytesEaten = 0;
144
145        adapter->parseInternal
146        (
147            &input[offset]
148            , avail
149            , fNoMore
150            , *this
151            , bytesEaten
152        );
153
154        if (fNoMore) break;
155
156        offset += bytesEaten;
157
158        DEBUG_MESSAGE(" **** NEXT INTERNAL PAGE @ " << offset << "! ****")
159
160        /** ----------------------- BUFFER CONTENT STREAM --------------------------- **/
161
162        // stringEndOffset is required to adjust the string end position of the unused strings
163        ptrdiff_t cursorOffset = (fCursorEndPtr - &fContentStream[0]);
164
165        if (fContentStream.capacity() < (cursorOffset + UTF16_SEGMENT_SIZE))
166        {
167            const XMLCh * const contentBuf0 = &fContentStream[0];
168
169            fContentStream.resizeToFit(cursorOffset, fContentStream.capacity() * 2);
170
171            fCursorEndPtr = &fContentStream[cursorOffset];
172
173            // just incase the content buffer was expanded, adjust the string end offset
174            fStringEndStream.adjust(0, fStringCount, contentBuf0 - &fContentStream[0]);
175        }
176
177        /** -------------------- MOVE STRING END POINTERS BACK ------------------------ **/
178
179        if (fStringEndStream.capacity() < (fStringCount + (UTF16_SEGMENT_SIZE / 4)))
180        {
181            fStringEndStream.resizeToFit(fStringCount, fStringEndStream.capacity() * 2);
182        }
183
184        /** ------------------------ BUFFER SYMBOL STREAM ------------------------------ **/
185
186        if (fSymbolStream.capacity() < (fSymbolCount + (UTF16_SEGMENT_SIZE / 2)))
187        {
188            fSymbolStream.resizeToFit(fSymbolCount, fSymbolStream.capacity() * 2);
189        }
190
191        /** ----------------------- BUFFER REFERENCE STREAM ---------------------------- **/
192
193        if (fReferenceStream.capacity() < (fReferenceCount + (UTF16_SEGMENT_SIZE / 3 + 1)))
194        {
195            fReferenceStream.resizeToFit(fReferenceCount, fReferenceStream.capacity() * 2);
196        }     
197    }
198
199    toFill.fMarkupCount = fMarkupCount;
200
201    fContentIdx = 0;
202    fStringEndPtr = &fStringEndStream[0];   
203
204
205
206
207    XMLWellFormednessParser<XMLScannerType> wfScanner(*this, adapter->getSymbolTable(), adapter->getReferenceTable(), fScanner);
208    wfScanner.checkEntityWellformedness();
209
210    toFill.fElementCount = wfScanner.fElementCount;
211    toFill.fAttributeCount = wfScanner.fAttributeCount;
212    toFill.fProcessingInstructionCount = wfScanner.fProcessingInstructionCount;
213    toFill.fCommentCount = wfScanner.fCommentCount;
214    toFill.fCDATACount = wfScanner.fCDATACount;
215    toFill.fSymbolCount = wfScanner.fSymbolCount;
216    toFill.fReferenceCount = fReferenceCount;
217    toFill.fStringEndCount = wfScanner.fStringCount;
218}
219
220// -------------------------------------------------------------------------------------------
221
222/**
223Scan the prolog portion of the document, which includes everything before the root element
224including the DTD subsets. Returns true if it successfully found the end of the prolog.
225(i.e., no more data is required to complete it.)
226**/
227template<class XMLScannerType>
228bool
229XMLParserImpl<XMLScannerType>::
230scanPrologPage()
231{
232    DEBUG_MESSAGE(" ---------------------------- SCANNING PROLOG -------------------------------");
233
234    checkWellformedness<XMLParser::Prolog>();
235
236    return false;
237}
238
239// -------------------------------------------------------------------------------------------
240
241template<class XMLScannerType>
242bool
243XMLParserImpl<XMLScannerType>::
244buildElementPage()
245{
246    checkWellformedness<XMLParser::Element>();
247
248    resolveDocumentPageNamespaces();
249
250    validateGrammar();   
251
252    if (fDocumentDesseminator)
253    {
254        fDocumentDesseminator->reset(this, fScanner.getPSVIModel(), fScanner.getRootElemName());
255        return true;
256    }
257    else
258    {
259        return false;
260    }
261}
262
263// -------------------------------------------------------------------------------------------
264
265template<class XMLScannerType>
266bool
267XMLParserImpl<XMLScannerType>::
268scanElementPage()
269{     
270    START_PAPI_COUNTER(fPapiCounter, PAPI_MEASURE_DD);
271
272    DEBUG_MESSAGE("######################################################################");
273    DEBUG_MESSAGE("PRINT ELEMENT PAGE:");
274    DEBUG_MESSAGE("######################################################################");
275    if (fDocumentDesseminator)
276    {
277        fDocumentDesseminator->scanPage();
278    }
279
280    STOP_PAPI_COUNTER(fPapiCounter, PAPI_MEASURE_DD);
281
282    return inElement();
283}
284
285// -------------------------------------------------------------------------------------------
286
287/** This function automatically sends the next piece of content / markup to the
288    appropriate scanner event handler. If it cannot complete a particular piece
289    of content or markup, it returns false.
290 **/
291
292template<class XMLScannerType>
293bool XMLParserImpl<XMLScannerType>::scanNext()
294{
295    return (fDocumentDesseminator) ? fDocumentDesseminator->scanNext() : false;
296}
297
298template<class XMLScannerType>
299bool XMLParserImpl<XMLScannerType>::inElement()
300{
301    return fInElement;
302}
303
304// -------------------------------------------------------------------------------------------
305
306template<class XMLScannerType>
307bool XMLParserImpl<XMLScannerType>::scanMiscellaneousPage()
308{
309    checkWellformedness<XMLParser::Miscellaneous>();
310    if (unlikely(fDocumentObjectCount && fDocumentDesseminator))
311    {
312        fDocumentDesseminator->reset(this, 0, 0);
313        fDocumentDesseminator->scanPage();
314    }
315    return !fNoMore;
316}
317
318// ---------------------------------------------------------------------------------------------------------
319
320template<class XMLScannerType>
321void XMLParserImpl<XMLScannerType>::verifyProlog()
322{
323    // verify that the first content "string" in the element is empty and discard it
324    ContentPtrType contentPtr = &fContentStream[fContentIdx];
325    if (likely(contentPtr == *fStringEndPtr))
326    {
327        contentPtr = *fStringEndPtr++ + 1;
328        fContentIdx++;
329        fMarkupCount--;
330        fInMarkup = true;
331    }
332    else
333    {
334        fScanner.emitError(XMLErrs::ExpectedCommentOrPI);
335    }
336
337    // make sure we start the Element with a legal tag.
338    switch (*contentPtr & MarkupMask)
339    {
340        case StartTagWithAttributes:
341        case StartTagWithoutAttributes:
342        case ProcessingInstruction:
343        case Comment:
344            break;
345        /// ------------------------------------------------------------------------ ///
346        case EndTag:
347            fScanner.emitError(XMLErrs::MoreEndThanStartTags);
348            break;
349        /// ------------------------------------------------------------------------ ///
350        case CDATA:
351            fScanner.emitError(XMLErrs::CDATAOutsideOfContent);
352    }
353}
354
355// ---------------------------------------------------------------------------------------------------------
356
357template<class XMLScannerType>
358template<XMLParser::DocumentStateType DocStateType>
359void XMLParserImpl<XMLScannerType>::checkWellformedness()
360{ 
361    START_PAPI_COUNTER(fPapiCounter, PAPI_MEASURE_WF);
362
363    if (DocStateType == XMLParser::Element)
364    {
365        if (unlikely(fScope == 0 && !fInMarkup))
366        {
367            verifyProlog();
368        }
369
370        XMLWellFormednessParser<XMLScannerType> wfScanner(*this, fSymbolTable, fReferenceTable, fScanner);
371
372        wfScanner.checkWellformedness<XMLParser::Element>(NULL);
373
374        fDocumentAccumulator.init
375        (
376            wfScanner.fElementCount
377            , wfScanner.fAttributeCount
378            , fReferenceCount
379            , wfScanner.fCommentCount
380            , wfScanner.fProcessingInstructionCount
381            , wfScanner.fCDATACount
382            , (fScanner.getPSVIHandler() != NULL)
383        );
384
385        // based on the WF check, also pre-expand any streams as needed.
386        const size_t elementCount = (fElementIndex + wfScanner.fElementCount + 1);
387        if (unlikely(fElement.capacity() <= elementCount))
388        {
389            fElement.resizeToFit(fElementIndex + 1, max(elementCount, fElement.capacity() * 2));
390            DEBUG_MESSAGE(" -- resizing fElement to fit " << fElement.capacity())
391        }
392
393        if (unlikely(fNamespaceResolver.getMaxScope() <= fMaxScope))
394        {
395            fNamespaceResolver.setMaxScope(fMaxScope);
396            fChildren.resizeToFit(fScope, fNamespaceResolver.getMaxScope());
397            fContentFlag.resizeToFit(fScope, fMaxScope);
398            DEBUG_MESSAGE(" -- resizing scopes to fit " << fMaxScope << " -> " << fNamespaceResolver.getMaxScope() << ',' << fChildren.capacity() << ',' << fGidStack.capacity() << ',' << fContentFlag.capacity())
399        }
400
401        if (unlikely(fUriStream.capacity() <= fUriCount))
402        {
403            fUriStream.resizeToFit(0, fUriCount);
404            DEBUG_MESSAGE(" -- resizing fUriStream to fit " << fUriCount << " -> " << fUriStream.capacity())
405        }
406
407        const size_t contextCount = (wfScanner.fElementCount * 2);
408        if (unlikely(fNamespaceContextStream.capacity() <= contextCount))
409        {
410            fNamespaceContextStream.resizeToFit(0, contextCount);
411            DEBUG_MESSAGE(" -- resizing fNamespaceContextStream to fit " << contextCount << " -> " << fNamespaceContextStream.capacity())
412        }
413    }
414    else
415    {
416        fDocumentAccumulator.reset();
417
418        XMLWellFormednessParser<XMLScannerType> wfScanner(*this, fSymbolTable, fReferenceTable, fScanner);
419
420        wfScanner.checkWellformedness<DocStateType>(&fDocumentAccumulator);
421    }
422
423    STOP_PAPI_COUNTER(fPapiCounter, PAPI_MEASURE_WF);
424}
425
426// -------------------------------------------------------------------------------------------
427
428template<class XMLScannerType>
429void XMLParserImpl<XMLScannerType>::resolveDocumentPageNamespaces()
430{
431    START_PAPI_COUNTER(fPapiCounter, PAPI_MEASURE_NR);
432
433    XMLSchemaLoader<XMLScannerType> schemaLoader(NULL); // fMemoryManager
434    XMLNamespaceParser<XMLScannerType> parser(*this, fNamespaceResolver, fSymbolTable, fReferenceTable, schemaLoader, fScanner);
435
436    const bool isRoot = (fNamespaceResolver.getScope() == 0);
437
438    parser.resolveNamespaces();
439
440    if (fScanner.getDoSchema())
441    {
442        if (unlikely(schemaLoader.hasSchemas()))
443        {
444            schemaLoader.loadAllSchemas(fScanner);
445        }
446        if (unlikely(isRoot))
447        {
448            fScanner.loadExternalGrammars();
449        }
450    }
451
452    STOP_PAPI_COUNTER(fPapiCounter, PAPI_MEASURE_NR);
453}
454
455// ---------------------------------------------------------------------------------------------------------
456
457template<class XMLScannerType>
458void XMLParserImpl<XMLScannerType>::validateGrammar()
459{
460    START_PAPI_COUNTER(fPapiCounter, PAPI_MEASURE_GV);
461
462    XMLGrammarValidator<XMLScannerType> gv(*this, fNamespaceResolver, fSymbolTable, fReferenceTable, fScanner, fDocumentAccumulator);
463    fInElement = gv.validateGrammar();
464
465    fElementIndex = gv.fElementIndex;
466    fInMarkup = gv.fInMarkup;
467    fElementCount = gv.fElementCount;
468    fContentIdx = gv.fCursorPtr - gv.fContentStream;
469    fSymbolIdx = gv.fSymbolPtr - gv.fSymbolStream;
470    fReferenceIdx = gv.fReferencePtr - gv.fReferenceStream;
471    fStringEndPtr = gv.fStringEndPtr;
472    fMarkupCount = gv.fMarkupCount;
473    //fLine = gv.fLine;
474    //fColumn = gv.fColumn;
475
476    if (gv.fHasIdentityConstraints)
477    {
478        DEBUG_MESSAGE("######################################################################");
479        DEBUG_MESSAGE("IDENTITY CONSTRAINT VALIDATION:");
480        DEBUG_MESSAGE("######################################################################");
481
482        XMLIdentityConstraintValidator icValidator
483        (
484            fDocumentContextStream
485            , fDocumentObjectStream
486            , fNamespaceContextStream
487            , fNamespaceResolver
488            , fDocumentObjectCount
489            , fScanner.getIdentityConstraintHandler()
490            , fScanner.fValidationContext
491            , fScanner.getMemoryManager()
492        );
493
494        fHasIdentityConstraints = (icValidator.validateIdentityConstraints(gv.fMaxAttributeCount) > 0);
495    }
496
497    STOP_PAPI_COUNTER(fPapiCounter, PAPI_MEASURE_GV);
498}
499
500// ---------------------------------------------------------------------------------------------------------
501
502template<class XMLScannerType>
503void XMLParserImpl<XMLScannerType>::prepareForNextDocumentPage()
504{
505    const XMLCh * cursorPtr = &fContentStream[fContentIdx];
506    const ptrdiff_t stringIndex = (fStringEndPtr - fStringEndStream.first());
507
508    DEBUG_MESSAGE(" -- contentIdx=" << fContentIdx << " of " << (fCursorEndPtr - fContentStream.first()))
509    DEBUG_MESSAGE(" -- stringIndex=" << stringIndex << " of " << fStringCount)
510    DEBUG_MESSAGE(" -- fSymbolIdx=" << fSymbolIdx << " of " << fSymbolCount)
511    DEBUG_MESSAGE(" -- fReferenceIdx=" << fReferenceIdx << " of " << fReferenceCount)
512
513        /** ----------------------- BUFFER CONTENT STREAM --------------------------- **/
514
515        // stringEndOffset is required to adjust the string end position of the unused strings
516    ptrdiff_t cursorOffset = (fContentIdx);
517
518    if (likely(cursorPtr < fCursorEndPtr))
519        {
520        const XMLCh * const contentBuf0 = &fContentStream[0];
521                const unsigned int unusedLength =
522            fContentStream.copyToFront(cursorOffset, (fCursorEndPtr - &fContentStream[0]));
523                // just incase the content buffer was expanded, adjust the string end offset
524        cursorOffset += (contentBuf0 - &fContentStream[0]);
525
526        fContentIdx = unusedLength;
527        }
528        else
529        {
530        fContentIdx = 0;
531        }
532
533        /** -------------------- MOVE STRING END POINTERS BACK ------------------------ **/
534
535        if (stringIndex < fStringCount)
536        {
537        fStringEndStream.adjust(stringIndex, fStringCount, cursorOffset);
538        fStringCount = fStringEndStream.copyToFront(stringIndex, fStringCount);
539        }
540        else
541        {
542                fStringCount = 0;
543        }
544
545        /** ------------------------ BUFFER SYMBOL STREAM ------------------------------ **/
546
547    fSymbolCount = fSymbolStream.copyToFront(fSymbolIdx, fSymbolCount);
548
549    /** ----------------------- BUFFER REFERENCE STREAM ----------------------------- **/
550
551    fReferenceCount = fReferenceStream.copyToFront(fReferenceIdx, fReferenceCount);
552}
553
554XERCES_CPP_NAMESPACE_END
Note: See TracBrowser for help on using the repository browser.