source: icXML/icXML-0.95/src/icxmlc/XMLParserImpl.c @ 3602

Last change on this file since 3602 was 3602, checked in by cameron, 5 years ago

Namespace bug fix for icXML-0.95

File size: 17.8 KB
Line 
1/*
2 *  Copyright © 2012 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icXML is a trademark of International Characters.
5 */
6
7/*
8 * @author Nigel Medforth, nigelm -at- international-characters.com
9 * @version $Id: XMLParserImpl.c 377 2013-09-23 23:13:04Z nigelm $
10 *
11 */
12
13#include <icxmlc/XMLParserImpl.hpp>
14#include <icxercesc/util/TransService.hpp>
15#include <icxmlc/XMLUTF16CharacterSetAdapter.hpp>
16#include <icxmlc/parsers/XMLSchemaLoader.hpp>
17#include <icxmlc/parsers/XMLNamespaceParser.hpp>
18#include <icxmlc/parsers/XMLWellFormednessParser.hpp>
19#include <icxmlc/parsers/XMLGrammarValidator.hpp>
20#include <icxmlc/parsers/XMLIdentityConstraintValidator.hpp>
21
22XERCES_CPP_NAMESPACE_BEGIN
23
24// ---------------------------------------------------------------------------------------------------------
25
26template<class XMLScannerType>
27void
28XMLParserImpl<XMLScannerType>::
29init
30(
31        XMLCharacterSetAdapter *        adapter
32        , XMLTranscoder *                       transcoder
33        , const XMLFileLoc                      line
34        , const XMLFileLoc                      column
35    , const XMLVersion          /* version */
36)
37{
38        // reset the curr and next line col trackers to the actual position
39        fCharacterSetAdapter = adapter;
40        fSymbolTable.setTranscoder(transcoder);
41    fReferenceTable.setTranscoder(transcoder);
42    adapter->init(&fScanner, fSymbolTable, fReferenceTable, line, column);
43}
44
45// ---------------------------------------------------------------------------------------------------------
46
47template<class XMLScannerType>
48void
49XMLParserImpl<XMLScannerType>::
50preScanDocumentPage
51(
52    XMLByte                                     *       input
53    , const XMLSize_t                   avail
54        , const bool                            noMore
55        , XMLReader                             *       reader
56)
57{
58        /** ---------------------------------- BEGIN PARALLEL XML PARSING ---------------------------------- **/
59
60    DEBUG_MESSAGE("######################################################################");
61    DEBUG_MESSAGE("BEGIN XML PARSING SCAN: avail=" << avail << " noMore=" << noMore);
62    DEBUG_MESSAGE("######################################################################");
63
64    fNoMore = noMore;
65
66    size_t bytesEaten;
67
68    fCursorEndPtr = &fContentStream[fContentIdx];
69
70    fCharacterSetAdapter->parse
71    (
72        input
73        , avail
74        , noMore
75        , *this
76        , bytesEaten
77    );
78
79        fSymbolIdx = 0;
80    fReferenceIdx = 0;
81    fStringEndPtr = &fStringEndStream[0];
82    fContentIdx = 0;
83
84    if (unlikely(fReferenceCount > 0 && !fEntityContentBuffer))
85    {
86        fEntityContentBuffer = new (fScanner.getMemoryManager()) XMLBuffer(1023, fScanner.getMemoryManager());
87    }
88
89        /** ----------------------------------- BUFFER SYMBOL STREAM ----------------------------------- **/
90
91    // this could be threaded; would need a lock above the parallel xml parsing section.
92
93        if (likely(!noMore))
94        {
95        // what's the earliest symbol/ref position that we need to copy back? all data after this
96        // point must be copied back and will not be modified or inspected when
97                // parsing the next document page.
98        reader->refreshRawBuffer(bytesEaten, 0);
99        }
100
101}
102
103// -------------------------------------------------------------------------------------------
104
105template<class XMLScannerType>
106void
107XMLParserImpl<XMLScannerType>::
108scanInternalDocumentPage
109(
110    const XMLCh *                       input
111    , const XMLSize_t                                   length
112    , XMLReplacementText &              toFill
113)
114{
115    enum { UTF16_SEGMENT_SIZE = ((SEGMENT_SIZE) / sizeof(XMLCh)) };
116
117    size_t offset = 0;
118
119    fSymbolCount = 0;
120    fReferenceCount = 0;
121    fStringCount = 0;
122    fMarkupCount = 0;
123    fCursorEndPtr = &fContentStream[0];
124
125    XMLUTF16CharacterSetAdapter * adapter =
126        reinterpret_cast<XMLUTF16CharacterSetAdapter*>(fCharacterSetAdapter);
127
128
129    // TODO: potential line column tracking issue: will this handle extremely long sequences of replacement text?
130
131    for (;;)
132    {
133        size_t avail = length - offset;
134        fNoMore = 1;
135        if (unlikely(avail > UTF16_SEGMENT_SIZE))
136        {
137            avail = UTF16_SEGMENT_SIZE;
138            fNoMore = 0;
139        }
140
141        DEBUG_MESSAGE(" **** SCAN INTERNAL PAGE @ " << offset << " -> avail=" << avail << " noMore=" << fNoMore)
142
143        size_t bytesEaten = 0;
144
145        adapter->parseInternal
146        (
147            &input[offset]
148            , avail
149            , fNoMore
150            , *this
151            , bytesEaten
152        );
153
154        if (fNoMore) break;
155
156        offset += bytesEaten;
157
158        DEBUG_MESSAGE(" **** NEXT INTERNAL PAGE @ " << offset << "! ****")
159
160        /** ----------------------- BUFFER CONTENT STREAM --------------------------- **/
161
162        // stringEndOffset is required to adjust the string end position of the unused strings
163        ptrdiff_t cursorOffset = (fCursorEndPtr - &fContentStream[0]);
164
165        if (fContentStream.capacity() < (cursorOffset + UTF16_SEGMENT_SIZE))
166        {
167            const XMLCh * const contentBuf0 = &fContentStream[0];
168
169            fContentStream.resizeToFit(cursorOffset, fContentStream.capacity() * 2);
170
171            fCursorEndPtr = &fContentStream[cursorOffset];
172
173            // just incase the content buffer was expanded, adjust the string end offset
174            fStringEndStream.adjust(0, fStringCount, contentBuf0 - &fContentStream[0]);
175        }
176
177        /** -------------------- MOVE STRING END POINTERS BACK ------------------------ **/
178
179        if (fStringEndStream.capacity() < (fStringCount + (UTF16_SEGMENT_SIZE / 4)))
180        {
181            fStringEndStream.resizeToFit(fStringCount, fStringEndStream.capacity() * 2);
182        }
183
184        /** ------------------------ BUFFER SYMBOL STREAM ------------------------------ **/
185
186        if (fSymbolStream.capacity() < (fSymbolCount + (UTF16_SEGMENT_SIZE / 2)))
187        {
188            fSymbolStream.resizeToFit(fSymbolCount, fSymbolStream.capacity() * 2);
189        }
190
191        /** ----------------------- BUFFER REFERENCE STREAM ---------------------------- **/
192
193        if (fReferenceStream.capacity() < (fReferenceCount + (UTF16_SEGMENT_SIZE / 3 + 1)))
194        {
195            fReferenceStream.resizeToFit(fReferenceCount, fReferenceStream.capacity() * 2);
196        }     
197    }
198
199    toFill.fMarkupCount = fMarkupCount;
200
201    fContentIdx = 0;
202    fStringEndPtr = &fStringEndStream[0];   
203
204
205
206
207    XMLWellFormednessParser<XMLScannerType> wfScanner(*this, adapter->getSymbolTable(), adapter->getReferenceTable(), fScanner);
208    wfScanner.checkEntityWellformedness();
209
210    toFill.fElementCount = wfScanner.fElementCount;
211    toFill.fAttributeCount = wfScanner.fAttributeCount;
212    toFill.fProcessingInstructionCount = wfScanner.fProcessingInstructionCount;
213    toFill.fCommentCount = wfScanner.fCommentCount;
214    toFill.fCDATACount = wfScanner.fCDATACount;
215    toFill.fSymbolCount = wfScanner.fSymbolCount;
216    toFill.fReferenceCount = fReferenceCount;
217    toFill.fStringEndCount = wfScanner.fStringCount;
218}
219
220// -------------------------------------------------------------------------------------------
221
222/**
223Scan the prolog portion of the document, which includes everything before the root element
224including the DTD subsets. Returns true if it successfully found the end of the prolog.
225(i.e., no more data is required to complete it.)
226**/
227template<class XMLScannerType>
228bool
229XMLParserImpl<XMLScannerType>::
230scanPrologPage()
231{
232    DEBUG_MESSAGE(" ---------------------------- SCANNING PROLOG -------------------------------");
233
234    checkWellformedness<XMLParser::Prolog>();
235
236    return false;
237}
238
239// -------------------------------------------------------------------------------------------
240
241template<class XMLScannerType>
242bool
243XMLParserImpl<XMLScannerType>::
244buildElementPage()
245{
246    checkWellformedness<XMLParser::Element>();
247
248    resolveDocumentPageNamespaces();
249
250    validateGrammar();   
251
252    if (fDocumentDesseminator)
253    {
254        fDocumentDesseminator->reset(this, fScanner.getPSVIModel(), fScanner.getRootElemName());
255        return true;
256    }
257    else
258    {
259        return false;
260    }
261}
262
263// -------------------------------------------------------------------------------------------
264
265template<class XMLScannerType>
266bool
267XMLParserImpl<XMLScannerType>::
268scanElementPage()
269{
270    assert (fDocumentDesseminator);
271
272    START_PAPI_COUNTER(fPapiCounter, PAPI_MEASURE_DD);
273
274    DEBUG_MESSAGE("######################################################################");
275    DEBUG_MESSAGE("PRINT ELEMENT PAGE:");
276    DEBUG_MESSAGE("######################################################################");
277
278    try
279    {
280        if (fDocumentDesseminator)
281        {
282            fDocumentDesseminator->scanPage();
283        }
284    }
285    catch (const std::exception & ex)
286    {
287        std::cerr << "Error scanning element page! " << ex.what() << std::endl;
288        exit(-1);
289    }
290
291    DEBUG_MESSAGE(" -- done scanning element page.")
292
293    STOP_PAPI_COUNTER(fPapiCounter, PAPI_MEASURE_DD);
294
295    return inElement();
296}
297
298// -------------------------------------------------------------------------------------------
299
300/** This function automatically sends the next piece of content / markup to the
301    appropriate scanner event handler. If it cannot complete a particular piece
302    of content or markup, it returns false.
303 **/
304
305template<class XMLScannerType>
306bool XMLParserImpl<XMLScannerType>::scanNext()
307{
308    return (fDocumentDesseminator) ? fDocumentDesseminator->scanNext() : false;
309}
310
311template<class XMLScannerType>
312bool XMLParserImpl<XMLScannerType>::inElement()
313{
314    return fInElement;
315}
316
317// -------------------------------------------------------------------------------------------
318
319template<class XMLScannerType>
320bool XMLParserImpl<XMLScannerType>::scanMiscellaneousPage()
321{
322    checkWellformedness<XMLParser::Miscellaneous>();
323    if (unlikely(fDocumentObjectCount && fDocumentDesseminator))
324    {
325        fDocumentDesseminator->reset(this, 0, 0);
326        fDocumentDesseminator->scanPage();
327    }
328    return !fNoMore;
329}
330
331// ---------------------------------------------------------------------------------------------------------
332
333template<class XMLScannerType>
334void XMLParserImpl<XMLScannerType>::verifyProlog()
335{
336    // verify that the first content "string" in the element is empty and discard it
337    ContentPtrType contentPtr = &fContentStream[fContentIdx];
338    if (likely(contentPtr == *fStringEndPtr))
339    {
340        contentPtr = *fStringEndPtr++ + 1;
341        fContentIdx++;
342        fMarkupCount--;
343        fInMarkup = true;
344    }
345    else
346    {
347        fScanner.emitError(XMLErrs::ExpectedCommentOrPI);
348    }
349
350    // make sure we start the Element with a legal tag.
351    switch (*contentPtr & MarkupMask)
352    {
353        case StartTagWithAttributes:
354        case StartTagWithoutAttributes:
355        case ProcessingInstruction:
356        case Comment:
357            break;
358        /// ------------------------------------------------------------------------ ///
359        case EndTag:
360            fScanner.emitError(XMLErrs::MoreEndThanStartTags);
361            break;
362        /// ------------------------------------------------------------------------ ///
363        case CDATA:
364            fScanner.emitError(XMLErrs::CDATAOutsideOfContent);
365    }
366}
367
368// ---------------------------------------------------------------------------------------------------------
369
370template<class XMLScannerType>
371template<XMLParser::DocumentStateType DocStateType>
372void XMLParserImpl<XMLScannerType>::checkWellformedness()
373{ 
374    START_PAPI_COUNTER(fPapiCounter, PAPI_MEASURE_WF);
375
376    if (DocStateType == XMLParser::Element)
377    {
378        if (unlikely(fScope == 0 && !fInMarkup))
379        {
380            verifyProlog();
381        }
382
383        XMLWellFormednessParser<XMLScannerType> wfScanner(*this, fSymbolTable, fReferenceTable, fScanner);
384
385        wfScanner.checkWellformedness<XMLParser::Element>(NULL);
386
387        fDocumentAccumulator.init
388        (
389            wfScanner.fElementCount
390            , wfScanner.fAttributeCount
391            , fReferenceCount
392            , wfScanner.fCommentCount
393            , wfScanner.fProcessingInstructionCount
394            , wfScanner.fCDATACount
395            , (fScanner.getPSVIHandler() != NULL)
396        );
397
398        // based on the WF check, also pre-expand any streams as needed.
399        const size_t elementCount = (fElementIndex + wfScanner.fElementCount + 1);
400        if (unlikely(fElement.capacity() <= elementCount))
401        {
402            fElement.resizeToFit(fElementIndex + 1, max(elementCount, fElement.capacity() * 2));
403            DEBUG_MESSAGE(" -- resizing fElement to fit " << fElement.capacity())
404        }
405
406        fNamespaceResolver.setMaxScope(fMaxScope + 1);
407        fChildren.resizeToFit(fScope, fNamespaceResolver.getMaxScope());
408        fContentFlag.resizeToFit(fScope, fNamespaceResolver.getMaxScope());
409
410        if (unlikely(fUriStream.capacity() <= fUriCount))
411        {
412            fUriStream.resizeToFit(0, fUriCount);
413            DEBUG_MESSAGE(" -- resizing fUriStream to fit " << fUriCount << " -> " << fUriStream.capacity())
414        }
415
416        const size_t contextCount = (wfScanner.fElementCount * 2);
417        if (unlikely(fNamespaceContextStream.capacity() <= contextCount))
418        {
419            fNamespaceContextStream.resizeToFit(0, contextCount);
420            DEBUG_MESSAGE(" -- resizing fNamespaceContextStream to fit " << contextCount << " -> " << fNamespaceContextStream.capacity())
421        }
422    }
423    else
424    {
425        fDocumentAccumulator.reset();
426
427        XMLWellFormednessParser<XMLScannerType> wfScanner(*this, fSymbolTable, fReferenceTable, fScanner);
428
429        wfScanner.checkWellformedness<DocStateType>(&fDocumentAccumulator);
430    }
431
432    STOP_PAPI_COUNTER(fPapiCounter, PAPI_MEASURE_WF);
433}
434
435// -------------------------------------------------------------------------------------------
436
437template<class XMLScannerType>
438void XMLParserImpl<XMLScannerType>::resolveDocumentPageNamespaces()
439{
440    START_PAPI_COUNTER(fPapiCounter, PAPI_MEASURE_NR);
441
442    XMLSchemaLoader<XMLScannerType> schemaLoader(NULL); // fMemoryManager
443    XMLNamespaceParser<XMLScannerType> parser(*this, fNamespaceResolver, fSymbolTable, fReferenceTable, schemaLoader, fScanner);
444
445    const bool isRoot = (fNamespaceResolver.getScope() == 0);
446
447    parser.resolveNamespaces();
448
449    if (fScanner.getDoSchema())
450    {
451        if (unlikely(schemaLoader.hasSchemas()))
452        {
453            schemaLoader.loadAllSchemas(fScanner);
454        }
455        if (unlikely(isRoot))
456        {
457            fScanner.loadExternalGrammars();
458        }
459    }
460
461    STOP_PAPI_COUNTER(fPapiCounter, PAPI_MEASURE_NR);
462}
463
464// ---------------------------------------------------------------------------------------------------------
465
466template<class XMLScannerType>
467void XMLParserImpl<XMLScannerType>::validateGrammar()
468{
469    START_PAPI_COUNTER(fPapiCounter, PAPI_MEASURE_GV);
470
471    XMLGrammarValidator<XMLScannerType> gv(*this, fNamespaceResolver, fSymbolTable, fReferenceTable, fScanner, fDocumentAccumulator);
472    fInElement = gv.validateGrammar();
473
474    fElementIndex = gv.fElementIndex;
475    fInMarkup = gv.fInMarkup;
476    fElementCount = gv.fElementCount;
477    fContentIdx = gv.fCursorPtr - gv.fContentStream;
478    fSymbolIdx = gv.fSymbolPtr - gv.fSymbolStream;
479    fReferenceIdx = gv.fReferencePtr - gv.fReferenceStream;
480    fStringEndPtr = gv.fStringEndPtr;
481    fMarkupCount = gv.fMarkupCount;
482    //fLine = gv.fLine;
483    //fColumn = gv.fColumn;
484
485    if (gv.fHasIdentityConstraints)
486    {
487        DEBUG_MESSAGE("######################################################################");
488        DEBUG_MESSAGE("IDENTITY CONSTRAINT VALIDATION:");
489        DEBUG_MESSAGE("######################################################################");
490
491        XMLIdentityConstraintValidator icValidator
492        (
493            fDocumentContextStream
494            , fDocumentObjectStream
495            , fNamespaceContextStream
496            , fNamespaceResolver
497            , fDocumentObjectCount
498            , fScanner.getIdentityConstraintHandler()
499            , fScanner.fValidationContext
500            , fScanner.getMemoryManager()
501        );
502
503        fHasIdentityConstraints = (icValidator.validateIdentityConstraints(gv.fMaxAttributeCount) > 0);
504    }
505
506    STOP_PAPI_COUNTER(fPapiCounter, PAPI_MEASURE_GV);
507}
508
509// ---------------------------------------------------------------------------------------------------------
510
511template<class XMLScannerType>
512void XMLParserImpl<XMLScannerType>::prepareForNextDocumentPage()
513{
514    const XMLCh * cursorPtr = &fContentStream[fContentIdx];
515    const ptrdiff_t stringIndex = (fStringEndPtr - fStringEndStream.first());
516
517    DEBUG_MESSAGE(" -- contentIdx=" << fContentIdx << " of " << (fCursorEndPtr - fContentStream.first()))
518    DEBUG_MESSAGE(" -- stringIndex=" << stringIndex << " of " << fStringCount)
519    DEBUG_MESSAGE(" -- fSymbolIdx=" << fSymbolIdx << " of " << fSymbolCount)
520    DEBUG_MESSAGE(" -- fReferenceIdx=" << fReferenceIdx << " of " << fReferenceCount)
521
522        /** ----------------------- BUFFER CONTENT STREAM --------------------------- **/
523
524        // stringEndOffset is required to adjust the string end position of the unused strings
525    ptrdiff_t cursorOffset = (fContentIdx);
526
527    if (likely(cursorPtr < fCursorEndPtr))
528        {
529        const XMLCh * const contentBuf0 = &fContentStream[0];
530                const unsigned int unusedLength =
531            fContentStream.copyToFront(cursorOffset, (fCursorEndPtr - &fContentStream[0]));
532                // just incase the content buffer was expanded, adjust the string end offset
533        cursorOffset += (contentBuf0 - &fContentStream[0]);
534
535        fContentIdx = unusedLength;
536        }
537        else
538        {
539        fContentIdx = 0;
540        }
541
542        /** -------------------- MOVE STRING END POINTERS BACK ------------------------ **/
543
544        if (stringIndex < fStringCount)
545        {
546        fStringEndStream.adjust(stringIndex, fStringCount, cursorOffset);
547        fStringCount = fStringEndStream.copyToFront(stringIndex, fStringCount);
548        }
549        else
550        {
551                fStringCount = 0;
552        }
553
554        /** ------------------------ BUFFER SYMBOL STREAM ------------------------------ **/
555
556    fSymbolCount = fSymbolStream.copyToFront(fSymbolIdx, fSymbolCount);
557
558    /** ----------------------- BUFFER REFERENCE STREAM ----------------------------- **/
559
560    fReferenceCount = fReferenceStream.copyToFront(fReferenceIdx, fReferenceCount);
561}
562
563XERCES_CPP_NAMESPACE_END
Note: See TracBrowser for help on using the repository browser.