source: trunk/markup_stats.cxx @ 120

Last change on this file since 120 was 120, checked in by cameron, 11 years ago

Error reporting for character validation errors.

File size: 10.6 KB
Line 
1/*  markup_stats.c - parabix demo program
2    Copyright (c) 2007, Robert D. Cameron.
3    Licensed to the public under the Open Software License 3.0.
4    Licensed to International Characters, Inc., under the Academic
5    Free License 3.0.
6
7*/
8
9#include <stdio.h>
10#include <stdlib.h>
11#include <string.h>
12#include <errno.h>
13#include <sys/types.h>
14#include <sys/stat.h>
15#include <string>
16#include <iostream>
17using namespace std;
18
19#ifdef PAPI
20#include "../code_clocker/clocker/cc.h"
21#include "../code_clocker/clocker/cc.cxx"
22
23CC * c;
24#endif
25
26//#include "src/ilax.h"
27#include "src/engine.h"
28
29#ifndef REPEAT_RUNS
30#define REPEAT_RUNS 1
31#endif
32
33/* Internals */
34#include "src/xmlmodel.h"
35#include "src/xml_error.h"
36#include "src/bitplex.h"
37#include "src/byteplex.h"
38#include "src/xmldecl.h"
39#include "src/bitlex.h"
40
41
42#include "src/xmlmodel.c"
43#include "src/xml_error.c"
44#include "src/bitplex.c"
45#include "src/byteplex.c"
46#include "src/xmldecl.c"
47#include "src/bitlex.c"
48#include "src/engine.c"
49
50/* Global declarations of parsing engine. */
51Parser_Interface * parser;
52
/* Global statistics counters.

   Each "count" is incremented once per recognized item and each
   "length" accumulates the item lengths; both are updated by the
   ParsingEngine action routines defined below and reported by main().
   Nothing in this file resets them between runs. */

/* Comments: "<!--" ... "-->". */
int comment_count = 0;
int comment_length = 0;

/* CDATA sections.  CDATA_start_pos holds the position recorded by the most
   recent CDATA_start_action; CDATA_end_action uses it to compute a length. */
int CDATA_start_count = 0;
int CDATA_end_count = 0;
int CDATA_start_pos = 0;
int CDATA_length = 0;

/* Processing instructions: "<?" ... "?>". */
int PI_count = 0;
int PI_length = 0;

/* Element tags. */
int empty_elem_count = 0;
int empty_elem_length = 0;
int start_tag_count = 0;
int start_tag_length = 0;
int end_tag_count = 0;
int end_tag_length = 0;

/* Element nesting: current depth and the largest depth seen so far. */
int nesting_depth = 0;
int max_nesting_depth = 0;

/* Attributes.  NOTE(review): attribute_count is not updated by any routine
   in this file; AttributeValue_action maintains total_attribute_count. */
int attribute_count = 0;
int total_attribute_count = 0;
int total_att_name_length = 0;
int total_att_value_length = 0;

/* Namespaces. */
int namespace_count = 0;
int total_namespace_name_length = 0;
int total_namespace_URI_length = 0;

/* References, text items and error items. */
int reference_count = 0;
int reference_length = 0;
int text_item_count = 0;
int text_item_length = 0;
int error_item_count = 0;
int error_item_length = 0;

/* NOTE(review): the following three are not referenced anywhere else in
   this file; confirm whether they are still needed. */
int last_item_start = 0;
int last_item_stop = 0;
int last_buffer_rel_pos = 0;
88
89
90/* Action routine for an XML comment in "<!--"  "-->" brackets. */
91template <CodeUnit_Base C>
92inline void ParsingEngine<C>::Comment_action(unsigned char * item, int lgth) {
93        comment_count +=1;
94        comment_length += lgth;
95}
96
97/* Action routine called upon recognizing "<![CDATA[" to start a CDATA section. */
98template <CodeUnit_Base C>
99inline void ParsingEngine<C>::CDATA_start_action(unsigned char * CDATA_ptr){
100        CDATA_start_pos = AbsPos() - 9;
101        CDATA_start_count +=1;
102}
103
104/* Action routine called upon recognizing "]]>" to end a CDATA section. */
105template <CodeUnit_Base C>
106inline void ParsingEngine<C>::CDATA_end_action(unsigned char * CDATA_end_ptr) {
107        CDATA_end_count +=1;
108        CDATA_length += AbsPos() - CDATA_start_pos;
109}
110
111/* Action routine for an XML processing instruction enclosed in "<?" and "?>" brackets. */
112template <CodeUnit_Base C>
113inline void ParsingEngine<C>::PI_action(unsigned char * item, int lgth) {
114        PI_count +=1;
115        PI_length += lgth;
116}
117
118/* Action routine for an empty element enclosed in "<" and "/>" brackets. */
119template <CodeUnit_Base C>
120inline void ParsingEngine<C>::EmptyElement_action(unsigned char * item, int lgth) {
121        empty_elem_count +=1;
122        empty_elem_length += lgth;
123}
124
125/* Action routine for a start tag enclosed in "<" and ">" brackets. */
126template <CodeUnit_Base C>
127inline void ParsingEngine<C>::StartTag_action(unsigned char * item, int lgth) {
128        start_tag_count +=1;
129        start_tag_length += lgth;
130        nesting_depth += 1;
131        if (nesting_depth > max_nesting_depth) max_nesting_depth = nesting_depth;
132//      cout << string((char *) item, lgth) << endl;
133}
134
135/* Action routine for an end tag enclosed in "</" and ">" brackets. */
136template <CodeUnit_Base C>
137inline void ParsingEngine<C>::EndTag_action(unsigned char * item, int lgth) {
138        end_tag_count +=1;
139        end_tag_length += lgth;
140        nesting_depth -= 1;
141}
142
143/* Action routine for an error item */
144template <CodeUnit_Base C>
145inline void ParsingEngine<C>::Error_action(unsigned char * item, int lgth) {
146        error_item_count +=1;
147        error_item_length += lgth;
148        printf("Error: illegal markup at positions %i of length %i.\n", AbsPos()-lgth, lgth);
149        cout << string((char *) item, lgth) << endl;
150}
151
152/* Action routine for a text item */
153template <CodeUnit_Base C>
154inline void ParsingEngine<C>::Text_action(unsigned char * item, int lgth) {
155        text_item_count +=1;
156        text_item_length += lgth;
157}
158
159template <CodeUnit_Base C>
160inline void ParsingEngine<C>::Reference_action(unsigned char * item, int lgth) {
161        reference_count +=1;
162        reference_length += lgth;
163}
164
165
166
167
/* Three action routines for markup components are defined as follows.

   ElementName_action is the action routine called upon recognition of
   an element name immediately after the opening angle bracket of a start
   tag or empty element tag.  (Open question: should this be deferred until
   after attribute processing so that namespaces are set?)
   It is called with two parameters: a pointer to the expected XML name
   and the length of that name.
   Similarly, PI_Target_action is the action routine called upon recognition
   of the XML name that occurs immediately after the opening "<?"
   delimiter of a processing instruction.

   The third action routine for markup components is AttributeValue_action,
   which takes four parameters rather than two: a pointer and length for
   the attribute name, and a pointer and length for the attribute value.
   (Open question: possibly pass just the quote mark items, relying on the
   end of the last component processed to mark the space before the
   attribute name; this requires ElementName_action.)
*/
184
185/* Semantic action routines for markup components. */
186/* Action routine for an element name occurring immediately after the
187   opening "<" of a start tag or empty element tag. */
188template <CodeUnit_Base C>
189inline void ParsingEngine<C>::ElementName_action(unsigned char * item, int lgth) {
190}
191
192/* Action routine for a processing instruction target name occurring immediately
193   after the opening "<?" of a processing instruction. */
194template <CodeUnit_Base C>
195inline void ParsingEngine<C>::PI_Target_action(unsigned char * item, int lgth) {
196}
197
198/* Action routine for an individual attribute/value pair occurring in
199   a element start tag or an empty element tag. */
200template <CodeUnit_Base C>
201inline void ParsingEngine<C>::AttributeValue_action(unsigned char * name, int name_lgth, 
202                                 unsigned char * val, int val_lgth) {
203        total_attribute_count+=1;
204        total_att_name_length += name_lgth;
205        total_att_value_length += val_lgth;
206}
207
208/* Action routine for an individual attribute/value pair occurring in
209   a element start tag or an empty element tag. */
210template <CodeUnit_Base C>
211inline void ParsingEngine<C>::Namespace_action(unsigned char * name, int name_lgth,
212                             unsigned char * URI, int URI_lgth) {
213        namespace_count+=1;
214        total_namespace_name_length += name_lgth;
215        total_namespace_URI_length += URI_lgth;
216}
217
218
219template <CodeUnit_Base C>
220inline void ParsingEngine<C>::FinalizeBuffer_action(int& preserve_pos) {
221#ifdef DEBUG
222        printf ("FinalizeBuffer; last 16 bytes + lookahead 16 =\n");
223        cout << string((char *) GetCodeUnitPtr(AbsPos()-16), 16) << "::" << string((char *) GetCodeUnitPtr(AbsPos()), 16) << endl;
224#endif
225
226        preserve_pos = AbsPos();
227}
228
229
230template <CodeUnit_Base C>
231inline void ParsingEngine<C>::DocumentStart_action() {
232}
233
234template <CodeUnit_Base C>
235inline void ParsingEngine<C>::DocumentEnd_action() {
236}
237
238template <CodeUnit_Base C>
239inline void ParsingEngine<C>::Doctype_action(unsigned char * item, int lgth) {
240#ifdef SHOW_DTD_ACTIONS
241        printf("Document Type:\n");
242        cout << string((char *) item, lgth) <<endl;
243#endif
244}
245
246template <CodeUnit_Base C>
247inline void ParsingEngine<C>::PEReference_action(unsigned char * item, int lgth) {
248}
249
250
251template <CodeUnit_Base C>
252inline void ParsingEngine<C>::ExtSubsetDecl_action(unsigned char * item, int lgth) {
253#ifdef SHOW_DTD_ACTIONS
254        printf("ExtSubsetDecl:\n");
255        cout << string((char *) item, lgth) <<endl;
256#endif
257        printf("Finish parsing ExtSubsetDecl!\n");
258}
259
260template <CodeUnit_Base C>
261inline void ParsingEngine<C>::Prolog_action(unsigned char * item, int lgth) {
262#ifdef SHOW_DTD_ACTIONS
263        printf("Prolog:\n");
264        cout << string((char *) item, lgth) <<endl;
265#endif
266}
267
/* Print one statistics line on standard output.

   stat_string  singular name of the item kind (e.g. "comment").
   count        number of items seen.
   total_lgth   sum of the lengths of those items.

   Output forms:
     count == 0:  "0 <name>s."
     count == 1:  "1 <name> of length <total_lgth>."
     otherwise:   "<count> <name>s of avg. lgth <total_lgth/count>."
   The average uses integer division and is therefore truncated.

   This is a function rather than a multi-statement macro so that a call
   is a single statement (safe inside an unbraced if/else) and so that the
   arguments are evaluated exactly once. */
static inline void print_stats(const char * stat_string, int count, int total_lgth) {
        printf("%i %s", count, stat_string);
        if (count == 0) {
                printf("s.\n");
        }
        else if (count == 1) {
                printf(" of length %i.\n", total_lgth);
        }
        else {
                printf("s of avg. lgth %i.\n", total_lgth / count);
        }
}
273
274int
275main(int argc, char * argv[]) {
276        if (argc != 2) {
277        printf("Usage: %s <filename>\n", argv[0]);
278                exit(-1);
279        }
280        char * filename = argv[1];
281
282        #ifdef PAPI
283                #define NUM_EVENTS 2
284                int Events[NUM_EVENTS] = {PAPI_TOT_CYC, PAPI_BR_MSP};
285                int cal_size = 1000;
286                c = new CC(Events,NUM_EVENTS,cal_size);
287                c->cc_set_cmd(argv[0]);
288                c->cc_set_param("Mhz","2127.997");
289        #endif
290
291        for (int run = 0; run < REPEAT_RUNS; run++) {
292        #ifdef PAPI
293                c->cc_start_interval();
294        #endif
295
296        parser = Parser_Interface::ParserFactory(filename);
297       
298        #ifdef PAPI
299                int elems = 0;
300                c->cc_end_interval(100);
301        #endif
302               
303        /*             
304        if (!parser->has_ByteOrderMark()) printf("No ");
305        printf("Byte Order Mark found.\n");
306
307        if (parser->get_version() == XML_1_0) printf("XML version 1.0 declared.\n");
308        else if (parser->get_version() == XML_1_1) printf("XML version 1.1 declared.\n");
309        else printf ("XML version 1.0 implied by default.\n");
310        if (parser->has_EncodingDecl()) {
311                printf("XML encoding declared:  %s\n", parser->get_Encoding());
312        }
313        if (parser->standalone_status() == Standalone_yes)
314                printf("XML standalone = yes declared.\n");
315        else if (parser->standalone_status() == Standalone_no)
316                printf("XML standalone = no declared.\n");
317        else printf ("XML standalone = no by default.\n");
318        */
319       
320        parser->Parse_Prolog();
321
322        //#define VALIDATION
323        #ifdef VALIDATION
324                parser->Parse_DocumentContent();
325        #endif
326       
327        #ifndef VALIDATION
328                parser->ParseContent();
329        #endif
330
331        parser->~Parser_Interface();
332        printf("Run %i complete.\n", run);
333       
334       
335        }
336
337        #ifdef PAPI
338                c->cc_display();
339                c->cc_write_xml_file();
340                c->cc_write_csv_file();
341                delete c;
342        #endif 
343       
344        print_stats("comment", comment_count, comment_length);
345        print_stats("CDATA section", CDATA_end_count, CDATA_length);
346        print_stats("processing instruction", PI_count, PI_length);
347        print_stats("empty element", empty_elem_count, empty_elem_length);
348        print_stats("start tag", start_tag_count, start_tag_length);
349        printf("%i total attributes\n", attribute_count);
350        print_stats("attribute name", total_attribute_count, total_att_name_length);
351        print_stats("attribute value", total_attribute_count, total_att_value_length);
352        print_stats("namespace name", namespace_count, total_namespace_name_length);
353        print_stats("namespace URI", namespace_count, total_namespace_URI_length);
354        print_stats("end tag", end_tag_count, end_tag_length);
355        print_stats("text item", text_item_count, text_item_length);
356        print_stats("reference", reference_count, reference_length);
357        print_stats("error item", error_item_count, error_item_length);
358        printf("Maximum nesting depth = %i\n", max_nesting_depth);
359       
360        return(0);
361}
Note: See TracBrowser for help on using the repository browser.