source: docs/Balisage13/Bal2013came0601/Bal2013came0601.html @ 3896

Last change on this file since 3896 was 3400, checked in by nmedfort, 6 years ago

minor fixes

File size: 104.9 KB
1<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "">
2<html lang="en">
4<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
5<title>icXML:  Accelerating a Commercial XML
6     Parser Using SIMD and Multicore Technologies</title>
7<link rel="stylesheet" href="balisage-proceedings.css" type="text/css">
8<meta name="keywords" content="">
11<div id="balisage-header"><h1 style="text-align: right; font-family: serif; margin:0.25em">
12<i>Balisage:</i> <small>The Markup Conference</small>
14<html lang="en">
16<title>icXML:  Accelerating a Commercial XML
17     Parser Using SIMD and Multicore Technologies</title>
18<link rel="stylesheet" href="balisage-proceedings.css" type="text/css">
19<meta name="generator" content="Balisage Conference Proceedings XSLT (v1.2)">
20<script type="text/javascript">
21    function toggle(folderID) {
22      folder = document.getElementById("folder-"+folderID);
23      icon = document.getElementById("icon-"+folderID)
24      // need to:
25      //   switch between 'none' and 'block'
26      //   switch between collapse and expand icons
27      if ( != "block") {
28 = "block";
29        icon.src = "minus.png" ;
30        icon.alt = "collapse" ;
31      }
32      else {
33 = "none";
34        icon.src = "plus.png" ;
35        icon.alt = "expand" ;
36      };
37      return;
38    }
40   function hidecite(citeID) {
41     cite = document.getElementById(citeID);
42 = "none";
43     return;
44   }
46   function showcite(citeID,anchorID) {
47     cite = document.getElementById(citeID);
49     citeLeft =;
50     citeTop =;
52     if (citeLeft != (getLeft(anchorID)+"px") ||
53         citeTop != (getTop(anchorID)+"px")) {
54 = "none";
55     }
57     if ( != "table-cell") {
58        movebox(citeID, anchorID);
59 = "table-cell";
60     }
61     else {
62 = "none";
63     };
64     return;
65   }
67   function movebox(citeID, anchorID) {
69     cite = document.getElementById(citeID);
71     // alert(cite.offsetWidth + " by " + cite.offsetHeight)
73     horizontalOffset = getLeft(anchorID);
74     // horizontalOffset = (inMain(anchorID)) ?
75     // (horizontalOffset - 260) : (horizontalOffset + 20)
76     // (horizontalOffset - (20 + cite.offsetWidth)) : (horizontalOffset + 20)
78     verticalOffset = getTop(anchorID);
79     // verticalOffset = (inMain(anchorID)) ?
80     // (verticalOffset - 20) : (verticalOffset + 20)
81     // (verticalOffset - (20 + cite.offsetHeight)) : (verticalOffset + 20)
83     /*
84     horizontalOffset = getAbsoluteLeft(anchorID) - getScrollLeft(anchorID) + 20;
85     if (inMain(anchorID)) {
86       horizontalOffset = horizontalOffset - 300;
87     }
88     verticalOffset = getAbsoluteTop(anchorID) - getScrollTop(anchorID) - 40;
89     if (inMain(anchorID)) {
90       verticalOffset = verticalOffset - 300;
91     }
92     */
94 = horizontalOffset + "px";
95 = verticalOffset + "px";
96   }
98   function getLeft(objectID) {
99     var left = getAbsoluteLeft(objectID) - getScrollLeft(objectID);
100     left = (inMain(objectID)) ? (left - 260) : (left + 20)   
101     return left;
102   }
104   function getTop(objectID) {
105     var top = getAbsoluteTop(objectID) - getScrollTop(objectID);
106     top = (inMain(objectID)) ? (top - 50) : (top + 20)
107     return top;     
108   }
110   function getAbsoluteLeft(objectId) {
111   // Get an object left position from the upper left viewport corner
112     o = document.getElementById(objectId)
113     oLeft = o.offsetLeft            // Get left position from the parent object
114     while(o.offsetParent!=null) {   // Parse the parent hierarchy up to the document element
115       oParent = o.offsetParent    // Get parent object reference
116       oLeft += oParent.offsetLeft // Add parent left position
117       o = oParent
118      }
119    return oLeft
120    }
122    function getAbsoluteTop(objectId) {
123    // Get an object top position from the upper left viewport corner
124      o = document.getElementById(objectId)
125      oTop = o.offsetTop            // Get top position from the parent object
126      while(o.offsetParent!=null) { // Parse the parent hierarchy up to the document element
127        oParent = o.offsetParent  // Get parent object reference
128        oTop += oParent.offsetTop // Add parent top position
129        o = oParent
130      }
131    return oTop
132    }
134   function getScrollLeft(objectId) {
135     // Get a left scroll position
136     o = document.getElementById(objectId)
137     oLeft = o.scrollLeft            // Get left position from the parent object
138     while(o.offsetParent!=null) {   // Parse the parent hierarchy up to the document element
139       oParent = o.offsetParent    // Get parent object reference
140       oLeft += oParent.scrollLeft // Add parent left position
141       o = oParent
142      }
143    return oLeft
144    }
146    function getScrollTop(objectId) {
147    // Get a right scroll position
148      o = document.getElementById(objectId)
149      oTop = o.scrollTop            // Get top position from the parent object
150      while(o.offsetParent!=null) { // Parse the parent hierarchy up to the document element
151        oParent = o.offsetParent  // Get parent object reference
152        oTop += oParent.scrollTop // Add parent top position
153        o = oParent
154      }
155    return oTop
156    }
158    function inMain(objectId) {
159    // returns true if in div#main
160      o = document.getElementById(objectId)
161      while(o.parentNode != null) { // Parse the parent hierarchy up to div#main
162        oParent = o.parentNode
163        if ( == "main") { return true; }
164        o = oParent;
165      }
166    return false;
167    }
170   /*
171   function showcite(citeID) {
172      cite = document.getElementById(citeID);
173      if ( != "table-cell") {
174 = "table-cell";
175      }
176      else {
177 = "none";
178      };
179      return;
180    }
181    */
183      </script>
186<div class="inline-citation" id="cite-CameronHerdyLin2008" style="display:none;width: 240px">
187<a class="quiet" href="javascript:hidecite('cite-CameronHerdyLin2008')" style="font-size:90%"><img src="eks.png" alt="[x]" style="float:right;clear:both;margin:1px"></a><p style="margin:0ex">Cameron, Robert D., Herdy, Kenneth S. and Lin, Dan. High performance XML parsing using parallel bit stream technology. CASCON'08: Proc. 2008 conference of the center for advanced studies on collaborative research. Richmond Hill, Ontario, Canada. 2008.</p>
189<div class="inline-citation" id="cite-papi" style="display:none;width: 240px">
190<a class="quiet" href="javascript:hidecite('cite-papi')" style="font-size:90%"><img src="eks.png" alt="[x]" style="float:right;clear:both;margin:1px"></a><p style="margin:0ex">Innovative Computing Laboratory, University of Texas. Performance Application Programming Interface.<a href="" class="link" target="_new"></a></p>
192<div class="inline-citation" id="cite-perf" style="display:none;width: 240px">
193<a class="quiet" href="javascript:hidecite('cite-perf')" style="font-size:90%"><img src="eks.png" alt="[x]" style="float:right;clear:both;margin:1px"></a><p style="margin:0ex">Eranian, Stephane, Gouriou, Eric, Moseley, Tipp and Bruijn, Willem de. Linux kernel profiling with perf. <a href="" class="link" target="_new"></a></p>
195<div class="inline-citation" id="cite-Cameron2008" style="display:none;width: 240px">
196<a class="quiet" href="javascript:hidecite('cite-Cameron2008')" style="font-size:90%"><img src="eks.png" alt="[x]" style="float:right;clear:both;margin:1px"></a><p style="margin:0ex">Cameron, Robert D.. A case study in SIMD text processing with parallel bit streams: UTF-8 to UTF-16 transcoding. Proc. 13th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming. Salt Lake City, USA. 2008.</p>
198<div class="inline-citation" id="cite-ParaDOM2009" style="display:none;width: 240px">
199<a class="quiet" href="javascript:hidecite('cite-ParaDOM2009')" style="font-size:90%"><img src="eks.png" alt="[x]" style="float:right;clear:both;margin:1px"></a><p style="margin:0ex">Shah, Bhavik, Rao, Praveen, Moon, Bongki and Rajagopalan, Mohan. A Data Parallel Algorithm for XML DOM Parsing. Database and XML Technologies. 2009.</p>
201<div class="inline-citation" id="cite-XMLSSE42" style="display:none;width: 240px">
202<a class="quiet" href="javascript:hidecite('cite-XMLSSE42')" style="font-size:90%"><img src="eks.png" alt="[x]" style="float:right;clear:both;margin:1px"></a><p style="margin:0ex">Lei, Zhai. XML Parsing Accelerator with Intel Streaming SIMD Extensions 4 (Intel SSE4). <a href="Intel%20Software%20Network" class="link" target="_new">Intel Software Network</a>.  2008.</p>
204<div class="inline-citation" id="cite-Cameron2009" style="display:none;width: 240px">
205<a class="quiet" href="javascript:hidecite('cite-Cameron2009')" style="font-size:90%"><img src="eks.png" alt="[x]" style="float:right;clear:both;margin:1px"></a><p style="margin:0ex">Cameron, Rob, Herdy, Ken and Amiri, Ehsan Amiri. Parallel Bit Stream Technology as a Foundation for XML Parsing Performance. Int'l Symposium on Processing XML Efficiently: Overcoming Limits on Space, Time, or Bandwidth. Montreal, Quebec, Canada.  2009.</p>
207<div class="inline-citation" id="cite-HilewitzLee2006" style="display:none;width: 240px">
208<a class="quiet" href="javascript:hidecite('cite-HilewitzLee2006')" style="font-size:90%"><img src="eks.png" alt="[x]" style="float:right;clear:both;margin:1px"></a><p style="margin:0ex">Hilewitz, Yedidya and Lee, Ruby B.. Fast Bit Compression and Expansion with Parallel Extract and Parallel Deposit Instructions. ASAP '06: Proc. IEEE 17th Int'l Conference on Application-specific Systems, Architectures and Processors. Steamboat Springs, Colorado, USA.  2006.</p>
210<div class="inline-citation" id="cite-Asanovic-EECS-2006-183" style="display:none;width: 240px">
211<a class="quiet" href="javascript:hidecite('cite-Asanovic-EECS-2006-183')" style="font-size:90%"><img src="eks.png" alt="[x]" style="float:right;clear:both;margin:1px"></a><p style="margin:0ex">Asanovic, Krste and others. The Landscape of Parallel Computing Research: A View from Berkeley. EECS Department, University of California, Berkeley.  2006.</p>
213<div class="inline-citation" id="cite-GRID2006" style="display:none;width: 240px">
214<a class="quiet" href="javascript:hidecite('cite-GRID2006')" style="font-size:90%"><img src="eks.png" alt="[x]" style="float:right;clear:both;margin:1px"></a><p style="margin:0ex">Lu, Wei, Chiu, Kenneth and Pan, Yinfei. A Parallel Approach to XML Parsing. Proceedings of the 7th IEEE/ACM International Conference on Grid Computing. Barcelona, Spain.  2006.</p>
216<div class="inline-citation" id="cite-cameron-EuroPar2011" style="display:none;width: 240px">
217<a class="quiet" href="javascript:hidecite('cite-cameron-EuroPar2011')" style="font-size:90%"><img src="eks.png" alt="[x]" style="float:right;clear:both;margin:1px"></a><p style="margin:0ex">Cameron, Robert D., Amiri, Ehsan, Herdy, Kenneth S., Lin, Dan, Shermer, Thomas C. and Popowich, Fred P.. Parallel Scanning with Bitstream Addition: An XML Case Study. Euro-Par 2011, LNCS 6853, Part II.  Bordeaux, Frane. 2011.</p>
219<div class="inline-citation" id="cite-HPCA2012" style="display:none;width: 240px">
220<a class="quiet" href="javascript:hidecite('cite-HPCA2012')" style="font-size:90%"><img src="eks.png" alt="[x]" style="float:right;clear:both;margin:1px"></a><p style="margin:0ex">Lin, Dan, Medforth, Nigel, Herdy, Kenneth S., Shriraman, Arrvindh and Cameron, Rob. Parabix: Boosting the efficiency of text processing on commodity processors. International Symposium on High-Performance Computer Architecture. New Orleans, LA. 2012.</p>
222<div class="inline-citation" id="cite-HPCC2011" style="display:none;width: 240px">
223<a class="quiet" href="javascript:hidecite('cite-HPCC2011')" style="font-size:90%"><img src="eks.png" alt="[x]" style="float:right;clear:both;margin:1px"></a><p style="margin:0ex">You, Cheng-Han and Wang, Sheng-De. A Data Parallel Approach to XML Parsing and Query. 10th IEEE International Conference on High Performance Computing and Communications. Banff, Alberta, Canada. 2011.</p>
225<div class="inline-citation" id="cite-E-SCIENCE2007" style="display:none;width: 240px">
226<a class="quiet" href="javascript:hidecite('cite-E-SCIENCE2007')" style="font-size:90%"><img src="eks.png" alt="[x]" style="float:right;clear:both;margin:1px"></a><p style="margin:0ex">Pan, Yinfei, Zhang, Ying, Chiu, Kenneth and Lu, Wei. Parallel XML Parsing Using Meta-DFAs. International Conference on e-Science and Grid Computing.   Bangalore, India.  2007.</p>
228<div class="inline-citation" id="cite-ICWS2008" style="display:none;width: 240px">
229<a class="quiet" href="javascript:hidecite('cite-ICWS2008')" style="font-size:90%"><img src="eks.png" alt="[x]" style="float:right;clear:both;margin:1px"></a><p style="margin:0ex">Pan, Yinfei, Zhang, Ying and Chiu, Kenneth. Hybrid Parallelism for XML SAX Parsing. IEEE International Conference on Web Services. Beijing, China.  2008.</p>
231<div class="inline-citation" id="cite-IPDPS2008" style="display:none;width: 240px">
232<a class="quiet" href="javascript:hidecite('cite-IPDPS2008')" style="font-size:90%"><img src="eks.png" alt="[x]" style="float:right;clear:both;margin:1px"></a><p style="margin:0ex">Pan, Yinfei, Zhang, Ying and Chiu, Kenneth. Simultaneous transducers for data-parallel XML parsing. International Parallel and Distributed Processing Symposium. Miami, Florida, USA.  2008.</p>
234<div class="inline-citation" id="cite-HackersDelight" style="display:none;width: 240px">
235<a class="quiet" href="javascript:hidecite('cite-HackersDelight')" style="font-size:90%"><img src="eks.png" alt="[x]" style="float:right;clear:both;margin:1px"></a><p style="margin:0ex">Warren, Henry S.. Hacker's Delight. Addison-Wesley Professional. 2003.</p>
237<div class="inline-citation" id="cite-lu2007advances" style="display:none;width: 240px">
238<a class="quiet" href="javascript:hidecite('cite-lu2007advances')" style="font-size:90%"><img src="eks.png" alt="[x]" style="float:right;clear:both;margin:1px"></a><p style="margin:0ex">Lu, C.T., Dos Santos, R.F., Sripada, L.N. and Kou, Y.. Advances in GML for geospatial applications. Geoinformatica 11:131-157.  2007.</p>
240<div class="inline-citation" id="cite-lake2004geography" style="display:none;width: 240px">
241<a class="quiet" href="javascript:hidecite('cite-lake2004geography')" style="font-size:90%"><img src="eks.png" alt="[x]" style="float:right;clear:both;margin:1px"></a><p style="margin:0ex">Lake, R., Burggraf, D.S., Trninic, M. and Rae, L.. Geography mark-up language (GML) [foundation for the geo-web]. Wiley.  Chichester.  2004.</p>
243<div id="mast"><div class="content">
244<h2 class="article-title" id="idp72224">icXML:  Accelerating a Commercial XML
245     Parser Using SIMD and Multicore Technologies</h2>
246<div class="author">
247<h3 class="author">Nigel Medforth</h3>
248<div class="affiliation">
249<p class="jobtitle">Developer</p>
250<p class="orgname">International Characters Inc.</p>
252<div class="affiliation">
253<p class="jobtitle">Graduate Student</p>
254<p class="orgname">School of Computing Science, Simon Fraser University </p>
256<h5 class="author-email"><code class="email">&lt;<a class="email" href=""></a>&gt;</code></h5>
258<div class="author">
259<h3 class="author">Dan Lin</h3>
260<div class="affiliation">
261<p class="jobtitle">Graduate Student</p>
262<p class="orgname">School of Computing Science, Simon Fraser University </p>
264<h5 class="author-email"><code class="email">&lt;<a class="email" href=""></a>&gt;</code></h5>
266<div class="author">
267<h3 class="author">Kenneth Herdy</h3>
268<div class="affiliation">
269<p class="jobtitle">Graduate Student</p>
270<p class="orgname">School of Computing Science, Simon Fraser University </p>
272<h5 class="author-email"><code class="email">&lt;<a class="email" href=""></a>&gt;</code></h5>
274<div class="author">
275<h3 class="author">Rob Cameron</h3>
276<div class="affiliation">
277<p class="jobtitle">Professor of Computing Science</p>
278<p class="orgname">Simon Fraser University</p>
280<div class="affiliation">
281<p class="jobtitle">Chief Technology Officer</p>
282<p class="orgname">International Characters, Inc.</p>
284<h5 class="author-email"><code class="email">&lt;<a class="email" href=""></a>&gt;</code></h5>
286<div class="author">
287<h3 class="author">Arrvindh Shriraman</h3>
288<div class="affiliation">
289<p class="jobtitle">Assistant Professor</p>
290<p class="orgname">School of Computing Science, Simon Fraser University</p>
292<h5 class="author-email"><code class="email">&lt;<a class="email" href=""></a>&gt;</code></h5>
294<div class="legalnotice-block"><p id="idp281008">Copyright © 2013 Nigel Medforth, Dan Lin, Kenneth S. Herdy, Robert D. Cameron  and Arrvindh Shriraman.
295            This work is licensed under a Creative Commons Attribution-Noncommercial-No Derivative
296            Works 2.5 Canada License.</p></div>
297<div class="mast-box">
298<p class="title"><a href="javascript:toggle('idp72960')" class="quiet"><img class="toc-icon" src="plus.png" alt="expand" id="icon-idp72960"></a> <span onclick="javascript:toggle('idp72960');return true">Abstract</span></p>
299<div class="folder" id="folder-idp72960" style="display:none"><p id="idp73264">Prior research on the acceleration of XML processing using single-instruction
300           multiple-data (SIMD) and multi-core
301            parallelism has lead to a number of interesting research prototypes. This work is
302            the first to investigate to the extent to which the techniques underlying these prototypes
303            could result
304            in systematic performance benefits when fully integrated into a commercial XML parser
305            The widely used Xerces-C++ parser of the Apache Software Foundation was chosen as the
306            foundation for the study. A systematic restructuring of the parser was undertaken, while
307            maintaining the existing API for application programmers. Using SIMD techniques alone,
308            an increase in parsing speed of at least 50% was observed in a range of applications.
309            When coupled with pipeline parallelism on dual core processors, improvements of 2x and
310            beyond were realized.
312            icXML is intended as an important industrial contribution in its own right as well
313            as an important case study for the underlying Parabix parallel processing framework.
314            Based on the success of the icXML development, there is a strong case for continued
315            development of that framework as well as for the application of that framework
316            to other important XML technology stacks.   An important area for further work is
317            the extension of Parabix technology to accelerate Java-based implementations as
318            well as ones based on C/C++.
320            </p></div>
322<div class="toc">
323<p><b>Table of Contents</b></p>
325<dt><span class="section"><a href="#idp283056" class="toc">Introduction</a></span></dt>
326<dt><span class="section"><a href="#background" class="toc">Background</a></span></dt>
328<dt><span class="section"><a href="#background-xerces" class="toc">Xerces C++ Structure</a></span></dt>
329<dt><span class="section"><a href="#idp358672" class="toc">The Parabix Framework</a></span></dt>
330<dt><span class="section"><a href="#idp453856" class="toc">Sequential vs. Parallel Paradigm</a></span></dt>
332<dt><span class="section"><a href="#architecture" class="toc">Architecture</a></span></dt>
334<dt><span class="section"><a href="#idp460944" class="toc">Overview</a></span></dt>
335<dt><span class="section"><a href="#character-set-adapter" class="toc">Character Set Adapters</a></span></dt>
336<dt><span class="section"><a href="#par-filter" class="toc">Combined Parallel Filtering</a></span></dt>
337<dt><span class="section"><a href="#contentstream" class="toc">Content Stream</a></span></dt>
338<dt><span class="section"><a href="#namespace-handling" class="toc">Namespace Handling</a></span></dt>
339<dt><span class="section"><a href="#errorhandling" class="toc">Error Handling</a></span></dt>
341<dt><span class="section"><a href="#multithread" class="toc">Multithreading with Pipeline Parallelism</a></span></dt>
342<dt><span class="section"><a href="#performance" class="toc">Performance</a></span></dt>
344<dt><span class="section"><a href="#idp667552" class="toc">Xerces C++ SAXCount</a></span></dt>
345<dt><span class="section"><a href="#idp693968" class="toc">GML2SVG</a></span></dt>
347<dt><span class="section"><a href="#conclusion" class="toc">Conclusion and Future Work</a></span></dt>
350<div class="mast-box">
351<p class="title"><a href="javascript:toggle('idp75328')" class="linkbox"><img class="toc-icon" src="plus.png" alt="expand" id="icon-idp75328"></a> <span onclick="javascript:toggle('idp75328');return true">Nigel Medforth</span></p>
352<div class="folder" id="folder-idp75328" style="display:none">
353<h5 class="author-email"><code class="email">&lt;<a class="email" href=""></a>&gt;</code></h5>
354<div class="affiliation">
355<p class="jobtitle">Developer</p>
356<p class="orgname">International Characters Inc.</p>
358<div class="affiliation">
359<p class="jobtitle">Graduate Student</p>
360<p class="orgname">School of Computing Science, Simon Fraser University </p>
362<div class="personblurb">
363<p id="idp56944">Nigel Medforth is a M.Sc. student at Simon Fraser University and the lead
364               developer of icXML. He earned a Bachelor of Technology in Information Technology at
365               Kwantlen Polytechnic University in 2009 and was awarded the Dean’s Medal for
366               Outstanding Achievement.</p>
367<p id="idp57952">Nigel is currently researching ways to leverage both the Parabix framework and
368               stream-processing models to further accelerate XML parsing within icXML.</p>
372<div class="mast-box">
373<p class="title"><a href="javascript:toggle('idp61600')" class="linkbox"><img class="toc-icon" src="plus.png" alt="expand" id="icon-idp61600"></a> <span onclick="javascript:toggle('idp61600');return true">Dan Lin</span></p>
374<div class="folder" id="folder-idp61600" style="display:none">
375<h5 class="author-email"><code class="email">&lt;<a class="email" href=""></a>&gt;</code></h5>
376<div class="affiliation">
377<p class="jobtitle">Graduate Student</p>
378<p class="orgname">School of Computing Science, Simon Fraser University </p>
380<div class="personblurb"><p id="idp63312">Dan Lin is a Ph.D student at Simon Fraser University. She earned a Master of Science
381             in Computing Science at Simon Fraser University in 2010. Her research focus on on high
382             performance algorithms that exploit parallelization strategies on various multicore platforms.
383           </p></div>
386<div class="mast-box">
387<p class="title"><a href="javascript:toggle('idp65856')" class="linkbox"><img class="toc-icon" src="plus.png" alt="expand" id="icon-idp65856"></a> <span onclick="javascript:toggle('idp65856');return true">Kenneth Herdy</span></p>
388<div class="folder" id="folder-idp65856" style="display:none">
389<h5 class="author-email"><code class="email">&lt;<a class="email" href=""></a>&gt;</code></h5>
390<div class="affiliation">
391<p class="jobtitle">Graduate Student</p>
392<p class="orgname">School of Computing Science, Simon Fraser University </p>
394<div class="personblurb">
395<p id="idp268064"> Ken Herdy completed an Advanced Diploma of Technology in Geographical Information
396               Systems at the British Columbia Institute of Technology in 2003 and earned a Bachelor
397               of Science in Computing Science with a Certificate in Spatial Information Systems at
398               Simon Fraser University in 2005. </p>
399<p id="idp268800"> Ken is currently pursuing PhD studies in Computing Science at Simon Fraser
400               University with industrial scholarship support from the Natural Sciences and
401               Engineering Research Council of Canada, the Mathematics of Information Technology and
402               Complex Systems NCE, and the BC Innovation Council. His research focus is an analysis
403               of the principal techniques that may be used to improve XML processing performance in
404               the context of the Geography Markup Language (GML). </p>
408<div class="mast-box">
409<p class="title"><a href="javascript:toggle('idp271536')" class="linkbox"><img class="toc-icon" src="plus.png" alt="expand" id="icon-idp271536"></a> <span onclick="javascript:toggle('idp271536');return true">Rob Cameron</span></p>
410<div class="folder" id="folder-idp271536" style="display:none">
411<h5 class="author-email"><code class="email">&lt;<a class="email" href=""></a>&gt;</code></h5>
412<div class="affiliation">
413<p class="jobtitle">Professor of Computing Science</p>
414<p class="orgname">Simon Fraser University</p>
416<div class="affiliation">
417<p class="jobtitle">Chief Technology Officer</p>
418<p class="orgname">International Characters, Inc.</p>
420<div class="personblurb"><p id="idp273200">Dr. Rob Cameron is Professor of Computing Science and Associate Dean of Applied
421               Sciences at Simon Fraser University. His research interests include programming
422               language and software system technology, with a specific focus on high performance
423               text processing using SIMD and multicore parallelism. He is the developer of the REX
424               XML shallow parser as well as the parallel bit stream (Parabix) framework for SIMD
425               text processing. </p></div>
429<div id="navbar"></div>
430<div id="balisage-header" style="background-color: #6699CC">
431<a class="quiet" href=""><img style="float:right;border:none" alt="Balisage logo" height="130" src=""></a><h2 class="page-header">Balisage: The Markup Conference</h2>
432<h1 class="page-header">Proceedings preview</h1>
434<div id="main">
435<div class="article">
436<h2 class="article-title" id="idp72224">icXML:  Accelerating a Commercial XML
437     Parser Using SIMD and Multicore Technologies</h2>
438<div class="section" id="idp283056">
439<h2 class="title" style="clear: both">Introduction</h2>
440<p id="idp283696">   
441        Parallelization and acceleration of XML parsing is a widely
442        studied problem that has seen the development of a number
443        of interesting research prototypes using both single-instruction
444           multiple-data (SIMD) and
445        multi-core parallelism.   Most works have investigated
446        data parallel solutions on multicore
447        architectures using various strategies to break input
448        documents into segments that can be allocated to different cores.
449        For example, one possibility for data
450        parallelization is to add a pre-parsing step to compute
451        a skeleton tree structure of an  XML document <a class="xref" id="idp284544" href="javascript:showcite('cite-GRID2006','idp284544')">Lu and Chiu 2006</a>.
452        The parallelization of the pre-parsing stage itself can be tackled with
453          state machines <a class="xref" id="idp297680" href="javascript:showcite('cite-E-SCIENCE2007','idp297680')">Pan and Zhang 2007</a>, <a class="xref" id="idp298432" href="javascript:showcite('cite-IPDPS2008','idp298432')">Pan and Zhang 2008b</a>.
454        Methods without pre-parsing have used speculation <a class="xref" id="idp299248" href="javascript:showcite('cite-HPCC2011','idp299248')">You and Wang 2011</a> or post-processing that
455        combines the partial results <a class="xref" id="idp300080" href="javascript:showcite('cite-ParaDOM2009','idp300080')">Shah and Rao 2009</a>.
456        A hybrid technique that combines data and pipeline parallelism was proposed to
457        hide the latency of a "job" that has to be done sequentially <a class="xref" id="idp300944" href="javascript:showcite('cite-ICWS2008','idp300944')">Pan and Zhang 2008a</a>.
458      </p>
459<p id="idp301824">
460        Fewer efforts have investigated SIMD parallelism, although this approach
461        has the potential advantage of improving single core performance as well
462        as offering savings in energy consumption <a class="xref" id="idp302288" href="javascript:showcite('cite-HPCA2012','idp302288')">Lin and Medforth 2012</a>.
463        Intel introduced specialized SIMD string processing instructions in the SSE 4.2 instruction set extension
464        and showed how they can be used to improve the performance of XML parsing <a class="xref" id="idp303232" href="javascript:showcite('cite-XMLSSE42','idp303232')">Lei 2008</a>.
465        The Parabix framework uses generic SIMD extensions and bit parallel methods to
466        process hundreds of XML input characters simultaneously <a class="xref" id="idp304144" href="javascript:showcite('cite-Cameron2009','idp304144')">Balisage 2009</a> <a class="xref" id="idp304896" href="javascript:showcite('cite-cameron-EuroPar2011','idp304896')">Parabix2 2011</a>.
467        Parabix prototypes have also combined SIMD methods with thread-level parallelism to
468        achieve further acceleration on multicore systems <a class="xref" id="idp305808" href="javascript:showcite('cite-HPCA2012','idp305808')">Lin and Medforth 2012</a>.
469      </p>
470<p id="idp306576">
471        In this paper, we move beyond research prototypes to consider
472        the detailed integration of both SIMD and multicore parallelism into the
473        Xerces-C++ parser of the Apache Software Foundation, an existing
474        standards-compliant open-source parser that is widely used
475        in commercial practice.    The challenge of this work is
476        to parallelize the Xerces parser in such a way as to
477        preserve the existing APIs as well as offering worthwhile
478        end-to-end acceleration of XML processing.   
479        To achieve the best results possible, we undertook
480        a nine-month comprehensive restructuring of the Xerces-C++ parser,
481        seeking to expose as many critical aspects of XML parsing
482        as possible for parallelization, the result of which we named icXML.   
483        Overall, we employed Parabix-style methods of transcoding, tokenization
484        and tag parsing, parallel string comparison methods in symbol
485        resolution, bit parallel methods in namespace processing,
486        as well as staged processing using pipeline parallelism to take advantage of
487        multiple cores.
488      </p>
489<p id="idp308016">
490        The remainder of this paper is organized as follows.   
491          <a class="xref" href="#background" title="Background">Section “Background”</a> discusses the structure of the Xerces and Parabix XML parsers and the fundamental
492        differences between the two parsing models.   
493        <a class="xref" href="#architecture" title="Architecture">Section “Architecture”</a> then presents the icXML design based on a restructured Xerces architecture to
494        incorporate SIMD parallelism using Parabix methods.   
495        <a class="xref" href="#multithread" title="Multithreading with Pipeline Parallelism">Section “Multithreading with Pipeline Parallelism”</a> moves on to consider the multithreading of the icXML architecture
496        using the pipeline parallelism model. 
497        <a class="xref" href="#performance" title="Performance">Section “Performance”</a> analyzes the performance of both the single-threaded and
498        multi-threaded versions of icXML in comparison to original Xerces,
499        demonstrating substantial end-to-end acceleration of
500        a GML-to-SVG translation application written against the Xerces API.
501          <a class="xref" href="#conclusion" title="Conclusion and Future Work">Section “Conclusion and Future Work”</a> concludes the paper with a discussion of future work and the potential for
502        applying the techniques discussed herein in other application domains.
503      </p>
505<div class="section" id="background">
506<h2 class="title" style="clear: both">Background</h2>
507<div class="section" id="background-xerces">
508<h3 class="title" style="clear: both">Xerces C++ Structure</h3>
509<p id="idp315600"> The Xerces C++ parser is a widely-used standards-conformant
510            XML parser produced as open-source software
511             by the Apache Software Foundation.
512            It features comprehensive support for a variety of character encodings both
513            commonplace (e.g., UTF-8, UTF-16) and rarely used (e.g., EBCDIC), support for multiple
514            XML vocabularies through the XML namespace mechanism, as well as complete
515            implementations of structure and data validation through multiple grammars declared
516            using either legacy DTDs (document type definitions) or modern XML Schema facilities.
517            Xerces also supports several APIs for accessing parser services, including event-based
518            parsing using either pull parsing or SAX/SAX2 push-style parsing as well as a DOM
519            tree-based parsing interface. </p>
520<p id="idp317728">
521            Xerces,
522            like all traditional parsers, processes XML documents sequentially a byte-at-a-time from
523            the first to the last byte of input data. Each byte passes through several processing
524            layers and is classified and eventually validated within the context of the document
525            state. This introduces implicit dependencies between the various tasks within the
526            application that make it difficult to optimize for performance. As a complex software
527              system, no one feature dominates the overall parsing performance. <a class="xref" href="#xerces-profile">Table I</a>
528            shows the execution time profile of the top ten functions in a
529            typical run. Even if it were possible, Amdahl's Law dictates that tackling any one of
530            these functions for parallelization in isolation would only produce a minute improvement
531            in performance. Unfortunately, early investigation into these functions found that
532            incorporating speculation-free thread-level parallelization was impossible and they were
533            already performing well in their given tasks; thus only trivial enhancements were
534            attainable. In order to obtain a systematic acceleration of Xerces, it should be
535            expected that a comprehensive restructuring is required, involving all aspects of the
536            parser. </p>
537<div class="table-wrapper" id="xerces-profile">
538<p class="title">Table I</p>
539<div class="caption"><p id="idm849856">Execution Time of Top 10 Xerces Functions</p></div>
540<table class="table" xml:id="xerces-profile">
541<colgroup span="1">
542<col align="left" valign="top" span="1">
543<col align="left" valign="top" span="1">
546<th>Time (%) </th>
547<th> Function Name </th>
550<tr valign="top">
551<td>13.29       </td>
552<td>XMLUTF8Transcoder::transcodeFrom </td>
554<tr valign="top">
555<td>7.45        </td>
556<td>IGXMLScanner::scanCharData </td>
558<tr valign="top">
559<td>6.83        </td>
560<td>memcpy </td>
562<tr valign="top">
563<td>5.83        </td>
564<td>XMLReader::getNCName </td>
566<tr valign="top">
567<td>4.67        </td>
568<td>IGXMLScanner::buildAttList </td>
570<tr valign="top">
571<td>4.54        </td>
572<td>RefHashTableO&lt;&gt;::findBucketElem </td>
574<tr valign="top">
575<td>4.20        </td>
576<td>IGXMLScanner::scanStartTagNS </td>
578<tr valign="top">
579<td>3.75        </td>
580<td>ElemStack::mapPrefixToURI </td>
582<tr valign="top">
583<td>3.58        </td>
584<td>ReaderMgr::getNextChar </td>
586<tr valign="top">
587<td>3.20        </td>
588<td>IGXMLScanner::basicAttrValueScan </td>
594<div class="section" id="idp358672">
595<h3 class="title" style="clear: both">The Parabix Framework</h3>
596<p id="idp359344"> The Parabix (parallel bit stream) framework is a transformative approach to XML
597            parsing (and other forms of text processing.) The key idea is to exploit the
598            availability of wide SIMD registers (e.g., 128-bit) in commodity processors to represent
599            data from long blocks of input data by using one register bit per single input byte. To
600            facilitate this, the input data is first transposed into a set of basis bit streams.
601              For example, <a class="xref" href="#xml-bytes">Table II</a> shows  the ASCII bytes for the string "<code class="code">b7&lt;A</code>" with
602                the corresponding  8 basis bit streams, b<sub>0</sub> through  b<sub>7</sub> shown in  <a class="xref" href="#xml-bits">Table III</a>.
603            The bits used to construct b<sub>7</sub> have been highlighted in this example.
604              Boolean-logic operations (∧, √ and ¬ denote the
605              boolean AND, OR and NOT operators) are used to classify the input bits into a set of
606               <span class="ital">character-class bit streams</span>, which identify key
607            characters (or groups of characters) with a <code class="code">1</code>. For example, one of the
608            fundamental characters in XML is a left-angle bracket. A character is an
609               <code class="code">'&lt;' if and only if
610               Â¬(b<sub>0</sub> âˆš b<sub>1</sub>)
611               âˆ§ (b<sub>2</sub> âˆ§ b<sub>3</sub>)
612               âˆ§ (b<sub>4</sub> âˆ§ b<sub>5</sub>)
613               âˆ§ ¬ (b<sub>6</sub> âˆš
614               b<sub>7</sub>) = 1</code>. Similarly, a character is numeric, <code class="code">[0-9]
615               if and only if ¬(b<sub>0</sub> âˆš
616               b<sub>1</sub>) ∧ (b<sub>2</sub> âˆ§
617                  b<sub>3</sub>) ∧ ¬(b<sub>4</sub>
618               âˆ§ (b<sub>5</sub> âˆš
619            b<sub>6</sub>))</code>. An important observation here is that ranges of
620            characters may require fewer operations than individual characters and
621             multiple
622            classes can share the classification cost. </p>
623<div class="table-wrapper" id="xml-bytes">
624<p class="title">Table II</p>
625<div class="caption"><p id="idp373984">XML Source Data</p></div>
626<table class="table" xml:id="xml-bytes">
627<colgroup span="1">
628<col align="right" valign="top" span="1">
629<col align="centre" valign="top" span="1">
630<col align="centre" valign="top" span="1">
631<col align="centre" valign="top" span="1">
632<col align="centre" valign="top" span="1">
636<td>String </td>
637<td> <code class="code">b</code> </td>
638<td> <code class="code">7</code> </td>
639<td> <code class="code">&lt;</code> </td>
640<td> <code class="code">A</code> </td>
643<td>ASCII </td>
644<td> <code class="code">0110001<span class="bold">0</span></code> </td>
645<td> <code class="code">0011011<span class="bold">1</span></code> </td>
646<td> <code class="code">0011110<span class="bold">0</span></code> </td>
647<td> <code class="code">0100000<span class="bold">1</span></code> </td>
652<div class="table-wrapper" id="xml-bits">
653<p class="title">Table III</p>
654<div class="caption"><p id="idp390256">8-bit ASCII Basis Bit Streams</p></div>
655<table class="table" xml:id="xml-bits">
656<colgroup span="1">
657<col align="centre" valign="top" span="1">
658<col align="centre" valign="top" span="1">
659<col align="centre" valign="top" span="1">
660<col align="centre" valign="top" span="1">
661<col align="centre" valign="top" span="1">
662<col align="centre" valign="top" span="1">
663<col align="centre" valign="top" span="1">
664<col align="centre" valign="top" span="1">
668<td> b<sub>0</sub> </td>
669<td> b<sub>1</sub> </td>
670<td> b<sub>2</sub> </td>
671<td> b<sub>3</sub>
673<td> b<sub>4</sub> </td>
674<td> b<sub>5</sub> </td>
675<td> b<sub>6</sub> </td>
676<td> b<sub>7</sub> </td>
679<td> <code class="code">0</code> </td>
680<td> <code class="code">1</code> </td>
681<td> <code class="code">1</code> </td>
682<td> <code class="code">0</code> </td>
683<td> <code class="code">0</code> </td>
684<td> <code class="code">0</code> </td>
685<td> <code class="code">1</code> </td>
686<td> <span class="bold"><code class="code">0</code></span> </td>
689<td> <code class="code">0</code> </td>
690<td> <code class="code">0</code> </td>
691<td> <code class="code">1</code> </td>
692<td> <code class="code">1</code> </td>
693<td> <code class="code">0</code> </td>
694<td> <code class="code">1</code> </td>
695<td> <code class="code">1</code> </td>
696<td> <span class="bold"><code class="code">1</code></span> </td>
699<td> <code class="code">0</code> </td>
700<td> <code class="code">0</code> </td>
701<td> <code class="code">1</code> </td>
702<td> <code class="code">1</code> </td>
703<td> <code class="code">1</code> </td>
704<td> <code class="code">1</code> </td>
705<td> <code class="code">0</code> </td>
706<td> <span class="bold"><code class="code">0</code></span> </td>
709<td> <code class="code">0</code> </td>
710<td> <code class="code">1</code> </td>
711<td> <code class="code">0</code> </td>
712<td> <code class="code">0</code> </td>
713<td> <code class="code">0</code> </td>
714<td> <code class="code">0</code> </td>
715<td> <code class="code">0</code> </td>
716<td> <span class="bold"><code class="code">1</code></span> </td>
721<p id="idp429936"> Consider, for example, the XML source data stream shown in the first line of <a class="xref" href="#derived">Table IV</a>.
722The remaining lines of this figure show
723            several parallel bit streams that are computed in Parabix-style parsing, with each bit
724            of each stream in one-to-one correspondence to the source character code units of the
725            input stream. For clarity, 1 bits are denoted with 1 in each stream and 0 bits are
726            represented as underscores. The first bit stream shown is that for the opening angle
727            brackets that represent tag openers in XML. The second and third streams show a
728            partition of the tag openers into start tag marks and end tag marks depending on the
729            character immediately following the opener (i.e., "<code class="code">/</code>") or
730            not. The remaining three lines show streams that can be computed in subsequent parsing
731            (using the technique of bitstream addition <a class="xref" id="idp432880" href="javascript:showcite('cite-cameron-EuroPar2011','idp432880')">Parabix2 2011</a>), namely streams
732            marking the element names, attribute names and attribute values of tags. </p>
733<div class="table-wrapper" id="derived">
734<p class="title">Table IV</p>
735<div class="caption"><p id="idp434432">XML Source Data and Derived Parallel Bit Streams</p></div>
736<table class="table" xml:id="derived">
737<colgroup span="1">
738<col align="centre" valign="top" span="1">
739<col align="left" valign="top" span="1">
743<td> Source Data </td>
744<td> <code class="code"> &lt;document&gt;fee&lt;element a1='fie' a2 = 'foe'&gt;&lt;/element&gt;fum&lt;/document&gt; </code>
748<td> Tag Openers </td>
749<td> <code class="code">1____________1____________________________1____________1__________</code>
753<td> Start Tag Marks </td>
754<td> <code class="code">_1____________1___________________________________________________</code>
758<td> End Tag Marks </td>
759<td> <code class="code">___________________________________________1____________1_________</code>
763<td> Empty Tag Marks </td>
764<td> <code class="code">__________________________________________________________________</code>
768<td> Element Names </td>
769<td> <code class="code">_11111111_____1111111_____________________________________________</code>
773<td> Attribute Names </td>
774<td> <code class="code">______________________11_______11_________________________________</code>
778<td> Attribute Values </td>
779<td> <code class="code">__________________________111________111__________________________</code>
785<p id="idp447200"> Two intuitions may help explain how the Parabix approach can lead to improved XML
786            parsing performance. The first is that the use of the full register width offers a
787            considerable information advantage over sequential byte-at-a-time parsing. That is,
788            sequential processing of bytes uses just 8 bits of each register, greatly limiting the
789            processor resources that are effectively being used at any one time. The second is that
790            byte-at-a-time loop scanning loops are actually often just computing a single bit of
791            information per iteration: is the scan complete yet? Rather than computing these
792            individual decision-bits, an approach that computes many of them in parallel (e.g., 128
793            bytes at a time using 128-bit registers) should provide substantial benefit. </p>
794<p id="idp449312"> Previous studies have shown that the Parabix approach improves many aspects of XML
795            processing, including transcoding <a class="xref" id="idp449712" href="javascript:showcite('cite-Cameron2008','idp449712')">u8u16 2008</a>, character classification and
796            validation, tag parsing and well-formedness checking. The first Parabix parser used
797            processor bit scan instructions to considerably accelerate sequential scanning loops for
798            individual characters <a class="xref" id="idp450560" href="javascript:showcite('cite-CameronHerdyLin2008','idp450560')">Parabix1 2008</a>. Recent work has incorporated a method
799            of parallel scanning using bitstream addition <a class="xref" id="idp451328" href="javascript:showcite('cite-cameron-EuroPar2011','idp451328')">Parabix2 2011</a>, as well as
800            combining SIMD methods with 4-stage pipeline parallelism to further improve throughput
801            <a class="xref" id="idp452064" href="javascript:showcite('cite-HPCA2012','idp452064')">Lin and Medforth 2012</a>. Although these research prototypes handled the full syntax of
802            schema-less XML documents, they lacked the functionality required by full XML parsers. </p>
803<p id="idp453008"> Commercial XML processors support transcoding of multiple character sets and can
804            parse and validate against multiple document vocabularies. Additionally, they provide
805            API facilities beyond those found in research prototypes, including the widely used SAX,
806            SAX2 and DOM interfaces. </p>
808<div class="section" id="idp453856">
809<h3 class="title" style="clear: both">Sequential vs. Parallel Paradigm</h3>
810<p id="idp454544"> Xerces—like all traditional XML parsers—processes XML documents
811            sequentially. Each character is examined to distinguish between the XML-specific markup,
812            such as a left angle bracket <code class="code">"&lt;"</code>, and the content held within the
813            document. As the parser progresses through the document, it alternates between markup
814            scanning, validation and content processing modes. </p>
815<p id="idp456080"> In other words, Xerces belongs to an equivalence class of applications termed FSM
816           applications.<sup class="fn-label"><a href="#FSM" class="footnoteref" id="FSM-ref">[1]</a></sup> Each state transition indicates the processing context of
817            subsequent characters. Unfortunately, textual data tends to be unpredictable and any
818            character could induce a state transition. </p>
819<p id="idp458016"> Parabix-style XML parsers utilize a concept of layered processing. A block of source
820            text is transformed into a set of lexical bitstreams, which undergo a series of
821            operations that can be grouped into logical layers, e.g., transposition, character
822            classification, and lexical analysis. Each layer is pipeline parallel and require
823            neither speculation nor pre-parsing stages <a class="xref" id="idp458704" href="javascript:showcite('cite-HPCA2012','idp458704')">Lin and Medforth 2012</a>. To meet the API requirements
824            of the document-ordered Xerces output, the results of the Parabix processing layers must
825            be interleaved to produce the equivalent behaviour. </p>
828<div class="section" id="architecture">
829<h2 class="title" style="clear: both">Architecture</h2>
830<div class="section" id="idp460944">
831<h3 class="title" style="clear: both">Overview</h3>
832<p id="idp462000"> icXML is more than an optimized version of Xerces. Many components were grouped,
833            restructured and rearchitected with pipeline parallelism in mind. In this section, we
834            highlight the core differences between the two systems. As shown in Figure
835              <a class="xref" href="#xerces-arch" title="Xerces Architecture">Figure 1</a>, Xerces is comprised of five main modules: the transcoder, reader,
836            scanner, namespace binder, and validator. The <span class="ital">Transcoder</span> converts source data into UTF-16 before Xerces parses it as XML;
837            the majority of the character set encoding validation is performed as a byproduct of
838            this process. The <span class="ital">Reader</span> is responsible for the
839            streaming and buffering of all raw and transcoded (UTF-16) text. It tracks the current
840            line/column position,
842            performs line-break normalization and validates context-specific character set issues,
843            such as tokenization of qualified-names. The <span class="ital">Scanner</span>
844            pulls data through the reader and constructs the intermediate representation (IR) of the
845            document; it deals with all issues related to entity expansion, validates the XML
846            well-formedness constraints and any character set encoding issues that cannot be
847            completely handled by the reader or transcoder (e.g., surrogate characters, validation
848            and normalization of character references, etc.) The <span class="ital">Namespace
849               Binder</span> is a core piece of the element stack. It handles namespace scoping
850            issues between different XML vocabularies. This allows the scanner to properly select
851            the correct schema grammar structures. The <span class="ital">Validator</span>
852            takes the IR produced by the Scanner (and potentially annotated by the Namespace Binder)
853            and assesses whether the final output matches the user-defined DTD and schema grammar(s)
854            before passing it to the end-user. </p>
855<div class="figure" id="xerces-arch">
856<p class="title">Figure 1: Xerces Architecture</p>
857<div class="figure-contents">
858<div class="mediaobject" id="idp470016"><img alt="png image (xerces.png)" src="xerces.png" width="155cm"></div>
859<div class="caption"></div>
862<p id="idp472384"> In icXML functions are grouped into logical components. As shown in
863             <a class="xref" href="#xerces-arch" title="Xerces Architecture">Figure 1</a>, two major categories exist: (1) the Parabix Subsystem and (2) the
864               Markup Processor. All tasks in (1) use the Parabix Framework <a class="xref" id="idp473472" href="javascript:showcite('cite-HPCA2012','idp473472')">Lin and Medforth 2012</a>, which
865            represents data as a set of parallel bitstreams. The <span class="ital">Character Set
866              Adapter</span>, discussed in <a class="xref" href="#character-set-adapter" title="Character Set Adapters">Section “Character Set Adapters”</a>, mirrors
867            Xerces's Transcoder duties; however instead of producing UTF-16 it produces a set of
868              lexical bitstreams, similar to those shown in <a class="xref" id="idp475824" href="javascript:showcite('cite-CameronHerdyLin2008','idp475824')">Parabix1 2008</a>. These lexical
869            bitstreams are later transformed into UTF-16 in the Content Stream Generator, after
870            additional processing is performed. The first precursor to producing UTF-16 is the
871               <span class="ital">Parallel Markup Parser</span> phase. It takes the lexical
872            streams and produces a set of marker bitstreams in which a 1-bit identifies significant
873            positions within the input data. One bitstream for each of the critical piece of
874            information is created, such as the beginning and ending of start tags, end tags,
875            element names, attribute names, attribute values and content. Intra-element
876            well-formedness validation is performed as an artifact of this process. Like Xerces,
877            icXML must provide the Line and Column position of each error. The <span class="ital">Line-Column Tracker</span> uses the lexical information to keep track of the
878            document position(s) through the use of an optimized population count algorithm,
879              described in <a class="xref" href="#errorhandling" title="Error Handling">Section “Error Handling”</a>. From here, two data-independent
880            branches exist: the Symbol Resolver and Content Preparation Unit. </p>
881<p id="idp479856"> A typical XML file contains few unique element and attribute names—but
882            each of them will occur frequently. icXML stores these as distinct data structures,
883            called symbols, each with their own global identifier (GID). Using the symbol marker
884            streams produced by the Parallel Markup Parser, the <span class="ital">Symbol
885               Resolver</span> scans through the raw data to produce a sequence of GIDs, called
886            the <span class="ital">symbol stream</span>. </p>
887<p id="idp482560"> The final components of the Parabix Subsystem are the <span class="ital">Content
888               Preparation Unit</span> and <span class="ital">Content Stream
889            Generator</span>. The former takes the (transposed) basis bitstreams and selectively
890            filters them, according to the information provided by the Parallel Markup Parser, and
891            the latter transforms the filtered streams into the tagged UTF-16 <span class="ital">content stream</span>, discussed in <a class="xref" href="#contentstream" title="Content Stream">Section “Content Stream”</a>. </p>
892<p id="idp486160"> Combined, the symbol and content stream form icXML's compressed IR of the XML
893            document. The <span class="ital">Markup Processor</span>
894            parses the IR to
895            validate and produce the sequential output for the end user. The <span class="ital">Final WF checker</span> performs inter-element well-formedness validation that
896            would be too costly to perform in bit space, such as ensuring every start tag has a
897            matching end tag. Xerces's namespace binding functionality is replaced by the <span class="ital">Namespace Processor</span>. Unlike Xerces, it is a discrete phase
898            that produces a series of URI identifiers (URI IDs), the <span class="ital">URI
899               stream</span>, which are associated with each symbol occurrence. This is
900                 discussed in <a class="xref" href="#namespace-handling" title="Namespace Handling">Section “Namespace Handling”</a>. Finally, the <span class="ital">Validation</span> layer implements the Xerces's validator. However,
901            preprocessing associated with each symbol greatly reduces the work of this stage. </p>
902<div class="figure" id="icxml-arch">
903<p class="title">Figure 2: icXML Architecture</p>
904<div class="figure-contents">
905<div class="mediaobject" id="idp492624"><img alt="png image (icxml.png)" src="icxml.png" width="500cm"></div>
906<div class="caption"></div>
910<div class="section" id="character-set-adapter">
911<h3 class="title" style="clear: both">Character Set Adapters</h3>
912<p id="idp496112"> In Xerces, all input is transcoded into UTF-16 to simplify the parsing costs of
913            Xerces itself and provide the end-consumer with a single encoding format. In the
914            important case of UTF-8 to UTF-16 transcoding, the transcoding costs can be significant,
915            because of the need to decode and classify each byte of input, mapping variable-length
916            UTF-8 byte sequences into 16-bit UTF-16 code units with bit manipulation operations. In
917            other cases, transcoding may involve table look-up operations for each byte of input. In
918            any case, transcoding imposes at least a cost of buffer copying. </p>
919<p id="idp497168"> In icXML, however, the concept of Character Set Adapters (CSAs) is used to minimize
920            transcoding costs. Given a specified input encoding, a CSA is responsible for checking
921            that input code units represent valid characters, mapping the characters of the encoding
922            into the appropriate bitstreams for XML parsing actions (i.e., producing the lexical
923            item streams), as well as supporting ultimate transcoding requirements. All of this work
924            is performed using the parallel bitstream representation of the source input. </p>
925<p id="idp35408"> An important observation is that many character sets are an extension to the legacy
926            7-bit ASCII character set. This includes the various ISO Latin character sets, UTF-8,
927            UTF-16 and many others. Furthermore, all significant characters for parsing XML are
928            confined to the ASCII repertoire. Thus, a single common set of lexical item calculations
929            serves to compute lexical item streams for all such ASCII-based character sets. </p>
930<p id="idp36288"> A second observation is that—regardless of which character set is
931            used—quite often all of the characters in a particular block of input will be
932            within the ASCII range. This is a very simple test to perform using the bitstream
933            representation, simply confirming that the bit 0 stream is zero for the entire block.
934            For blocks satisfying this test, all logic dealing with non-ASCII characters can simply
935            be skipped. Transcoding to UTF-16 becomes trivial as the high eight bitstreams of the
936            UTF-16 form are each set to zero in this case. </p>
937<p id="idp38208"> A third observation is that repeated transcoding of the names of XML elements,
938            attributes and so on can be avoided by using a look-up mechanism. That is, the first
939            occurrence of each symbol is stored in a look-up table mapping the input encoding to a
940            numeric symbol ID. Transcoding of the symbol is applied at this time. Subsequent look-up
941            operations can avoid transcoding by simply retrieving the stored representation. As
942            symbol look up is required to apply various XML validation rules, there is achieves the
943            effect of transcoding each occurrence without additional cost. </p>
944<p id="idp39264"> The cost of individual character transcoding is avoided whenever a block of input is
945            confined to the ASCII subset and for all but the first occurrence of any XML element or
946            attribute name. Furthermore, when transcoding is required, the parallel bitstream
947            representation supports efficient transcoding operations. In the important case of UTF-8
948            to UTF-16 transcoding, the corresponding UTF-16 bitstreams can be calculated in bit
949              parallel fashion based on UTF-8 streams <a class="xref" id="idp40064" href="javascript:showcite('cite-Cameron2008','idp40064')">u8u16 2008</a>, and all but the final bytes
950            of multi-byte sequences can be marked for deletion as discussed in the following
951            subsection. In other cases, transcoding within a block only need be applied for
952            non-ASCII bytes, which are conveniently identified by iterating through the bit 0 stream
953            using bit scan operations. </p>
955<div class="section" id="par-filter">
956<h3 class="title" style="clear: both">Combined Parallel Filtering</h3>
957<p id="idp42416"> As just mentioned, UTF-8 to UTF-16 transcoding involves marking all but the last
958            bytes of multi-byte UTF-8 sequences as positions for deletion. For example, the two
959            Chinese characters <code class="code">䜠奜</code> are represented as two
960            three-byte UTF-8 sequences <code class="code">E4 BD A0</code> and <code class="code">E5 A5 BD</code> while the
961            UTF-16 representation must be compressed down to the two code units <code class="code">4F60</code>
962            and <code class="code">597D</code>. In the bit parallel representation, this corresponds to a
963            reduction from six bit positions representing UTF-8 code units (bytes) down to just two
964            bit positions representing UTF-16 code units (double bytes). This compression may be
965            achieved by arranging to calculate the correct UTF-16 bits at the final position of each
966            sequence and creating a deletion mask to mark the first two bytes of each 3-byte
967            sequence for deletion. In this case, the portion of the mask corresponding to these
968            input bytes is the bit sequence <code class="code">110110</code>. Using this approach, transcoding
969            may then be completed by applying parallel deletion and inverse transposition of the
970            UTF-16 bitstreams <a class="xref" id="idp517328" href="javascript:showcite('cite-Cameron2008','idp517328')">u8u16 2008</a>. </p>
971<p id="idp518128"> Rather than immediately paying the costs of deletion and transposition just for
972            transcoding, however, icXML defers these steps so that the deletion masks for several
973            stages of processing may be combined. In particular, this includes core XML requirements
974            to normalize line breaks and to replace character reference and entity references by
975            their corresponding text. In the case of line break normalization, all forms of line
976            breaks, including bare carriage returns (CR), line feeds (LF) and CR-LF combinations
977            must be normalized to a single LF character in each case. In icXML, this is achieved by
978            first marking CR positions, performing two bit parallel operations to transform the
979            marked CRs into LFs, and then marking for deletion any LF that is found immediately
980            after the marked CR as shown by the Pablo source code in
981              <a class="xref" href="#fig-LBnormalization">Figure 3</a>.
982              <div class="figure" id="fig-LBnormalization">
983<p class="title">Figure 3</p>
984<div class="figure-contents">
985<div class="caption">Line Break Normalization Logic</div>
986<pre class="programlisting" id="idp521200">
987# XML 1.0 line-break normalization rules.
988if lex.CR:
989# Modify CR (#x0D) to LF (#x0A)
990  u16lo.bit_5 ^= lex.CR
991  u16lo.bit_6 ^= lex.CR
992  u16lo.bit_7 ^= lex.CR
993  CRLF = pablo.Advance(lex.CR) &amp; lex.LF
994  callouts.delmask |= CRLF
995# Adjust LF streams for line/column tracker
996  lex.LF |= lex.CR
997  lex.LF ^= CRLF
1001         </p>
1002<p id="idp522544"> In essence, the deletion masks for transcoding and for line break normalization each
1003            represent a bitwise filter; these filters can be combined using bitwise-or so that the
1004            parallel deletion algorithm need only be applied once. </p>
1005<p id="idp523200"> A further application of combined filtering is the processing of XML character and
1006           entity references. Consider, for example, the references <code class="code">&amp;amp;</code> or
1007             <code class="code">&amp;#x3C;</code> which must be replaced in XML processing with the single
1008               <code class="code">&amp;</code> and <code class="code">&lt;</code> characters, respectively. The
1009            approach in icXML is to mark all but the first character positions of each reference for
1010            deletion, leaving a single character position unmodified. Thus, for the references
1011               <code class="code">&amp;amp;</code> or <code class="code">&amp;#x3C;</code> the masks <code class="code">01111</code> and
1012               <code class="code">011111</code> are formed and combined into the overall deletion mask. After the
1013            deletion and inverse transposition operations are finally applied, a post-processing
1014            step inserts the proper character at these positions. One note about this process is
1015            that it is speculative; references are assumed to generally be replaced by a single
1016            UTF-16 code unit. In the case, that this is not true, it is addressed in
1017            post-processing. </p>
1018<p id="idp528112"> The final step of combined filtering occurs during the process of reducing markup
1019            data to tag bytes preceding each significant XML transition as described in
1020              <a class="xref" href="#contentstream" title="Content Stream">Section “Content Stream”</a>. Overall, icXML avoids separate buffer copying
1021            operations for each of the these filtering steps, paying the cost of parallel deletion
1022            and inverse transposition only once. Currently, icXML employs the parallel-prefix
1023            compress algorithm of Steele <a class="xref" id="idp529424" href="javascript:showcite('cite-HackersDelight','idp529424')">Warren 2002</a>. Performance is independent of the
1024            number of positions deleted. Future versions of icXML are expected to take advantage of
1025            the parallel extract operation <a class="xref" id="idp530320" href="javascript:showcite('cite-HilewitzLee2006','idp530320')">Hilewitz and Lee 2006</a> that Intel is now providing in its
1026            Haswell architecture. </p>
1028<div class="section" id="contentstream">
1029<h3 class="title" style="clear: both">Content Stream</h3>
1030<p id="idp532368"> A relatively-unique concept for icXML is the use of a filtered content stream.
1031            Rather that parsing an XML document in its original format, the input is transformed
1032            into one that is easier for the parser to iterate through and produce the sequential
1033            output. In <a class="xref" href="#fig-parabix2">Table V</a>, the source data
1034             <code class="code"> &lt;document&gt;fee&lt;element a1='fie' a2 = 'foe'&gt;&lt;/element&gt;fum&lt;/document&gt;</code>
1035             is transformed into <code class="code"><span class="ital">0</span>fee<span class="ital">0</span>=fie<span class="ital">0</span>=foe<span class="ital">0</span>&gt;<span class="ital">0</span>/fum<span class="ital">0</span>/</code>
1036            through the parallel filtering algorithm, described in <a class="xref" href="#par-filter" title="Combined Parallel Filtering">Section “Combined Parallel Filtering”</a>. </p>
1037<div class="table-wrapper" id="fig-parabix2">
1038<p class="title">Table V</p>
1039<div class="caption">XML Source Data and Derived Parallel Bit Streams</div>
1040<table class="table" xml:id="fig-parabix2">
1041<colgroup span="1">
1042<col align="centre" valign="top" span="1">
1043<col align="left" valign="top" span="1">
1047<td> Source Data </td>
1049                                    <code class="code"> &lt;document&gt;fee&lt;element a1='fie' a2 = 'foe'&gt;&lt;/element&gt;fum&lt;/document&gt; </code>
1053<td> String Ends </td>
1054<td> <code class="code">1____________1_______________1__________1_1____________1__________</code>
1058<td> Markup Identifiers </td>
1059<td>         <code class="code">_________1______________1_________1______1_1____________1_________</code>
1063<td> Deletion Mask </td>
1064<td>              <code class="code">_11111111_____1111111111_1____1111_11_______11111111_____111111111</code>
1068<td> Undeleted Data </td>
1069<td> <code class="code"><span class="ital">0</span>________&gt;fee<span class="ital">0</span>__________=_fie<span class="ital">0</span>____=__foe<span class="ital">0</span>&gt;<span class="ital">0</span>/________fum<span class="ital">0</span>/_________</code>
1075<p id="idp553712"> Combined with the symbol stream, the parser traverses the content stream to
1076            effectively reconstructs the input document in its output form. The initial <span class="ital">0</span> indicates an empty content string. The following
1077               <code class="code">&gt;</code> indicates that a start tag without any attributes is the first
1078            element in this text and the first unused symbol, <code class="code">document</code>, is the element
1079            name. Succeeding that is the content string <code class="code">fee</code>, which is null-terminated
1080            in accordance with the Xerces API specification. Unlike Xerces, no memory-copy
1081            operations are required to produce these strings, which as
1082              <a class="xref" href="#xerces-profile">Table I</a> shows accounts for 6.83% of Xerces's execution time.
1083            Additionally, it is cheap to locate the terminal character of each string: using the
1084            String End bitstream, the Parabix Subsystem can effectively calculate the offset of each
1085            null character in the content stream in parallel, which in turn means the parser can
1086            directly jump to the end of every string without scanning for it. </p>
1087<p id="idp557792"> Following <code class="code">'fee'</code> is a <code class="code">=</code>, which marks the
1088            existence of an attribute. Because all of the intra-element was performed in the Parabix
1089            Subsystem, this must be a legal attribute. Since attributes can only occur within start
1090            tags and must be accompanied by a textual value, the next symbol in the symbol stream
1091            must be the element name of a start tag, and the following one must be the name of the
1092            attribute and the string that follows the <code class="code">=</code> must be its value. However, the
1093            subsequent <code class="code">=</code> is not treated as an independent attribute because the parser
1094            has yet to read a <code class="code">&gt;</code>, which marks the end of a start tag. Thus only
1095            one symbol is taken from the symbol stream and it (along with the string value) is added
1096            to the element. Eventually the parser reaches a <code class="code">/</code>, which marks the
1097            existence of an end tag. Every end tag requires an element name, which means they
1098            require a symbol. Inter-element validation whenever an empty tag is detected to ensure
1099            that the appropriate scope-nesting rules have been applied. </p>
1101<div class="section" id="namespace-handling">
1102<h3 class="title" style="clear: both">Namespace Handling</h3>
1103<p id="idp563408"> In XML, namespaces prevents naming conflicts when multiple vocabularies are used
1104            together. It is especially important when a vocabulary application-dependant meaning,
1105            such as when XML or SVG documents are embedded within XHTML files. Namespaces are bound
1106            to uniform resource identifiers (URIs), which are strings used to identify specific
1107            names or resources. On line 1 in <a class="xref" href="#namespace-ex">Table VI</a>, the <code class="code">xmlns</code>
1108            attribute instructs the XML processor to bind the prefix <code class="code">p</code> to the URI
1109               '<code class="code"></code>' and the default (empty) prefix to
1110               <code class="code"></code>. Thus to the XML processor, the <code class="code">title</code> on line 2
1111            and <code class="code">price</code> on line 4 both read as
1112            <code class="code">"":title</code> and
1113               <code class="code">"":price</code> respectively, whereas on line 3 and
1114            5, <code class="code">p:name</code> and <code class="code">price</code> are seen as
1115               <code class="code">"":name</code> and
1116               <code class="code">"":price</code>. Even though the actual element name
1117               <code class="code">price</code>, due to namespace scoping rules they are viewed as two
1118            uniquely-named items because the current vocabulary is determined by the namespace(s)
1119            that are in-scope. </p>
1120<div class="table-wrapper" id="namespace-ex">
1121<p class="title">Table VI</p>
1122<div class="caption"><p id="idp572144">XML Namespace Example</p></div>
1123<table class="table" xml:id="namespace-ex">
1124<colgroup span="1">
1125<col align="centre" valign="top" span="1">
1126<col align="left" valign="top" span="1">
1130<td>1. </td>
1131<td>&lt;book xmlns:p="" xmlns=""&gt; </td>
1134<td>2. </td>
1135<td>  &lt;title&gt;BOOK NAME&lt;/title&gt; </td>
1138<td>3. </td>
1139<td>  &lt;p:name&gt;PUBLISHER NAME&lt;/p:name&gt; </td>
1142<td>4. </td>
1143<td>  &lt;price&gt;X&lt;/price&gt; </td>
1146<td>5. </td>
1147<td>  &lt;price xmlns=""&gt;Y&lt;/price&gt; </td>
1150<td>6. </td>
1151<td>&lt;/book&gt; </td>
1156<p id="idp581088"> In both Xerces and icXML, every URI has a one-to-one mapping to a URI ID. These
1157            persist for the lifetime of the application through the use of a global URI pool. Xerces
1158            maintains a stack of namespace scopes that is pushed (popped) every time a start tag
1159            (end tag) occurs in the document. Because a namespace declaration affects the entire
1160            element, it must be processed prior to grammar validation. This is a costly process
1161            considering that a typical namespaced XML document only comes in one of two forms: (1)
1162            those that declare a set of namespaces upfront and never change them, and (2) those that
1163            repeatedly modify the namespaces in predictable patterns. </p>
1164<p id="idp582224"> For that reason, icXML contains an independent namespace stack and utilizes bit
1165            vectors to cheaply perform
1166             When a prefix is
1167            declared (e.g., <code class="code">xmlns:p=""</code>), a namespace binding
1168            is created that maps the prefix (which are assigned Prefix IDs in the symbol resolution
1169            process) to the URI. Each unique namespace binding has a unique namespace id (NSID) and
1170            every prefix contains a bit vector marking every NSID that has ever been associated with
1171              it within the document. For example, in <a class="xref" href="#namespace-ex">Table VI</a>, the prefix binding
1172            set of <code class="code">p</code> and <code class="code">xmlns</code> would be <code class="code">01</code> and
1173            <code class="code">11</code> respectively. To resolve the in-scope namespace binding for each prefix,
1174            a bit vector of the currently visible namespaces is maintained by the system. By ANDing
1175            the prefix bit vector with the currently visible namespaces, the in-scope NSID can be
1176            found using a bit-scan intrinsic. A namespace binding table, similar to
1177            <a class="xref" href="#namespace-binding">Table VII</a>, provides the actual URI ID. </p>
1178<div class="table-wrapper" id="namespace-binding">
1179<p class="title">Table VII</p>
1180<div class="caption"><p id="idp588848">Namespace Binding Table Example</p></div>
1181<table class="table" xml:id="namespace-binding">
1182<colgroup span="1">
1183<col align="centre" valign="top" span="1">
1184<col align="centre" valign="top" span="1">
1185<col align="centre" valign="top" span="1">
1186<col align="centre" valign="top" span="1">
1187<col align="centre" valign="top" span="1">
1190<th>NSID </th>
1191<th> Prefix </th>
1192<th> URI </th>
1193<th> Prefix ID </th>
1194<th> URI ID </th>
1198<td>0 </td>
1199<td> <code class="code"> p</code> </td>
1200<td> <code class="code"></code> </td>
1201<td> 0 </td>
1202<td> 0 </td>
1205<td>1 </td>
1206<td> <code class="code"> xmlns</code> </td>
1207<td> <code class="code"></code> </td>
1208<td> 1 </td>
1209<td> 1 </td>
1212<td>2 </td>
1213<td> <code class="code"> xmlns</code> </td>
1214<td> <code class="code"></code> </td>
1215<td> 1 </td>
1216<td> 0 </td>
1221<p id="idp605120">
1226         </p>
1227<p id="idp607024"> To ensure that scoping rules are adhered to, whenever a start tag is encountered,
1228            any modification to the currently visible namespaces is calculated and stored within a
1229            stack of bit vectors denoting the locally modified namespace bindings. When an end tag
1230            is found, the currently visible namespaces is XORed with the vector at the top of the
1231            stack. This allows any number of changes to be performed at each scope-level with a
1232            constant time.
1234         </p>
1236<div class="section" id="errorhandling">
1237<h3 class="title" style="clear: both">Error Handling</h3>
1238<p id="idp609456">
1240            Xerces outputs error messages in two ways: through the programmer API and as thrown
1241            objects for fatal errors. As Xerces parses a file, it uses context-dependant logic to
1242            assess whether the next character is legal; if not, the current state determines the
1243            type and severity of the error. icXML emits errors in the similar manner—but
1244            how it discovers them is substantially different. Recall that in Figure
1245            <a class="xref" href="#icxml-arch" title="icXML Architecture">Figure 2</a>, icXML is divided into two sections: the Parabix Subsystem and
1246            Markup Processor, each with its own system for detecting and producing error messages. </p>
1247<p id="idp499280"> Within the Parabix Subsystem, all computations are performed in parallel, a block at
1248            a time. Errors are derived as artifacts of bitstream calculations, with a 1-bit marking
1249            the byte-position of an error within a block, and the type of error is determined by the
1250            equation that discovered it. The difficulty of error processing in this section is that
1251            in Xerces the line and column number must be given with every error production. Two
1252            major issues exist because of this: (1) line position adheres to XML white-normalization
1253            rules; as such, some sequences of characters, e.g., a carriage return followed by a line
1254            feed, are counted as a single new line character. (2) column position is counted in
1255            characters, not bytes or code units; thus multi-code-unit code-points and surrogate
1256            character pairs are all counted as a single column position. Note that typical XML
1257            documents are error-free but the calculation of the line/column position is a constant
1258            overhead in Xerces.  To
1259            reduce this, icXML pushes the bulk cost of the line/column calculation to the occurrence
1260            of the error and performs the minimal amount of book-keeping necessary to facilitate it.
1261            icXML leverages the byproducts of the Character Set Adapter (CSA) module and amalgamates
1262            the information within the Line Column Tracker (LCT). One of the CSA's major
1263            responsibilities is transcoding an input text.
1264             During this process,
1265            white-space normalization rules are applied and multi-code-unit and surrogate characters
1266            are detected and validated. A <span class="ital">line-feed bitstream</span>,
1267            which marks the positions of the normalized new lines characters, is a natural
1268            derivative of this process. Using an optimized population count algorithm, the line
1269            count can be summarized cheaply for each valid block of text.
1270             Column position is more
1271            difficult to calculate. It is possible to scan backwards through the bitstream of new
1272            line characters to determine the distance (in code-units) between the position between
1273            which an error was detected and the last line feed. However, this distance may exceed
1274            than the actual character position for the reasons discussed in (2). To handle this, the
1275            CSA generates a <span class="ital">skip mask</span> bitstream by ORing together
1276            many relevant bitstreams, such as all trailing multi-code-unit and surrogate characters,
1277            and any characters that were removed during the normalization process. When an error is
1278            detected, the sum of those skipped positions is subtracted from the distance to
1279            determine the actual column number. </p>
1280<p id="idp505008"> The Markup Processor is a state-driven machine. As such, error detection within it
1281            is very similar to Xerces. However, reporting the correct line/column is a much more
1282            difficult problem. The Markup Processor parses the content stream, which is a series of
1283            tagged UTF-16 strings. Each string is normalized in accordance with the XML
1284            specification. All symbol data and unnecessary whitespace is eliminated from the stream;
1285            thus its impossible to derive the current location using only the content stream. To
1286            calculate the location, the Markup Processor borrows three additional pieces of
1287            information from the Parabix Subsystem: the line-feed, skip mask, and a <span class="ital">deletion mask stream</span>, which is a bitstream denoting the
1288            (code-unit) position of every datum that was suppressed from the source during the
1289            production of the content stream. Armed with these, it is possible to calculate the
1290            actual line/column using the same system as the Parabix Subsystem until the sum of the
1291            negated deletion mask stream is equal to the current position. </p>
1294<div class="section" id="multithread">
1295<h2 class="title" style="clear: both">Multithreading with Pipeline Parallelism</h2>
1296<p id="idp509296"> As discussed in section <a class="xref" href="#background-xerces" title="Xerces C++ Structure">Section “Xerces C++ Structure”</a>, Xerces can be considered a FSM
1297         application. These are "embarrassingly
1298         sequential."<a class="xref" id="idp510400" href="javascript:showcite('cite-Asanovic-EECS-2006-183','idp510400')">Asanovic et al. 2006</a> and notoriously difficult to
1299         parallelize. However, icXML is designed to organize processing into logical layers. In
1300         particular, layers within the Parabix Subsystem are designed to operate over significant
1301         segments of input data before passing their outputs on for subsequent processing. This fits
1302         well into the general model of pipeline parallelism, in which each thread is in charge of a
1303         single module or group of modules. </p>
1304<p id="idp511712"> The most straightforward division of work in icXML is to separate the Parabix Subsystem
1305         and the Markup Processor into distinct logical layers into two separate stages. The
1306         resultant application, <span class="ital">icXML-p</span>, is a course-grained
1307         software-pipeline application. In this case, the Parabix Subsystem thread
1308               <code class="code">T<sub>1</sub></code> reads 16k of XML input <code class="code">I</code> at a
1309         time and produces the content, symbol and URI streams, then stores them in a pre-allocated
1310         shared data structure <code class="code">S</code>. The Markup Processor thread
1311            <code class="code">T<sub>2</sub></code> consumes <code class="code">S</code>, performs well-formedness
1312         and grammar-based validation, and the provides parsed XML data to the application through
1313         the Xerces API. The shared data structure is implemented using a ring buffer, where every
1314         entry contains an independent set of data streams. In the examples of
1315           <a class="xref" href="#threads_timeline1" title="Thread Balance in Two-Stage Pipelines: Stage 1 Dominant">Figure 4</a>, the ring buffer has four entries. A
1316         lock-free mechanism is applied to ensure that each entry can only be read or written by one
1317         thread at the same time. In  <a class="xref" href="#threads_timeline1" title="Thread Balance in Two-Stage Pipelines: Stage 1 Dominant">Figure 4</a> the processing time of
1318               <code class="code">T<sub>1</sub></code> is longer than
1319         <code class="code">T<sub>2</sub></code>; thus <code class="code">T<sub>2</sub></code> always
1320         waits for <code class="code">T<sub>1</sub></code> to write to the shared memory. 
1321         <a class="xref" href="#threads_timeline2" title="Thread Balance in Two-Stage Pipelines: Stage 2 Dominant">Figure 5</a> illustrates the scenario in which
1322         <code class="code">T<sub>1</sub></code> is faster and must wait for
1323            <code class="code">T<sub>2</sub></code> to finish reading the shared data before it can
1324         reuse the memory space. </p>
1325<p id="idp651744">
1326        <div class="figure" id="threads_timeline1">
1327<p class="title">Figure 4: Thread Balance in Two-Stage Pipelines: Stage 1 Dominant</p>
1328<div class="figure-contents"><div class="mediaobject" id="idp653120"><img alt="png image (threads_timeline1.png)" src="threads_timeline1.png" width="500cm"></div></div>
1330        <div class="figure" id="threads_timeline2">
1331<p class="title">Figure 5: Thread Balance in Two-Stage Pipelines: Stage 2 Dominant</p>
1332<div class="figure-contents"><div class="mediaobject" id="idp656128"><img alt="png image (threads_timeline2.png)" src="threads_timeline2.png" width="500cm"></div></div>
1334      </p>
1335<p id="idp658160"> Overall, our design is intended to benefit a range of applications. Conceptually, we
1336         consider two design points. The first, the parsing performed by the Parabix Subsystem
1337         dominates at 67% of the overall cost, with the cost of application processing (including
1338         the driver logic within the Markup Processor) at 33%. The second is almost the opposite
1339         scenario, the cost of application processing dominates at 60%, while the cost of XML
1340         parsing represents an overhead of 40%. </p>
1341<p id="idp659072"> Our design is predicated on a goal of using the Parabix framework to achieve a 50% to
1342         100% improvement in the parsing engine itself. In a best case scenario, a 100% improvement
1343         of the Parabix Subsystem for the design point in which XML parsing dominates at 67% of the
1344         total application cost. In this case, the single-threaded icXML should achieve a 1.5x
1345         speedup over Xerces so that the total application cost reduces to 67% of the original.
1346         However, in icXML-p, our ideal scenario gives us two well-balanced threads each performing
1347         about 33% of the original work. In this case, Amdahl's law predicts that we could expect up
1348         to a 3x speedup at best. </p>
1349<p id="idp660192"> At the other extreme of our design range, we consider an application in which core
1350         parsing cost is 40%. Assuming the 2x speedup of the Parabix Subsystem over the
1351         corresponding Xerces core, single-threaded icXML delivers a 25% speedup. However, the most
1352         significant aspect of our two-stage multi-threaded design then becomes the ability to hide
1353         the entire latency of parsing within the serial time required by the application. In this
1354         case, we achieve an overall speedup in processing time by 1.67x. </p>
1355<p id="idp661136"> Although the structure of the Parabix Subsystem allows division of the work into
1356         several pipeline stages and has been demonstrated to be effective for four pipeline stages
1357         in a research prototype <a class="xref" id="idp661616" href="javascript:showcite('cite-HPCA2012','idp661616')">Lin and Medforth 2012</a>, our analysis here suggests that the further
1358         pipelining of work within the Parabix Subsystem is not worthwhile if the cost of
1359         application logic is little as 33% of the end-to-end cost using Xerces. To achieve benefits
1360         of further parallelization with multi-core technology, there would need to be reductions in
1361         the cost of application logic that could match reductions in core parsing cost. </p>
1363<div class="section" id="performance">
1364<h2 class="title" style="clear: both">Performance</h2>
1365<p id="idp664000"> We evaluate Xerces-C++ 3.1.1, icXML, icXML-p against two benchmarking applications: the
1366         Xerces C++ SAXCount sample application, and a real world GML to SVG transformation
1367         application. We investigated XML parser performance using an Intel Core i7 quad-core (Sandy
1368         Bridge) processor (3.40GHz, 4 physical cores, 8 threads (2 per core), 32+32 kB (per core)
1369         L1 cache, 256 kB (per core) L2 cache, 8 MB L3 cache) running the 64-bit version of Ubuntu
1370         12.04 (Linux). </p>
1371<p id="idp664912"> We analyzed the execution profiles of each XML parser using the performance counters
1372         found in the processor. We chose several key hardware events that provide insight into the
1373         profile of each application and indicate if the processor is doing useful work. The set of
1374         events included in our study are: processor cycles, branch instructions, branch
1375         mispredictions, and cache misses. The Performance Application Programming Interface (PAPI)
1376         Version 5.5.0 <a class="xref" id="idp665680" href="javascript:showcite('cite-papi','idp665680')">PAPI</a> toolkit was installed on the test system to facilitate the
1377         collection of hardware performance monitoring statistics. In addition, we used the Linux
1378         perf <a class="xref" id="idp666608" href="javascript:showcite('cite-perf','idp666608')">perf</a> utility to collect per core hardware events. </p>
1379<div class="section" id="idp667552">
1380<h3 class="title" style="clear: both">Xerces C++ SAXCount</h3>
1381<p id="idp668192"> Xerces comes with sample applications that demonstrate salient features of the
1382            parser. SAXCount is the simplest such application: it counts the elements, attributes
1383            and characters of a given XML file using the (event based) SAX API and prints out the
1384            totals. </p>
1385<p id="idp668896"> <a class="xref" href="#XMLdocs">Table VIII</a> shows the document characteristics of the XML input files
1386            selected for the Xerces C++ SAXCount benchmark. The jaw.xml represents document-oriented
1387            XML inputs and contains the three-byte and four-byte UTF-8 sequence required for the
1388            UTF-8 encoding of Japanese characters. The remaining data files are data-oriented XML
1389            documents and consist entirely of single byte encoded ASCII characters.
1390  <div class="table-wrapper" id="XMLdocs">
1391<p class="title">Table VIII</p>
1392<div class="caption"><p id="idp671232">XML Document Characteristics</p></div>
1393<table class="table" xml:id="XMLdocs">
1394<colgroup span="1">
1395<col align="left" valign="top" span="1">
1396<col align="centre" valign="top" span="1">
1397<col align="centre" valign="top" span="1">
1398<col align="centre" valign="top" span="1">
1399<col align="centre" valign="top" span="1">
1403<td>File Name           </td>
1404<td> jaw.xml            </td>
1405<td> road.gml   </td>
1406<td> po.xml     </td>
1407<td> soap.xml </td>
1410<td>File Type           </td>
1411<td> document           </td>
1412<td> data               </td>
1413<td> data               </td>
1414<td> data        </td>
1417<td>File Size (kB)              </td>
1418<td> 7343                       </td>
1419<td> 11584      </td>
1420<td> 76450              </td>
1421<td> 2717 </td>
1424<td>Markup Item Count   </td>
1425<td> 74882              </td>
1426<td> 280724     </td>
1427<td> 4634110    </td>
1428<td> 18004 </td>
1431<td>Markup Density              </td>
1432<td> 0.13                       </td>
1433<td> 0.57       </td>
1434<td> 0.76               </td>
1435<td> 0.87       </td>
1441<p id="idp686832"> A key predictor of the overall parsing performance of an XML file is markup
1442           density<sup class="fn-label"><a href="#idp687200" class="footnoteref" id="idp687200-ref">[2]</a></sup>. This metric has substantial influence on the
1443            performance of traditional recursive descent XML parsers because it directly corresponds
1444            to the number of state transitions that occur when parsing a document. We use a mixture
1445            of document-oriented and data-oriented XML files to analyze performance over a spectrum
1446            of markup densities. </p>
1447<p id="idp688368"> <a class="xref" href="#perf_SAX" title="SAXCount Performance Comparison">Figure 6</a> compares the performance of Xerces, icXML and pipelined icXML
1448            in terms of CPU cycles per byte for the SAXCount application. The speedup for icXML over
1449            Xerces is 1.3x to 1.8x. With two threads on the multicore machine, icXML-p can achieve
1450            speedup up to 2.7x. Xerces is substantially slowed by dense markup but icXML is less
1451            affected through a reduction in branches and the use of parallel-processing techniques.
1452            icXML-p performs better as markup-density increases because the work performed by each
1453            stage is well balanced in this application. </p>
1454<p id="idp690112">
1455        <div class="figure" id="perf_SAX">
1456<p class="title">Figure 6: SAXCount Performance Comparison</p>
1457<div class="figure-contents">
1458<div class="mediaobject" id="idp691424"><img alt="png image (perf_SAX.png)" src="perf_SAX.png" width="500cm"></div>
1459<div class="caption"></div>
1462         </p>
1464<div class="section" id="idp693968">
1465<h3 class="title" style="clear: both">GML2SVG</h3>
1466<p id="idp694640">       As a more substantial application of XML processing, the GML-to-SVG (GML2SVG) application
1467was chosen.   This application transforms geospatially encoded data represented using
1468an XML representation in the form of Geography Markup Language (GML) <a class="xref" id="idp695168" href="javascript:showcite('cite-lake2004geography','idp695168')">Lake and Burggraf 2004</a> 
1469into a different XML format  suitable for displayable maps:
1470Scalable Vector Graphics (SVG) format <a class="xref" id="idp696064" href="javascript:showcite('cite-lu2007advances','idp696064')">Lu and Dos Santos 2007</a>. In the GML2SVG benchmark, GML feature elements
1471and GML geometry elements tags are matched. GML coordinate data are then extracted
1472and transformed to the corresponding SVG path data encodings.
1473Equivalent SVG path elements are generated and output to the destination
1474SVG document.  The GML2SVG application is thus considered typical of a broad
1475class of XML applications that parse and extract information from
1476a known XML format for the purpose of analysis and restructuring to meet
1477the requirements of an alternative format.</p>
1478<p id="idp697440">Our GML to SVG data translations are executed on GML source data
1479modelling the city of Vancouver, British Columbia, Canada.
1480The GML source document set
1481consists of 46 distinct GML feature layers ranging in size from approximately 9 KB to 125.2 MB
1482and with an average document size of 18.6 MB. Markup density ranges from approximately 0.045 to 0.719
1483and with an average markup density of 0.519. In this performance study,
1484213.4 MB of source GML data generates 91.9 MB of target SVG data.</p>
1485<div class="figure" id="perf_GML2SVG">
1486<p class="title">Figure 7: Performance Comparison for GML2SVG</p>
1487<div class="figure-contents">
1488<div class="mediaobject" id="idp699392"><img alt="png image (Throughput.png)" src="Throughput.png" width="500cm"></div>
1489<div class="caption"></div>
1492<p id="idp701680"><a class="xref" href="#perf_GML2SVG" title="Performance Comparison for GML2SVG">Figure 7</a> compares the performance of the GML2SVG application linked against
1493the Xerces, icXML and icXML-p.   
1494On the GML workload with this application, single-thread icXML
1495achieved about a 50% acceleration over Xerces,
1496increasing throughput on our test machine from 58.3 MB/sec to 87.9 MB/sec.   
1497Using icXML-p, a further throughput increase to 111 MB/sec was recorded,
1498approximately a 2X speedup.</p>
1499<p id="idp703088">An important aspect of icXML is the replacement of much branch-laden
1500sequential code inside Xerces with straight-line SIMD code using far
1501fewer branches.  <a class="xref" href="#branchmiss_GML2SVG" title="Comparative Branch Misprediction Rate">Figure 8</a> shows the corresponding
1502improvement in branching behaviour, with a dramatic reduction in branch misses per kB.
1503It is also interesting to note that icXML-p goes even further.   
1504In essence, in using pipeline parallelism to split the instruction
1505stream onto separate cores, the branch target buffers on each core are
1506less overloaded and able to increase the successful branch prediction rate.</p>
1507<div class="figure" id="branchmiss_GML2SVG">
1508<p class="title">Figure 8: Comparative Branch Misprediction Rate</p>
1509<div class="figure-contents">
1510<div class="mediaobject" id="idp705920"><img alt="png image (BM.png)" src="BM.png" width="500cm"></div>
1511<div class="caption"></div>
1514<p id="idp708208">The behaviour of the three versions with respect to L1 cache misses per kB is shown
1515in <a class="xref" href="#cachemiss_GML2SVG" title="Comparative Cache Miss Rate">Figure 9</a>.   Improvements are shown in both instruction-
1516and data-cache performance with the improvements in instruction-cache
1517behaviour the most dramatic.   Single-threaded icXML shows substantially improved
1518performance over Xerces on both measures.   
1519Although icXML-p is slightly worse with respect to data-cache performance,
1520this is more than offset by a further dramatic reduction in instruction-cache miss rate.
1521Again partitioning the instruction stream through the pipeline parallelism model has
1522significant benefit.</p>
1523<div class="figure" id="cachemiss_GML2SVG">
1524<p class="title">Figure 9: Comparative Cache Miss Rate</p>
1525<div class="figure-contents">
1526<div class="mediaobject" id="idp711008"><img alt="png image (CM.png)" src="CM.png" width="500cm"></div>
1527<div class="caption"></div>
1530<p id="idp713296">One caveat with this study is that the GML2SVG application did not exhibit
1531a relative balance of processing between application code and Xerces library
1532code reaching the 33% figure.  This suggests that for this application and
1533possibly others, further separating the logical layers of the
1534icXML engine into different pipeline stages could well offer significant benefit.
1535This remains an area of ongoing work.</p>
1538<div class="section" id="conclusion">
1539<h2 class="title" style="clear: both">Conclusion and Future Work</h2>
1540<p id="idp715856"> This paper is the first case study documenting the significant performance benefits
1541         that may be realized through the integration of parallel bitstream technology into existing
1542         widely-used software libraries. In the case of the Xerces-C++ XML parser, the combined
1543         integration of SIMD and multicore parallelism was shown capable of dramatic producing
1544         dramatic increases in throughput and reductions in branch mispredictions and cache misses.
1545         The modified parser, going under the name icXML is designed to provide the full
1546         functionality of the original Xerces library with complete compatibility of APIs. Although
1547         substantial re-engineering was required to realize the performance potential of parallel
1548         technologies, this is an important case study demonstrating the general feasibility of
1549         these techniques. </p>
1550<p id="idp717136"> The further development of icXML to move beyond 2-stage pipeline parallelism is
1551         ongoing, with realistic prospects for four reasonably balanced stages within the library.
1552         For applications such as GML2SVG which are dominated by time spent on XML parsing, such a
1553         multistage pipelined parsing library should offer substantial benefits. </p>
1554<p id="idp717904"> The example of XML parsing may be considered prototypical of finite-state machines
1555         applications which have sometimes been considered "embarassingly
1556         sequential" and so difficult to parallelize that "nothing
1557         works." So the case study presented here should be considered an important data
1558         point in making the case that parallelization can indeed be helpful across a broad array of
1559         application types. </p>
1560<p id="idp719280"> To overcome the software engineering challenges in applying parallel bitstream
1561         technology to existing software systems, it is clear that better library and tool support
1562         is needed. The techniques used in the implementation of icXML and documented in this paper
1563         could well be generalized for applications in other contexts and automated through the
1564         creation of compiler technology specifically supporting parallel bitstream programming.
1565      </p> Given the success of the icXML development, there is a strong case for continued
1566            development of the Parabix framework as well as for the application of Parabix
1567            to other important XML technology stacks.   In particular, an important area for further
1568            work is to extend the benefits of SIMD and multicore parallelism to the acceleration
1569            of Java-based XML processors.
1570      <p id="idp720576"> 
1571      </p>
1573<div class="bibliography" id="idp721136">
1574<h2 class="title" style="clear:both">Bibliography</h2>
1575<p class="bibliomixed" id="CameronHerdyLin2008"><a href="#idp450560">[Parabix1 2008] </a>Cameron, Robert D., Herdy, Kenneth S. and Lin, Dan. High performance XML parsing using parallel bit stream technology. CASCON'08: Proc. 2008 conference of the center for advanced studies on collaborative research. Richmond Hill, Ontario, Canada. 2008.</p>
1576<p class="bibliomixed" id="papi"><a href="#idp665680">[PAPI] </a>Innovative Computing Laboratory, University of Texas. Performance Application Programming Interface.<a href="" class="link" target="_new"></a></p>
1577<p class="bibliomixed" id="perf"><a href="#idp666608">[perf] </a>Eranian, Stephane, Gouriou, Eric, Moseley, Tipp and Bruijn, Willem de. Linux kernel profiling with perf. <a href="" class="link" target="_new"></a></p>
1578<p class="bibliomixed" id="Cameron2008"><a href="#idp449712">[u8u16 2008] </a>Cameron, Robert D.. A case study in SIMD text processing with parallel bit streams: UTF-8 to UTF-16 transcoding. Proc. 13th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming. Salt Lake City, USA. 2008.</p>
1579<p class="bibliomixed" id="ParaDOM2009"><a href="#idp300080">[Shah and Rao 2009] </a>Shah, Bhavik, Rao, Praveen, Moon, Bongki and Rajagopalan, Mohan. A Data Parallel Algorithm for XML DOM Parsing. Database and XML Technologies. 2009.</p>
1580<p class="bibliomixed" id="XMLSSE42"><a href="#idp303232">[Lei 2008] </a>Lei, Zhai. XML Parsing Accelerator with Intel Streaming SIMD Extensions 4 (Intel SSE4). <a href="Intel%20Software%20Network" class="link" target="_new">Intel Software Network</a>.  2008.</p>
1581<p class="bibliomixed" id="Cameron2009"><a href="#idp304144">[Balisage 2009] </a>Cameron, Rob, Herdy, Ken and Amiri, Ehsan Amiri. Parallel Bit Stream Technology as a Foundation for XML Parsing Performance. Int'l Symposium on Processing XML Efficiently: Overcoming Limits on Space, Time, or Bandwidth. Montreal, Quebec, Canada.  2009.</p>
1582<p class="bibliomixed" id="HilewitzLee2006"><a href="#idp530320">[Hilewitz and Lee 2006] </a>Hilewitz, Yedidya and Lee, Ruby B.. Fast Bit Compression and Expansion with Parallel Extract and Parallel Deposit Instructions. ASAP '06: Proc. IEEE 17th Int'l Conference on Application-specific Systems, Architectures and Processors. Steamboat Springs, Colorado, USA.  2006.</p>
1583<p class="bibliomixed" id="Asanovic-EECS-2006-183"><a href="#idp510400">[Asanovic et al. 2006] </a>Asanovic, Krste and others. The Landscape of Parallel Computing Research: A View from Berkeley. EECS Department, University of California, Berkeley.  2006.</p>
1584<p class="bibliomixed" id="GRID2006"><a href="#idp284544">[Lu and Chiu 2006] </a>Lu, Wei, Chiu, Kenneth and Pan, Yinfei. A Parallel Approach to XML Parsing. Proceedings of the 7th IEEE/ACM International Conference on Grid Computing. Barcelona, Spain.  2006.</p>
1585<p class="bibliomixed" id="cameron-EuroPar2011"><a href="#idp304896">[Parabix2 2011] </a>Cameron, Robert D., Amiri, Ehsan, Herdy, Kenneth S., Lin, Dan, Shermer, Thomas C. and Popowich, Fred P.. Parallel Scanning with Bitstream Addition: An XML Case Study. Euro-Par 2011, LNCS 6853, Part II.  Bordeaux, Frane. 2011.</p>
1586<p class="bibliomixed" id="HPCA2012"><a href="#idp302288">[Lin and Medforth 2012] </a>Lin, Dan, Medforth, Nigel, Herdy, Kenneth S., Shriraman, Arrvindh and Cameron, Rob. Parabix: Boosting the efficiency of text processing on commodity processors. International Symposium on High-Performance Computer Architecture. New Orleans, LA. 2012.</p>
1587<p class="bibliomixed" id="HPCC2011"><a href="#idp299248">[You and Wang 2011] </a>You, Cheng-Han and Wang, Sheng-De. A Data Parallel Approach to XML Parsing and Query. 10th IEEE International Conference on High Performance Computing and Communications. Banff, Alberta, Canada. 2011.</p>
1588<p class="bibliomixed" id="E-SCIENCE2007"><a href="#idp297680">[Pan and Zhang 2007] </a>Pan, Yinfei, Zhang, Ying, Chiu, Kenneth and Lu, Wei. Parallel XML Parsing Using Meta-DFAs. International Conference on e-Science and Grid Computing.   Bangalore, India.  2007.</p>
1589<p class="bibliomixed" id="ICWS2008"><a href="#idp300944">[Pan and Zhang 2008a] </a>Pan, Yinfei, Zhang, Ying and Chiu, Kenneth. Hybrid Parallelism for XML SAX Parsing. IEEE International Conference on Web Services. Beijing, China.  2008.</p>
1590<p class="bibliomixed" id="IPDPS2008"><a href="#idp298432">[Pan and Zhang 2008b] </a>Pan, Yinfei, Zhang, Ying and Chiu, Kenneth. Simultaneous transducers for data-parallel XML parsing. International Parallel and Distributed Processing Symposium. Miami, Florida, USA.  2008.</p>
1591<p class="bibliomixed" id="HackersDelight"><a href="#idp529424">[Warren 2002] </a>Warren, Henry S.. Hacker's Delight. Addison-Wesley Professional. 2003.</p>
1592<p class="bibliomixed" id="lu2007advances"><a href="#idp696064">[Lu and Dos Santos 2007] </a>Lu, C.T., Dos Santos, R.F., Sripada, L.N. and Kou, Y.. Advances in GML for geospatial applications. Geoinformatica 11:131-157.  2007.</p>
1593<p class="bibliomixed" id="lake2004geography"><a href="#idp695168">[Lake and Burggraf 2004] </a>Lake, R., Burggraf, D.S., Trninic, M. and Rae, L.. Geography mark-up language (GML) [foundation for the geo-web]. Wiley.  Chichester.  2004.</p>
1595<div class="footnotes">
1596<br><hr width="100" align="left">
1597<div id="FSM" class="footnote"><p><sup class="fn-label"><a href="#FSM-ref" class="footnoteref">[1]</a></sup> Herein FSM applications are considered software systems whose
1598            behaviour is defined by the inputs, current state and the events associated with
1599              transitions of states.</p></div>
1600<div id="idp687200" class="footnote"><p><sup class="fn-label"><a href="#idp687200-ref" class="footnoteref">[2]</a></sup> Markup Density: the ratio of markup bytes used to define the structure
1601             of the document vs. its file size.</p></div>
1604<div id="balisage-footer"><h3 style="font-family: serif; margin:0.25em; font-style: italic">Balisage Series on Markup Technologies</h3></div>
1608<div id="balisage-footer"><h3 style="font-family: serif; margin:0.25em">
1609<i>Balisage:</i> <small>The Markup Conference</small>
Note: See TracBrowser for help on using the repository browser.