source: docs/Balisage13/Bal2013came0601/Bal2013came0601.html @ 3043

Last change on this file since 3043 was 3043, checked in by ksherdy, 7 years ago

Added other xslt script to DO_XSTL script.

File size: 72.2 KB
1<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "">
2<html lang="en">
4<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
6<link rel="stylesheet" href="balisage-proceedings.css" type="text/css">
7<meta name="keywords" content="">
10<div id="balisage-header"><h1 style="text-align: right; font-family: serif; margin:0.25em">
11<i>Balisage:</i> <small>The Markup Conference</small>
13<html lang="en">
16<link rel="stylesheet" href="balisage-proceedings.css" type="text/css">
17<meta name="generator" content="Balisage Conference Proceedings XSLT (v1.2)">
18<script type="text/javascript">
19    function toggle(folderID) {
20      folder = document.getElementById("folder-"+folderID);
21      icon = document.getElementById("icon-"+folderID)
22      // need to:
23      //   switch between 'none' and 'block'
24      //   switch between collapse and expand icons
25      if ( != "block") {
26 = "block";
27        icon.src = "minus.png" ;
28        icon.alt = "collapse" ;
29      }
30      else {
31 = "none";
32        icon.src = "plus.png" ;
33        icon.alt = "expand" ;
34      };
35      return;
36    }
38   function hidecite(citeID) {
39     cite = document.getElementById(citeID);
40 = "none";
41     return;
42   }
44   function showcite(citeID,anchorID) {
45     cite = document.getElementById(citeID);
47     citeLeft =;
48     citeTop =;
50     if (citeLeft != (getLeft(anchorID)+"px") ||
51         citeTop != (getTop(anchorID)+"px")) {
52 = "none";
53     }
55     if ( != "table-cell") {
56        movebox(citeID, anchorID);
57 = "table-cell";
58     }
59     else {
60 = "none";
61     };
62     return;
63   }
65   function movebox(citeID, anchorID) {
67     cite = document.getElementById(citeID);
69     // alert(cite.offsetWidth + " by " + cite.offsetHeight)
71     horizontalOffset = getLeft(anchorID);
72     // horizontalOffset = (inMain(anchorID)) ?
73     // (horizontalOffset - 260) : (horizontalOffset + 20)
74     // (horizontalOffset - (20 + cite.offsetWidth)) : (horizontalOffset + 20)
76     verticalOffset = getTop(anchorID);
77     // verticalOffset = (inMain(anchorID)) ?
78     // (verticalOffset - 20) : (verticalOffset + 20)
79     // (verticalOffset - (20 + cite.offsetHeight)) : (verticalOffset + 20)
81     /*
82     horizontalOffset = getAbsoluteLeft(anchorID) - getScrollLeft(anchorID) + 20;
83     if (inMain(anchorID)) {
84       horizontalOffset = horizontalOffset - 300;
85     }
86     verticalOffset = getAbsoluteTop(anchorID) - getScrollTop(anchorID) - 40;
87     if (inMain(anchorID)) {
88       verticalOffset = verticalOffset - 300;
89     }
90     */
92 = horizontalOffset + "px";
93 = verticalOffset + "px";
94   }
96   function getLeft(objectID) {
97     var left = getAbsoluteLeft(objectID) - getScrollLeft(objectID);
98     left = (inMain(objectID)) ? (left - 260) : (left + 20)   
99     return left;
100   }
102   function getTop(objectID) {
103     var top = getAbsoluteTop(objectID) - getScrollTop(objectID);
104     top = (inMain(objectID)) ? (top - 50) : (top + 20)
105     return top;     
106   }
108   function getAbsoluteLeft(objectId) {
109   // Get an object left position from the upper left viewport corner
110     o = document.getElementById(objectId)
111     oLeft = o.offsetLeft            // Get left position from the parent object
112     while(o.offsetParent!=null) {   // Parse the parent hierarchy up to the document element
113       oParent = o.offsetParent    // Get parent object reference
114       oLeft += oParent.offsetLeft // Add parent left position
115       o = oParent
116      }
117    return oLeft
118    }
120    function getAbsoluteTop(objectId) {
121    // Get an object top position from the upper left viewport corner
122      o = document.getElementById(objectId)
123      oTop = o.offsetTop            // Get top position from the parent object
124      while(o.offsetParent!=null) { // Parse the parent hierarchy up to the document element
125        oParent = o.offsetParent  // Get parent object reference
126        oTop += oParent.offsetTop // Add parent top position
127        o = oParent
128      }
129    return oTop
130    }
132   function getScrollLeft(objectId) {
133     // Get a left scroll position
134     o = document.getElementById(objectId)
135     oLeft = o.scrollLeft            // Get left position from the parent object
136     while(o.offsetParent!=null) {   // Parse the parent hierarchy up to the document element
137       oParent = o.offsetParent    // Get parent object reference
138       oLeft += oParent.scrollLeft // Add parent left position
139       o = oParent
140      }
141    return oLeft
142    }
144    function getScrollTop(objectId) {
145    // Get a right scroll position
146      o = document.getElementById(objectId)
147      oTop = o.scrollTop            // Get top position from the parent object
148      while(o.offsetParent!=null) { // Parse the parent hierarchy up to the document element
149        oParent = o.offsetParent  // Get parent object reference
150        oTop += oParent.scrollTop // Add parent top position
151        o = oParent
152      }
153    return oTop
154    }
156    function inMain(objectId) {
157    // returns true if in div#main
158      o = document.getElementById(objectId)
159      while(o.parentNode != null) { // Parse the parent hierarchy up to div#main
160        oParent = o.parentNode
161        if ( == "main") { return true; }
162        o = oParent;
163      }
164    return false;
165    }
168   /*
169   function showcite(citeID) {
170      cite = document.getElementById(citeID);
171      if ( != "table-cell") {
172 = "table-cell";
173      }
174      else {
175 = "none";
176      };
177      return;
178    }
179    */
181      </script>
184<div class="inline-citation" id="cite-XMLChip09" style="display:none;width: 240px">
185<a class="quiet" href="javascript:hidecite('cite-XMLChip09')" style="font-size:90%"><img src="eks.png" alt="[x]" style="float:right;clear:both;margin:1px"></a><p style="margin:0ex">Leventhal, Michael and
186         Eric Lemoine 2009. The XML chip at 6 years. Proceedings of International Symposium on
187         Processing XML Efficiently 2009, Montréal.</p>
189<div class="inline-citation" id="cite-Datapower09" style="display:none;width: 240px">
190<a class="quiet" href="javascript:hidecite('cite-Datapower09')" style="font-size:90%"><img src="eks.png" alt="[x]" style="float:right;clear:both;margin:1px"></a><p style="margin:0ex">Salz, Richard,
191         Heather Achilles, and David Maze. 2009. Hardware and software trade-offs in the IBM
192         DataPower XML XG4 processor card. Proceedings of International Symposium on Processing XML
193         Efficiently 2009, Montréal.</p>
195<div class="inline-citation" id="cite-PPoPP08" style="display:none;width: 240px">
196<a class="quiet" href="javascript:hidecite('cite-PPoPP08')" style="font-size:90%"><img src="eks.png" alt="[x]" style="float:right;clear:both;margin:1px"></a><p style="margin:0ex">Cameron, Robert D. 2007. A Case Study
197         in SIMD Text Processing with Parallel Bit Streams UTF-8 to UTF-16 Transcoding. Proceedings
198         of 13th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming 2008, Salt
199         Lake City, Utah. On the Web at <a href="" class="link" target="_new"></a>.</p>
201<div class="inline-citation" id="cite-CASCON08" style="display:none;width: 240px">
202<a class="quiet" href="javascript:hidecite('cite-CASCON08')" style="font-size:90%"><img src="eks.png" alt="[x]" style="float:right;clear:both;margin:1px"></a><p style="margin:0ex">Cameron, Robert D.,
203         Kenneth S Herdy, and Dan Lin. 2008. High Performance XML Parsing Using Parallel Bit Stream
204         Technology. Proceedings of CASCON 2008. 13th ACM SIGPLAN Symposium on Principles and
205         Practice of Parallel Programming 2008, Toronto.</p>
207<div class="inline-citation" id="cite-SVGOpen08" style="display:none;width: 240px">
208<a class="quiet" href="javascript:hidecite('cite-SVGOpen08')" style="font-size:90%"><img src="eks.png" alt="[x]" style="float:right;clear:both;margin:1px"></a><p style="margin:0ex">Herdy, Kenneth
209         S., Robert D. Cameron and David S. Burggraf. 2008. High Performance GML to SVG
210         Transformation for the Visual Presentation of Geographic Data in Web-Based Mapping Systems.
211         Proceedings of SVG Open 6th International Conference on Scalable Vector Graphics,
212         Nuremburg. On the Web at
213            <a href="" class="link" target="_new"></a>.</p>
215<div class="inline-citation" id="cite-Ross06" style="display:none;width: 240px">
216<a class="quiet" href="javascript:hidecite('cite-Ross06')" style="font-size:90%"><img src="eks.png" alt="[x]" style="float:right;clear:both;margin:1px"></a><p style="margin:0ex">Ross, Kenneth A. 2006. Efficient hash
217         probes on modern processors. Proceedings of ICDE, 2006. ICDE 2006, Atlanta. On the Web at
218            <a href="" class="link" target="_new"></a>.</p>
220<div class="inline-citation" id="cite-ASPLOS09" style="display:none;width: 240px">
221<a class="quiet" href="javascript:hidecite('cite-ASPLOS09')" style="font-size:90%"><img src="eks.png" alt="[x]" style="float:right;clear:both;margin:1px"></a><p style="margin:0ex">Cameron, Robert D. and Dan
222         Lin. 2009. Architectural Support for SWAR Text Processing with Parallel Bit Streams: The
223         Inductive Doubling Principle. Proceedings of ASPLOS 2009, Washington, DC.</p>
225<div class="inline-citation" id="cite-Wu08" style="display:none;width: 240px">
226<a class="quiet" href="javascript:hidecite('cite-Wu08')" style="font-size:90%"><img src="eks.png" alt="[x]" style="float:right;clear:both;margin:1px"></a><p style="margin:0ex">Wu, Yu, Qi Zhang, Zhiqiang Yu and
227         Jianhui Li. 2008. A Hybrid Parallel Processing for XML Parsing and Schema Validation.
228         Proceedings of Balisage 2008, Montréal. On the Web at
229            <a href="" class="link" target="_new"></a>.</p>
231<div class="inline-citation" id="cite-u8u16" style="display:none;width: 240px">
232<a class="quiet" href="javascript:hidecite('cite-u8u16')" style="font-size:90%"><img src="eks.png" alt="[x]" style="float:right;clear:both;margin:1px"></a><p style="margin:0ex">u8u16 - A High-Speed UTF-8 to UTF-16
233         Transcoder Using Parallel Bit Streams Technical Report 2007-18. 2007. School of Computing
234         Science Simon Fraser University, June 21 2007.</p>
236<div class="inline-citation" id="cite-XML10" style="display:none;width: 240px">
237<a class="quiet" href="javascript:hidecite('cite-XML10')" style="font-size:90%"><img src="eks.png" alt="[x]" style="float:right;clear:both;margin:1px"></a><p style="margin:0ex">Extensible Markup Language (XML) 1.0 (Fifth
238         Edition) W3C Recommendation 26 November 2008. On the Web at
239            <a href="" class="link" target="_new"></a>.</p>
241<div class="inline-citation" id="cite-Unicode" style="display:none;width: 240px">
242<a class="quiet" href="javascript:hidecite('cite-Unicode')" style="font-size:90%"><img src="eks.png" alt="[x]" style="float:right;clear:both;margin:1px"></a><p style="margin:0ex">The Unicode Consortium. 2009. On the Web at
243            <a href="" class="link" target="_new"></a>.</p>
245<div class="inline-citation" id="cite-Pex06" style="display:none;width: 240px">
246<a class="quiet" href="javascript:hidecite('cite-Pex06')" style="font-size:90%"><img src="eks.png" alt="[x]" style="float:right;clear:both;margin:1px"></a><p style="margin:0ex"> Hilewitz, Y. and Ruby B. Lee.
247         2006. Fast Bit Compression and Expansion with Parallel Extract and Parallel Deposit
248         Instructions. Proceedings of the IEEE 17th International Conference on Application-Specific
249         Systems, Architectures and Processors (ASAP), pp. 65-72, September 11-13, 2006.</p>
251<div class="inline-citation" id="cite-InfoSet" style="display:none;width: 240px">
252<a class="quiet" href="javascript:hidecite('cite-InfoSet')" style="font-size:90%"><img src="eks.png" alt="[x]" style="float:right;clear:both;margin:1px"></a><p style="margin:0ex">XML Information Set (Second Edition) W3C
253         Recommendation 4 February 2004. On the Web at
254         <a href="" class="link" target="_new"></a>.</p>
256<div class="inline-citation" id="cite-Saxon" style="display:none;width: 240px">
257<a class="quiet" href="javascript:hidecite('cite-Saxon')" style="font-size:90%"><img src="eks.png" alt="[x]" style="float:right;clear:both;margin:1px"></a><p style="margin:0ex">SAXON The XSLT and XQuery Processor. On the Web
258         at <a href="" class="link" target="_new"></a>.</p>
260<div class="inline-citation" id="cite-Kay08" style="display:none;width: 240px">
261<a class="quiet" href="javascript:hidecite('cite-Kay08')" style="font-size:90%"><img src="eks.png" alt="[x]" style="float:right;clear:both;margin:1px"></a><p style="margin:0ex"> Kay, Michael Y. 2008. Ten Reasons Why Saxon
262         XQuery is Fast, IEEE Data Engineering Bulletin, December 2008.</p>
264<div class="inline-citation" id="cite-AElfred" style="display:none;width: 240px">
265<a class="quiet" href="javascript:hidecite('cite-AElfred')" style="font-size:90%"><img src="eks.png" alt="[x]" style="float:right;clear:both;margin:1px"></a><p style="margin:0ex"> The Ælfred XML Parser. On the Web at
266            <a href="" class="link" target="_new"></a>.</p>
268<div class="inline-citation" id="cite-JNI" style="display:none;width: 240px">
269<a class="quiet" href="javascript:hidecite('cite-JNI')" style="font-size:90%"><img src="eks.png" alt="[x]" style="float:right;clear:both;margin:1px"></a><p style="margin:0ex">Hitchens, Ron. Java NIO. O'Reilly, 2002.</p>
271<div class="inline-citation" id="cite-Expat" style="display:none;width: 240px">
272<a class="quiet" href="javascript:hidecite('cite-Expat')" style="font-size:90%"><img src="eks.png" alt="[x]" style="float:right;clear:both;margin:1px"></a><p style="margin:0ex">The Expat XML Parser.
273            <a href="" class="link" target="_new"></a>.</p>
275<div id="mast"><div class="content">
276<h2 class="article-title" id="idp543912"></h2>
277<div class="author">
278<h3 class="author">Nigel Medforth</h3>
279<div class="affiliation">
280<p class="jobtitle">Developer</p>
281<p class="orgname">International Characters Inc.</p>
283<div class="affiliation">
284<p class="jobtitle">Graduate Student, School of Computing Science</p>
285<p class="orgname">Simon Fraser University </p>
287<h5 class="author-email"><code class="email">&lt;<a class="email" href=""></a>&gt;</code></h5>
289<div class="author">
290<h3 class="author">Dan Lin</h3>
291<div class="affiliation">
292<p class="jobtitle"></p>
293<p class="orgname"></p>
295<h5 class="author-email"><code class="email">&lt;<a class="email" href="mailto:"></a>&gt;</code></h5>
297<div class="author">
298<h3 class="author">Kenneth Herdy</h3>
299<div class="affiliation">
300<p class="jobtitle">Graduate Student, School of Computing Science</p>
301<p class="orgname">Simon Fraser University </p>
303<h5 class="author-email"><code class="email">&lt;<a class="email" href=""></a>&gt;</code></h5>
305<div class="author">
306<h3 class="author">Rob Cameron</h3>
307<div class="affiliation">
308<p class="jobtitle">Professor of Computing Science</p>
309<p class="orgname">Simon Fraser University</p>
311<div class="affiliation">
312<p class="jobtitle">Chief Technology Officer</p>
313<p class="orgname">International Characters, Inc.</p>
315<h5 class="author-email"><code class="email">&lt;<a class="email" href=""></a>&gt;</code></h5>
317<div class="author">
318<h3 class="author">Arrvindh Shriraman</h3>
319<div class="affiliation">
320<p class="jobtitle"></p>
321<p class="orgname"></p>
323<h5 class="author-email"><code class="email">&lt;<a class="email" href="mailto:"></a>&gt;</code></h5>
325<div class="mast-box">
326<p class="title"><a href="javascript:toggle('idp544272')" class="quiet"><img class="toc-icon" src="plus.png" alt="expand" id="icon-idp544272"></a> <span onclick="javascript:toggle('idp544272');return true">Abstract</span></p>
327<div class="folder" id="folder-idp544272" style="display:none"><p id="idp544728">Prior research on the acceleration of XML processing
328using SIMD and multi-core parallelism has lead to
329a number of interesting research prototypes.  This work
330investigates the extent to which the techniques underlying
331these prototypes could result in systematic performance
332benefits when fully integrated into a commercial XML parser.
333The widely used Xerces-C++ parser of the Apache Software
334Foundation was chosen as the foundation for the study.
335A systematic restructuring of the parser was undertaken,
336while maintaining the existing API for application programmers.
337Using SIMD techniques alone, an increase in parsing speed
338of at least 50% was observed in a range of applications.
339When coupled with pipeline parallelism on dual core processors,
340improvements of 2x and beyond were realized.
343<div class="toc">
344<p><b>Table of Contents</b></p>
346<dt><span class="section"><a href="#idp531824" class="toc">Introduction</a></span></dt>
347<dt><span class="section"><a href="#idp532720" class="toc">Background</a></span></dt>
349<dt><span class="section"><a href="#idp533040" class="toc">Xerces C++ Structure</a></span></dt>
350<dt><span class="section"><a href="#idp539296" class="toc">The Parabix Framework</a></span></dt>
351<dt><span class="section"><a href="#idp690352" class="toc">Sequential vs. Parallel Paradigm</a></span></dt>
353<dt><span class="section"><a href="#idp693304" class="toc">Architecture</a></span></dt>
355<dt><span class="section"><a href="#idp693624" class="toc">Overview</a></span></dt>
356<dt><span class="section"><a href="#idp706920" class="toc">Character Set Adapters</a></span></dt>
357<dt><span class="section"><a href="#idp509944" class="toc">Combined Parallel Filtering</a></span></dt>
358<dt><span class="section"><a href="#idp733168" class="toc">Content Stream</a></span></dt>
359<dt><span class="section"><a href="#idp740320" class="toc">Namespace Handling</a></span></dt>
360<dt><span class="section"><a href="#idp751920" class="toc">Error Handling</a></span></dt>
362<dt><span class="section"><a href="#idp758960" class="toc">Multithreading with Pipeline Parallelism</a></span></dt>
363<dt><span class="section"><a href="#idp769752" class="toc">Performance</a></span></dt>
365<dt><span class="section"><a href="#idp771632" class="toc">Xerces C++ SAXCount</a></span></dt>
366<dt><span class="section"><a href="#idp776016" class="toc">GML2SVG</a></span></dt>
368<dt><span class="section"><a href="#idp776592" class="toc">Conclusion and Future Work</a></span></dt>
371<div class="mast-box">
372<p class="title"><a href="javascript:toggle('idp545808')" class="linkbox"><img class="toc-icon" src="plus.png" alt="expand" id="icon-idp545808"></a> <span onclick="javascript:toggle('idp545808');return true">Nigel Medforth</span></p>
373<div class="folder" id="folder-idp545808" style="display:none">
374<h5 class="author-email"><code class="email">&lt;<a class="email" href=""></a>&gt;</code></h5>
375<div class="affiliation">
376<p class="jobtitle">Developer</p>
377<p class="orgname">International Characters Inc.</p>
379<div class="affiliation">
380<p class="jobtitle">Graduate Student, School of Computing Science</p>
381<p class="orgname">Simon Fraser University </p>
383<div class="personblurb">
384<p id="idp546720">Nigel Medforth is a M.Sc. student at Simon Fraser University and the lead developer of icXML.
385             He earned a Bachelor of Technology in Information Technology at Kwantlen Polytechnic University in 2009
386             and was awarded the Dean’s Medal for Outstanding Achievement.</p>
387<p id="idp547496">Nigel is currently researching ways to leverage both the Parabix framework and stream-processing models
388             to further accelerate XML parsing within icXML.</p>
392<div class="mast-box">
393<p class="title"><a href="javascript:toggle('idp525576')" class="linkbox"><img class="toc-icon" src="plus.png" alt="expand" id="icon-idp525576"></a> <span onclick="javascript:toggle('idp525576');return true">Kenneth Herdy</span></p>
394<div class="folder" id="folder-idp525576" style="display:none">
395<h5 class="author-email"><code class="email">&lt;<a class="email" href=""></a>&gt;</code></h5>
396<div class="affiliation">
397<p class="jobtitle">Graduate Student, School of Computing Science</p>
398<p class="orgname">Simon Fraser University </p>
400<div class="personblurb">
401<p id="idp526440"> Ken Herdy completed an Advanced Diploma of Technology in Geographical Information
402               Systems at the British Columbia Institute of Technology in 2003 and earned a Bachelor
403               of Science in Computing Science with a Certificate in Spatial Information Systems at
404               Simon Fraser University in 2005.
405                                                </p>
406<p id="idp526976"> Ken is currently pursuing PhD studies in Computing Science at Simon Fraser
407               University with industrial scholarship support from the Natural Sciences and
408               Engineering Research Council of Canada, the Mathematics of Information Technology and
409               Complex Systems NCE, and the BC Innovation Council. His research focus is an analysis
410               of the principal techniques that may be used to improve XML processing performance in
411               the context of the Geography Markup Language (GML).
412                                                </p>
416<div class="mast-box">
417<p class="title"><a href="javascript:toggle('idp528720')" class="linkbox"><img class="toc-icon" src="plus.png" alt="expand" id="icon-idp528720"></a> <span onclick="javascript:toggle('idp528720');return true">Rob Cameron</span></p>
418<div class="folder" id="folder-idp528720" style="display:none">
419<h5 class="author-email"><code class="email">&lt;<a class="email" href=""></a>&gt;</code></h5>
420<div class="affiliation">
421<p class="jobtitle">Professor of Computing Science</p>
422<p class="orgname">Simon Fraser University</p>
424<div class="affiliation">
425<p class="jobtitle">Chief Technology Officer</p>
426<p class="orgname">International Characters, Inc.</p>
428<div class="personblurb"><p id="idp1352">Dr. Rob Cameron is Professor of Computing Science and Associate Dean of
429             Applied Sciences at Simon Fraser
430                University.   His research interests include programming language
431                and software system technology, with a specific focus on high performance
432                text processing using SIMD and multicore parallelism.  He is the developer
433                of the REX XML shallow parser as well as the parallel bit stream (Parabix)
434                framework for SIMD text processing. 
435              </p></div>
439<div id="navbar"></div>
440<div id="balisage-header" style="background-color: #6699CC">
441<a class="quiet" href=""><img style="float:right;border:none" alt="Balisage logo" height="130" src=""></a><h2 class="page-header">Balisage: The Markup Conference</h2>
442<h1 class="page-header">Proceedings preview</h1>
444<div id="main">
445<div class="article">
446<h2 class="article-title" id="idp543912"></h2>
447<div class="section" id="idp531824">
448<h2 class="title" style="clear: both">Introduction</h2>
449<p id="idp532144"></p>
450<p id="idp532272"></p>
451<p id="idp532400"></p>
452<p id="idp532528"></p>
454<div class="section" id="idp532720">
455<h2 class="title" style="clear: both">Background</h2>
456<div class="section" id="idp533040">
457<h3 class="title" style="clear: both">Xerces C++ Structure</h3>
458<p id="idp533360">
459The Xerces C++ parser
464features comprehensive support for a variety of character encodings
465both commonplace (e.g., UTF-8, UTF-16) and rarely used (e.g., EBCDIC), support for
466multiple XML vocabularies through the XML namespace
467mechanism, as well as complete implementations
468of structure and data validation through multiple grammars
469declared using either legacy DTDs (document type
470definitions) or modern XML Schema facilities.
471Xerces also supports several APIs for accessing
472parser services, including event-based parsing
473using either pull parsing or SAX/SAX2 push-style
474parsing as well as a DOM tree-based parsing interface.
476<p id="idp534960">
480Xerces, like all traditional parsers, processes XML documents sequentially a byte-at-a-time from the
481first to the last byte of input data. Each byte passes through several processing layers and is
482classified and eventually validated within the context of the document state.
483This introduces implicit dependencies between the various tasks within the application that make it
484difficult to optimize for performance.
485As a complex software system, no one feature dominates the overall parsing performance.
486Figure \ref{fig:xerces-profile} shows the execution time profile of the top ten functions in a typical run.
487Even if it were possible, Amdahl's Law dictates that tackling any one of these functions for
488parallelization in isolation would only produce a minute improvement in performance.
489Unfortunately, early investigation into these functions found
490that incorporating speculation-free thread-level parallelization was impossible
491and they were already performing well in their given tasks;
492thus only trivial enhancements were attainable.
493In order to obtain a systematic acceleration of Xerces,
494it should be expected that a comprehensive restructuring
495is required, involving all aspects of the parser.
497<p id="idp538344">
504<div class="section" id="idp539296">
505<h3 class="title" style="clear: both">The Parabix Framework</h3>
506<p id="idp539616">
507The Parabix (parallel bit stream) framework is a transformative approach to XML parsing
508(and other forms of text processing.) The key idea is to exploit the availability of wide
509SIMD registers (e.g., 128-bit) in commodity processors to represent data from long blocks
510of input data by using one register bit per single input byte.
511To facilitate this, the input data is first transposed into a set of basis bit streams.
514Boolean-logic operations\footnote{∧, \√ and ¬ denote the boolean AND, OR and NOT operators.}
515are used to classify the input bits into a set of <span class="ital">character-class bit streams</span>, which identify key
516characters (or groups of characters) with a <code class="code">1</code>.
517For example, one of the fundamental characters in XML is a left-angle bracket.
518A character is an <code class="code">'&lt;' if and only if ¬(b<sub>0</sub> âˆš b<sub>1</sub>) ∧ (b<sub>2</sub> âˆ§
519b<sub>3</sub>) ∧ (b<sub>4</sub> âˆ§ b<sub>5</sub>) ∧ ¬ (b<sub>6</sub> âˆš b<sub>7</sub>) = 1</code>.
520Similarly, a character is numeric,
521<code class="code">[0-9] if and only if ¬(b<sub>0</sub> âˆš b<sub>1</sub>) ∧ (b<sub>2</sub> âˆ§ b<sub>3</sub>) ∧ ¬(b<sub>4</sub> âˆ§ (b<sub>5</sub> âˆš b<sub>6</sub>))</code>.
522An important observation here is that ranges of characters may
523require fewer operations than individual characters and
525multiple classes can share the classification cost.
527<p id="idp682296">
530<p id="idp683752">
531Consider, for example, the XML source data stream shown in the first line of .
532The remaining lines of this figure show several parallel bit streams that are computed in Parabix-style
533parsing, with each bit of each stream in one-to-one correspondence to the source character code units
534of the input stream.
535For clarity, 1 bits are denoted with 1 in each stream and 0 bits are represented as underscores.
536The first bit stream shown is that for the opening
537angle brackets that represent tag openers in XML.
538The second and third streams show a partition of the
539tag openers into start tag marks and end tag marks
540depending on the character immediately following the
541opener (i.e., <code class="code">"/"</code>) or not.  The remaining three
542lines show streams that can be computed in subsequent
543parsing (using the technique
544of bitstream addition \cite{cameron-EuroPar2011}), namely streams marking the element names,
545attribute names and attribute values of tags. 
547<p id="idp687088">
548Two intuitions may help explain how the Parabix approach can lead
549to improved XML parsing performance. The first is that
550the use of the full register width offers a considerable
551information advantage over sequential byte-at-a-time
552parsing.  That is, sequential processing of bytes
553uses just 8 bits of each register, greatly limiting the
554processor resources that are effectively being used at any one time.
555The second is that byte-at-a-time loop scanning loops are actually
556often just computing a single bit of information per iteration:
557is the scan complete yet?
558Rather than computing these individual decision-bits, an approach that computes
559many of them in parallel (e.g., 128 bytes at a time using 128-bit registers)
560should provide substantial benefit.
562<p id="idp688048">
563Previous studies have shown that the Parabix approach improves many aspects of XML processing,
564including transcoding \cite{Cameron2008}, character classification and validation,
565tag parsing and well-formedness checking. 
566The first Parabix parser used processor bit scan instructions to considerably accelerate
567sequential scanning loops for individual characters \cite{CameronHerdyLin2008}.
568Recent work has incorporated a method of parallel
569scanning using bitstream addition \cite{cameron-EuroPar2011}, as
570well as combining SIMD methods with 4-stage pipeline parallelism to further improve
571throughput \cite{HPCA2012}.
572Although these research prototypes handled the full syntax of schema-less XML documents,
573they lacked the functionality required by full XML parsers.
575<p id="idp689800">
576Commercial XML processors support transcoding of multiple character sets and can parse and
577validate against multiple document vocabularies.
578Additionally, they provide API facilities beyond those found in research prototypes,
579including the widely used SAX, SAX2 and DOM interfaces.
582<div class="section" id="idp690352">
583<h3 class="title" style="clear: both">Sequential vs. Parallel Paradigm</h3>
584<p id="idp690672">
585Xerces—like all traditional XML parsers—processes XML documents sequentially.
586Each character is examined to distinguish between the
587XML-specific markup, such as a left angle bracket <code class="code">&lt;</code>, and the
588content held within the document. 
589As the parser progresses through the document, it alternates between markup scanning,
590validation and content processing modes.
592<p id="idp691744">
593In other words, Xerces belongs to an equivalent class applications termed FSM applications\footnote{
594  Herein FSM applications are considered software systems whose behaviour is defined by the inputs,
595  current state and the events associated with transitions of states.}.
596Each state transition indicates the processing context of subsequent characters.
597Unfortunately, textual data tends to be unpredictable and any character could induce a state transition.
599<p id="idp692408">
600Parabix-style XML parsers utilize a concept of layered processing.
601A block of source text is transformed into a set of lexical bitstreams,
602which undergo a series of operations that can be grouped into logical layers,
603e.g., transposition, character classification, and lexical analysis.
604Each layer is pipeline parallel and require neither speculation nor pre-parsing stages\cite{HPCA2012}.
605To meet the API requirements of the document-ordered Xerces output,
606the results of the Parabix processing layers must be interleaved to produce the equivalent behaviour.
610<div class="section" id="idp693304">
611<h2 class="title" style="clear: both">Architecture</h2>
612<div class="section" id="idp693624">
613<h3 class="title" style="clear: both">Overview</h3>
614<p id="idp694072">
615icXML is more than an optimized version of Xerces. Many components were grouped, restructured and
616rearchitected with pipeline parallelism in mind.
617In this section, we highlight the core differences between the two systems.
618As shown in Figure \ref{fig:xerces-arch}, Xerces
619is comprised of five main modules: the transcoder, reader, scanner, namespace binder, and validator.
620The <span class="ital">Transcoder</span> converts source data into UTF-16 before Xerces parses it as XML;
621the majority of the character set encoding validation is performed as a byproduct of this process.
622The <span class="ital">Reader</span> is responsible for the streaming and buffering of all raw and transcoded (UTF-16) text.
623It tracks the current line/column position,
625performs line-break normalization and validates context-specific character set issues,
626such as tokenization of qualified-names.
627The <span class="ital">Scanner</span> pulls data through the reader and constructs the intermediate representation (IR)
628of the document; it deals with all issues related to entity expansion, validates
629the XML well-formedness constraints and any character set encoding issues that cannot
630be completely handled by the reader or transcoder (e.g., surrogate characters, validation
631and normalization of character references, etc.)
632The <span class="ital">Namespace Binder</span> is a core piece of the element stack.
633It handles namespace scoping issues between different XML vocabularies.
634This allows the scanner to properly select the correct schema grammar structures.
635The <span class="ital">Validator</span> takes the IR produced by the Scanner (and
636potentially annotated by the Namespace Binder) and assesses whether the final output matches
637the user-defined DTD and schema grammar(s) before passing it to the end-user.
639<p id="idp697696">
642<p id="idp697952">
643In icXML functions are grouped into logical components.
644As shown in Figure \ref{fig:icxml-arch}, two major categories exist: (1) the Parabix Subsystem and (2) the Markup Processor.
645All tasks in (1) use the Parabix Framework \cite{HPCA2012}, which represents data as a set of parallel bitstreams.
646The <span class="ital">Character Set Adapter</span>, discussed in Section \ref{arch:character-set-adapter},
647mirrors Xerces's Transcoder duties; however instead of producing UTF-16 it produces a
648set of lexical bitstreams, similar to those shown in Figure \ref{fig:parabix1}.
649These lexical bitstreams are later transformed into UTF-16 in the Content Stream Generator,
650after additional processing is performed.
651The first precursor to producing UTF-16 is the <span class="ital">Parallel Markup Parser</span> phase.
652It takes the lexical streams and produces a set of marker bitstreams in which a 1-bit identifies
653significant positions within the input data. One bitstream for each of the critical piece of information is created, such as
654the beginning and ending of start tags, end tags, element names, attribute names, attribute values and content.
655Intra-element well-formedness validation is performed as an artifact of this process.
656Like Xerces, icXML must provide the Line and Column position of each error.
657The <span class="ital">Line-Column Tracker</span> uses the lexical information to keep track of the document position(s) through the use of an
658optimized population count algorithm, described in Section \ref{section:arch:errorhandling}.
659From here, two data-independent branches exist: the Symbol Resolver and Content Preparation Unit.
661<p id="idp700912">
662A typical XML file contains few unique element and attribute names—but each of them will occur frequently.
663icXML stores these as distinct data structures, called symbols, each with their own global identifier (GID).
664Using the symbol marker streams produced by the Parallel Markup Parser, the <span class="ital">Symbol Resolver</span> scans through
665the raw data to produce a sequence of GIDs, called the <span class="ital">symbol stream</span>.
667<p id="idp702552">
668The final components of the Parabix Subsystem are the <span class="ital">Content Preparation Unit</span> and <span class="ital">Content Stream Generator</span>.
669The former takes the (transposed) basis bitstreams and selectively filters them, according to the
670information provided by the Parallel Markup Parser, and the latter transforms the
671filtered streams into the tagged UTF-16 <span class="ital">content stream</span>, discussed in Section \ref{section:arch:contentstream}.
673<p id="idp704040">
674Combined, the symbol and content stream form icXML's compressed IR of the XML document.
675The <span class="ital">Markup Processor</span>~parses the IR to validate and produce the sequential output for the end user.
676The <span class="ital">Final WF checker</span> performs inter-element well-formedness validation that would be too costly
677to perform in bit space, such as ensuring every start tag has a matching end tag.
678Xerces's namespace binding functionality is replaced by the <span class="ital">Namespace Processor</span>. Unlike Xerces,
679it is a discrete phase that produces a series of URI identifiers (URI IDs), the <span class="ital">URI stream</span>, which are
680associated with each symbol occurrence.
681This is discussed in Section \ref{section:arch:namespacehandling}.
682Finally, the <span class="ital">Validation</span> layer implements the Xerces's validator.
683However, preprocessing associated with each symbol greatly reduces the work of this stage.
685<p id="idp706600">
689<div class="section" id="idp706920">
690<h3 class="title" style="clear: both">Character Set Adapters</h3>
691<p id="idp707568">
692In Xerces, all input is transcoded into UTF-16 to simplify the parsing costs of Xerces itself and
693provide the end-consumer with a single encoding format.
694In the important case of UTF-8 to UTF-16 transcoding, the transcoding costs can be significant,
695because of the need to decode and classify each byte of input, mapping variable-length UTF-8
696byte sequences into 16-bit UTF-16 code units with bit manipulation operations.   
697In other cases, transcoding may involve table look-up operations for each byte of input.  In any case,
698transcoding imposes at least a cost of buffer copying.
700<p id="idp708352">
701In icXML, however,  the concept of Character Set Adapters (CSAs) is used to minimize transcoding costs.
702Given a specified input encoding, a CSA is responsible for checking that
703input code units represent valid characters, mapping the characters of the encoding into
704the appropriate bitstreams for XML parsing actions (i.e., producing the lexical item
705streams), as well as supporting ultimate transcoding requirements.   All of this work
706is performed using the parallel bitstream representation of the source input.
708<p id="idp709072">
709An important observation is that many character sets are an
710extension to the legacy 7-bit ASCII character set.  This includes the
711various ISO Latin character sets, UTF-8, UTF-16 and many others.
712Furthermore, all significant characters for parsing XML are confined to the
713ASCII repertoire.   Thus, a single common set of lexical item calculations
714serves to compute lexical item streams for all such ASCII-based character sets.
716<p id="idp709704">
717A second observation is that—regardless of which character set is used—quite
718often all of the characters in a particular block of input will be within the ASCII range.
719This is a very simple test to perform using the bitstream representation, simply confirming that the
720bit 0 stream is zero for the entire block.   For blocks satisfying this test,
721all logic dealing with non-ASCII characters can simply be skipped.
722Transcoding to UTF-16 becomes trivial as the high eight bitstreams of the
723UTF-16 form are each set to zero in this case.
725<p id="idp711256">
726A third observation is that repeated transcoding of the names of XML
727elements, attributes and so on can be avoided by using a look-up mechanism.
728That is, the first occurrence of each symbol is stored in a look-up
729table mapping the input encoding to a numeric symbol ID.   Transcoding
730of the symbol is applied at this time.  Subsequent look-up operations
731can avoid transcoding by simply retrieving the stored representation.
732As symbol look up is required to apply various XML validation rules,
733there is achieves the effect of transcoding each occurrence without
734additional cost.
736<p id="idp712040">
737The cost of individual character transcoding is avoided whenever a block of input is
738confined to the ASCII subset and for all but the first occurrence of any XML element or attribute name.
739Furthermore, when transcoding is required, the parallel bitstream representation
740supports efficient transcoding operations.   
741In the important case of UTF-8 to UTF-16 transcoding, the corresponding UTF-16 bitstreams
742can be calculated in bit parallel fashion based on UTF-8 streams \cite{Cameron2008},
743and all but the final bytes of multi-byte sequences can be marked for deletion as
744discussed in the following subsection.
745In other cases, transcoding within a block only need be applied for non-ASCII
746bytes, which are conveniently identified by iterating through the bit 0 stream
747using bit scan operations.
750<div class="section" id="idp509944">
751<h3 class="title" style="clear: both">Combined Parallel Filtering</h3>
752<p id="idp510296">
753As just mentioned, UTF-8 to UTF-16 transcoding involves marking
754all but the last bytes of multi-byte UTF-8 sequences as
755positions for deletion.   For example, the two
756Chinese characters <code class="code">䜠奜</code>
757are represented as two three-byte UTF-8 sequences <code class="code">E4 BD A0</code>
758and <code class="code">E5 A5 BD</code> while the UTF-16 representation must be
759compressed down to the two code units <code class="code">4F60</code> and <code class="code">597D</code>.
760In the bit parallel representation, this corresponds to a reduction
761from six bit positions representing UTF-8 code units (bytes)
762down to just two bit positions representing UTF-16 code units
763(double bytes).   This compression may be achieved by
764arranging to calculate the correct UTF-16 bits at the
765final position of each sequence and creating a deletion
766mask to mark the first two bytes of each 3-byte sequence
767for deletion.   In this case, the portion of the mask
768corresponding to these input bytes is the bit sequence
769<code class="code">110110</code>.  Using this approach, transcoding may then be
770completed by applying parallel deletion and inverse transposition of the
771UTF-16 bitstreams\cite{Cameron2008}.
773<p id="idp512784">
784<p id="idp515240">
785Rather than immediately paying the
786costs of deletion and transposition just for transcoding,
787however, icXML defers these steps so that the deletion
788masks for several stages of processing may be combined.
789In particular, this includes core XML requirements
790to normalize line breaks and to replace character
791reference and entity references by their corresponding
792text.   In the case of line break normalization,
793all forms of line breaks, including bare carriage
794returns (CR), line feeds (LF) and CR-LF combinations
795must be normalized to a single LF character in
796each case.   In icXML, this is achieved by
797first marking CR positions, performing two
798bit parallel operations to transform the marked
799CRs into LFs, and then marking for deletion any
800LF that is found immediately after the marked CR
801as shown by the Pablo source code in Figure \ref{fig:LBnormalization}.
804<p id="idp517744">
805In essence, the deletion masks for transcoding and
806for line break normalization each represent a bitwise
807filter; these filters can be combined using bitwise-or
808so that the parallel deletion algorithm need only be
809applied once.
811<p id="idp517936">
812A further application of combined filtering
813is the processing of XML character and entity
814references.   Consider, for example, the references <code class="code">&amp;</code> or <code class="code">&lt;</code>.
815which must be replaced in XML processing with 
816the single <code class="code">&amp;</code> and <code class="code">&lt;</code> characters, respectively.
817The approach in icXML is to mark all but the first character
818positions of each reference for deletion, leaving a
819single character position unmodified.  Thus, for the
820references <code class="code">&amp;</code> or <code class="code">&lt;</code> the
821masks <code class="code">01111</code> and <code class="code">011111</code> are formed and
822combined into the overall deletion mask.   After the
823deletion and inverse transposition operations are finally
824applied, a post-processing step inserts the proper character
825at these positions.   One note about this process is
826that it is speculative; references are assumed to generally
827be replaced by a single UTF-16 code unit.   In the case,
828that this is not true, it is addressed in post-processing.
830<p id="idp732200">
831The final step of combined filtering occurs during
832the process of reducing markup data to tag bytes
833preceding each significant XML transition as described
834in section~\ref{section:arch:contentstream}.  Overall, icXML
835avoids separate buffer copying operations for each of the
836these filtering steps, paying the cost of parallel
837deletion and inverse transposition only once. 
838Currently, icXML employs the parallel-prefix compress algorithm
839of Steele~\cite{HackersDelight}  Performance
840is independent of the number of positions deleted.
841Future versions of icXML are expected to
842take advantage of the parallel extract operation~\cite{HilewitzLee2006}
843that Intel is now providing in its Haswell architecture.
846<div class="section" id="idp733168">
847<h3 class="title" style="clear: both">Content Stream</h3>
848<p id="idp733512">
849A relatively-unique concept for icXML is the use of a filtered content stream.
850Rather that parsing an XML document in its original format, the input is transformed
851into one that is easier for the parser to iterate through and produce the sequential
853In , the source data
855is transformed into
858through the parallel filtering algorithm, described in section \ref{sec:parfilter}.
860<p id="idp735224">
861Combined with the symbol stream, the parser traverses the content stream to effectively
862reconstructs the input document in its output form.
863The initial <span class="ital">0</span> indicates an empty content string. The following <code class="code">&gt;</code>
864indicates that a start tag without any attributes is the first element in this text and
865the first unused symbol, <code class="code">document</code>, is the element name.
866Succeeding that is the content string <code class="code">fee</code>, which is null-terminated in accordance
867with the Xerces API specification. Unlike Xerces, no memory-copy operations
868are required to produce these strings, which as Figure~\ref{fig:xerces-profile} shows
869accounts for 6.83% of Xerces's execution time.
870Additionally, it is cheap to locate the terminal character of each string:
871using the String End bitstream, the Parabix Subsystem can effectively calculate the offset of each
872null character in the content stream in parallel, which in turn means the parser can
873directly jump to the end of every string without scanning for it.
875<p id="idp737880">
876Following <code class="code">'fee'</code> is a <code class="code">=</code>, which marks the existence of an attribute.
877Because all of the intra-element was performed in the Parabix Subsystem, this must be a legal attribute.
878Since attributes can only occur within start tags and must be accompanied by a textual value,
879the next symbol in the symbol stream must be the element name of a start tag,
880and the following one must be the name of the attribute and the string that follows the <code class="code">=</code> must be its value.
881However, the subsequent <code class="code">=</code> is not treated as an independent attribute because the parser has yet to
882read a <code class="code">&gt;</code>, which marks the end of a start tag. Thus only one symbol is taken from the symbol stream and
883it (along with the string value) is added to the element.
884Eventually the parser reaches a <code class="code">/</code>, which marks the existence of an end tag. Every end tag requires an
885element name, which means they require a symbol. Inter-element validation whenever an empty tag is detected to
886ensure that the appropriate scope-nesting rules have been applied.
889<div class="section" id="idp740320">
890<h3 class="title" style="clear: both">Namespace Handling</h3>
891<p id="idp740968">
892In XML, namespaces prevents naming conflicts when multiple vocabularies are used together.
893It is especially important when a vocabulary application-dependant meaning, such as when
894XML or SVG documents are embedded within XHTML files.
895Namespaces are bound to uniform resource identifiers (URIs), which are strings used to identify
896specific names or resources.
897On line 1 of Figure \ref{fig:namespace1}, the <code class="code">xmlns</code> attribute instructs the XML
898processor to bind the prefix <code class="code">p</code> to the URI '<code class="code"></code>' and the default (empty)
899prefix to <code class="code"></code>. Thus to the XML processor, the <code class="code">title</code> on line 2 and
900<code class="code">price</code> on line 4 both read as <code class="code">"":title</code> and <code class="code">"":price</code> 
901respectively, whereas on line 3 and 5, <code class="code">p:name</code> and <code class="code">price</code> are seen as
902<code class="code">"":name</code> and <code class="code">"":price</code>. Even though the actual element name
903<code class="code">price</code>, due to namespace scoping rules they are viewed as two uniquely-named items
904because the current vocabulary is determined by the namespace(s) that are in-scope.
906<p id="idp744856">
909<p id="idp745128">
910In both Xerces and icXML, every URI has a one-to-one mapping to a URI ID.
911These persist for the lifetime of the application through the use of a global URI pool.
912Xerces maintains a stack of namespace scopes that is pushed (popped) every time a start tag (end tag) occurs
913in the document. Because a namespace declaration affects the entire element, it must be processed prior to
914grammar validation. This is a costly process considering that a typical namespaced XML document only comes
915in one of two forms:
916(1) those that declare a set of namespaces upfront and never change them, and
917(2) those that repeatedly modify the namespaces in predictable patterns.
919<p id="idp745320">
920For that reason, icXML contains an independent namespace stack and utilizes bit vectors to cheaply perform
923When a prefix is declared (e.g., <code class="code">xmlns:p=""</code>), a namespace binding is created that maps
924the prefix (which are assigned Prefix IDs in the symbol resolution process) to the URI.
925Each unique namespace binding has a unique namespace id (NSID) and every prefix contains a bit vector marking every
926NSID that has ever been associated with it within the document. For example, in Table \ref{tbl:namespace1}, the
927prefix binding set of <code class="code">p</code> and <code class="code">xmlns</code> would be <code class="code">01</code> and <code class="code">11</code> respectively.
928To resolve the in-scope namespace binding for each prefix, a bit vector of the currently visible namespaces is
929maintained by the system. By ANDing the prefix bit vector with the currently visible namespaces, the in-scope
930NSID can be found using a bit-scan intrinsic.
931A namespace binding table, similar to Table \ref{tbl:namespace1}, provides the actual URI ID.
933<p id="idp749096">
936<p id="idp749368">
942<p id="idp750936">
943To ensure that scoping rules are adhered to,
944whenever a start tag is encountered, any modification to the currently visible namespaces is calculated and stored
945within a stack of bit vectors denoting the locally modified namespace bindings. When an end tag is found, the
946currently visible namespaces is XORed with the vector at the top of the stack.
947This allows any number of changes to be performed at each scope-level with a constant time.
951<div class="section" id="idp751920">
952<h3 class="title" style="clear: both">Error Handling</h3>
953<p id="idp752264">
955Xerces outputs error messages in two ways: through the programmer API and as thrown objects for fatal errors.
956As Xerces parses a file, it uses context-dependant logic to assess whether the next character is legal;
957if not, the current state determines the type and severity of the error.
958icXML emits errors in the similar manner—but how it discovers them is substantially different.
959Recall that in Figure \ref{fig:icxml-arch}, icXML is divided into two sections: the Parabix Subsystem and Markup Processor,
960each with its own system for detecting and producing error messages.
962<p id="idp753400">
963Within the Parabix Subsystem, all computations are performed in parallel, a block at a time.
964Errors are derived as artifacts of bitstream calculations, with a 1-bit marking the byte-position of an error within a block,
965and the type of error is determined by the equation that discovered it.
966The difficulty of error processing in this section is that in Xerces the line and column number must be given
967with every error production. Two major issues exist because of this:
968(1) line position adheres to XML white-normalization rules; as such, some sequences of characters, e.g., a carriage return
969followed by a line feed, are counted as a single new line character.
970(2) column position is counted in characters, not bytes or code units;
971thus multi-code-unit code-points and surrogate character pairs are all counted as a single column position.
972Note that typical XML documents are error-free but the calculation of the
973line/column position is a constant overhead in Xerces.
974To reduce this, icXML pushes the bulk cost of the line/column calculation to the occurrence of the error and
975performs the minimal amount of book-keeping necessary to facilitate it.
976icXML leverages the byproducts of the Character Set Adapter (CSA) module and amalgamates the information
977within the Line Column Tracker (LCT).
978One of the CSA's major responsibilities is transcoding an input text.
979During this process, white-space normalization rules are applied and multi-code-unit and surrogate characters are detected
980and validated.
981A <span class="ital">line-feed bitstream</span>, which marks the positions of the normalized new lines characters, is a natural derivative of
982this process.
983Using an optimized population count algorithm, the line count can be summarized cheaply for each valid block of text.
985Column position is more difficult to calculate.
986It is possible to scan backwards through the bitstream of new line characters to determine the distance (in code-units)
987between the position between which an error was detected and the last line feed. However, this distance may exceed
988than the actual character position for the reasons discussed in (2).
989To handle this, the CSA generates a <span class="ital">skip mask</span> bitstream by ORing together many relevant bitstreams,
990such as all trailing multi-code-unit and surrogate characters, and any characters that were removed during the
991normalization process.
992When an error is detected, the sum of those skipped positions is subtracted from the distance to determine the actual
993column number.
995<p id="idp757256">
996The Markup Processor is a state-driven machine. As such, error detection within it is very similar to Xerces.
997However, reporting the correct line/column is a much more difficult problem.
998The Markup Processor parses the content stream, which is a series of tagged UTF-16 strings.
999Each string is normalized in accordance with the XML specification.
1000All symbol data and unnecessary whitespace is eliminated from the stream;
1001thus its impossible to derive the current location using only the content stream.
1002To calculate the location, the Markup Processor borrows three additional pieces of information from the Parabix Subsystem:
1003the line-feed, skip mask, and a <span class="ital">deletion mask stream</span>, which is a bitstream denoting the (code-unit) position of every
1004datum that was suppressed from the source during the production of the content stream.
1005Armed with these, it is possible to calculate the actual line/column using
1006the same system as the Parabix Subsystem until the sum of the negated deletion mask stream is equal to the current position.
1010<div class="section" id="idp758960">
1011<h2 class="title" style="clear: both">Multithreading with Pipeline Parallelism</h2>
1012<p id="idp759352">
1013As discussed in section \ref{background:xerces}, Xerces can be considered a FSM application.
1014These are "embarrassingly sequential."\cite{Asanovic:EECS-2006-183} and notoriously difficult to parallelize.
1015However, icXML is designed to organize processing into logical layers.   
1016In particular, layers within the Parabix Subsystem are designed to operate
1017over significant segments of input data before passing their outputs on for
1018subsequent processing.  This fits well into the general model of pipeline
1019parallelism, in which each thread is in charge of a single module or group
1020of modules.
1022<p id="idp760888">
1023The most straightforward division of work in icXML is to separate
1024the Parabix Subsystem and the Markup Processor into distinct logical layers into two separate stages.
1025The resultant application, <span class="ital">icXML-p</span>, is a course-grained software-pipeline application.
1026In this case, the Parabix Subsystem thread <code class="code">T<sub>1</sub></code> reads 16k of XML input <code class="code">I</code> at a time and produces the
1027content, symbol and URI streams, then stores them in a pre-allocated shared data structure <code class="code">S</code>.
1028The Markup Processor thread <code class="code">T<sub>2</sub></code> consumes <code class="code">S</code>, performs well-formedness and grammar-based validation,
1029and the provides parsed XML data to the application through the Xerces API. 
1030The shared data structure is implemented using a ring buffer,
1031where every entry contains an independent set of data streams.
1032In the examples of Figure \ref{threads_timeline1} and \ref{threads_timeline2}, the ring buffer has four entries.
1033A lock-free mechanism is applied to ensure that each entry can only be read or written by one thread at the same time.
1034In Figure \ref{threads_timeline1} the processing time of <code class="code">T<sub>1</sub></code> is longer than <code class="code">T<sub>2</sub></code>;
1035thus <code class="code">T<sub>2</sub></code> always waits for <code class="code">T<sub>1</sub></code> to write to the shared memory.
1036Figure \ref{threads_timeline2} illustrates the scenario in which <code class="code">T<sub>1</sub></code> is faster
1037and must wait for <code class="code">T<sub>2</sub></code> to finish reading the shared data before it can reuse the memory space.
1039<p id="idp765984">
1042<p id="idp766256">
1043Overall, our design is intended to benefit a range of applications.
1044Conceptually, we consider two design points.
1045The first, the parsing performed by the Parabix Subsystem dominates at 67% of the overall cost,
1046with the cost of application processing (including the driver logic within the Markup Processor) at 33%.   
1047The second is almost the opposite scenario, the cost of application processing dominates at 60%,
1048while the cost of XML parsing represents an overhead of 40%.
1050<p id="idp766448">
1051Our design is predicated on a goal of using the Parabix
1052framework to achieve a 50% to 100% improvement in the parsing engine itself.   
1053In a best case scenario,
1054a 100% improvement of the Parabix Subsystem for the design point in which
1055XML parsing dominates at 67% of the total application cost.
1056In this case, the single-threaded icXML should achieve a 1.5x speedup over Xerces
1057so that the total application cost reduces to 67% of the original. 
1058However, in icXML-p, our ideal scenario gives us two well-balanced threads
1059each performing about 33% of the original work.   
1060In this case, Amdahl's law predicts that we could expect up to a 3x speedup at best.
1062<p id="idp768176">
1063At the other extreme of our design range, we consider an application
1064in which core parsing cost is 40%.  Assuming the 2x speedup of
1065the Parabix Subsystem over the corresponding Xerces core, single-threaded
1066icXML delivers a 25% speedup.  However, the most significant
1067aspect of our two-stage multi-threaded design then becomes the
1068ability to hide the entire latency of parsing within the serial time
1069required by the application.  In this case, we achieve
1070an overall speedup in processing time by 1.67x.
1072<p id="idp768880">
1073Although the structure of the Parabix Subsystem allows division of the work into
1074several pipeline stages and has been demonstrated to be effective
1075for four pipeline stages in a research prototype \cite{HPCA2012},
1076our analysis here suggests that the further pipelining of work within
1077the Parabix Subsystem is not worthwhile if the cost of application logic is little as
107833% of the end-to-end cost using Xerces.  To achieve benefits of
1079further parallelization with multi-core technology, there would
1080need to be reductions in the cost of application logic that
1081could match reductions in core parsing cost.
1084<div class="section" id="idp769752">
1085<h2 class="title" style="clear: both">Performance</h2>
1086<p id="idp770088">
1087We evaluate Xerces-C++ 3.1.1, icXML, icXML-p against two benchmarking applications:
1088the Xerces C++ SAXCount sample application,
1089and a real world GML to SVG transformation application.
1090We investigated XML parser performance using an Intel Core i7 quad-core
1091(Sandy Bridge) processor (3.40GHz, 4 physical cores, 8 threads (2 per core),
109232+32 kB (per core) L1 cache,
1093256 kB (per core) L2 cache,
10948 MB L3 cache) running the 64-bit version of Ubuntu 12.04 (Linux).
1096<p id="idp770752">
1097We analyzed the execution profiles of each XML parser
1098using the performance counters found in the processor.
1099We chose several key hardware events that provide insight into the profile of each
1100application and indicate if the processor is doing useful work. 
1101The set of events included in our study are:
1102processor cycles, branch instructions, branch mispredictions,
1103and cache misses. The Performance Application Programming Interface
1104(PAPI) Version 5.5.0 \cite{papi} toolkit
1105was installed on the test system to facilitate the
1106collection of hardware performance monitoring
1107statistics. In addition, we used the Linux perf \cite{perf} utility
1108to collect per core hardware events.
1110<div class="section" id="idp771632">
1111<h3 class="title" style="clear: both">Xerces C++ SAXCount</h3>
1112<p id="idp771976">
1113Xerces comes with sample applications that demonstrate salient features of the parser.
1114SAXCount is the simplest such application:
1115it counts the elements, attributes and characters of a given XML file using the (event based) SAX API
1116and prints out the totals.
1118<p id="idp772440">
1121<p id="idp772712">
1122Table \ref{XMLDocChars} shows the document characteristics of the XML input
1123files selected for the Xerces C++ SAXCount benchmark. The jaw.xml
1124represents document-oriented XML inputs and contains the three-byte and four-byte UTF-8 sequence
1125required for the UTF-8 encoding of Japanese characters. The remaining data files are data-oriented
1126XML documents and consist entirely of single byte encoded ASCII characters.
1128<p id="idp772904">
1129A key predictor of the overall parsing performance of an XML file is markup density\footnote{
1130  Markup Density: the ratio of markup bytes used to define the structure of the document vs. its file size.}.
1131This metric has substantial influence on the performance of traditional recursive descent XML parsers
1132because it directly corresponds to the number of state transitions that occur when parsing a document.
1133We use a mixture of document-oriented and
1134data-oriented XML files to analyze performance over a spectrum
1135of markup densities.
1137<p id="idp774656">
1138Figure \ref{perf_SAX} compares the performance of Xerces, icXML and pipelined icXML in terms of
1139CPU cycles per byte for the SAXCount application.
1140The speedup for icXML over Xerces is 1.3x to 1.8x.
1141With two threads on the multicore machine, icXML-p can achieve speedup up to 2.7x.
1142Xerces is substantially slowed by dense markup
1143but icXML is less affected through a reduction in branches and the use of parallel-processing techniques.
1144icXML-p performs better as markup-density increases because the work performed by each stage is
1145well balanced in this application.
1147<p id="idp775424">
1151<div class="section" id="idp776016">
1152<h3 class="title" style="clear: both">GML2SVG</h3>
1153<p id="idp776336"></p>
1156<div class="section" id="idp776592">
1157<h2 class="title" style="clear: both">Conclusion and Future Work</h2>
1158<p id="idp776944">
1159This paper is the first case study documenting the significant
1160performance benefits that may be realized through the integration
1161of parallel bitstream technology into existing widely-used software libraries.
1162In the case of the Xerces-C++ XML parser, the
1163combined integration of SIMD and multicore parallelism was
1164shown capable of dramatic producing dramatic increases in
1165throughput and reductions in branch mispredictions and cache misses.
1166The modified parser, going under the name icXML is designed
1167to provide the full functionality of the original Xerces library
1168with complete compatibility of APIs.  Although substantial
1169re-engineering was required to realize the
1170performance potential of parallel technologies, this
1171is an important case study demonstrating the general
1172feasibility of these techniques.
1174<p id="idp778768">
1175The further development of icXML to move beyond 2-stage
1176pipeline parallelism is ongoing, with realistic prospects for
1177four reasonably balanced stages within the library.  For
1178applications such as GML2SVG which are dominated by time
1179spent on XML parsing, such a multistage pipelined parsing
1180library should offer substantial benefits. 
1182<p id="idp779304">
1183The example of XML parsing may be considered prototypical
1184of finite-state machines applications which have sometimes
1185been considered "embarassingly sequential" and so
1186difficult to parallelize that "nothing works."  So the
1187case study presented here should be considered an important
1188data point in making the case that parallelization can
1189indeed be helpful across a broad array of application types.
1191<p id="idp780416">
1192To overcome the software engineering challenges in applying
1193parallel bitstream technology to existing software systems,
1194it is clear that better library and tool support is needed.
1195The techniques used in the implementation of icXML and
1196documented in this paper could well be generalized for
1197applications in other contexts and automated through
1198the creation of compiler technology specifically supporting
1199parallel bitstream programming.
1202<div class="bibliography" id="idp781376">
1203<h2 class="title" style="clear:both">Bibliography</h2>
1204<p class="bibliomixed" id="XMLChip09">[Leventhal and Lemoine 2009] Leventhal, Michael and
1205         Eric Lemoine 2009. The XML chip at 6 years. Proceedings of International Symposium on
1206         Processing XML Efficiently 2009, Montréal.</p>
1207<p class="bibliomixed" id="Datapower09">[Salz, Achilles and Maze 2009] Salz, Richard,
1208         Heather Achilles, and David Maze. 2009. Hardware and software trade-offs in the IBM
1209         DataPower XML XG4 processor card. Proceedings of International Symposium on Processing XML
1210         Efficiently 2009, Montréal.</p>
1211<p class="bibliomixed" id="PPoPP08">[Cameron 2007] Cameron, Robert D. 2007. A Case Study
1212         in SIMD Text Processing with Parallel Bit Streams UTF-8 to UTF-16 Transcoding. Proceedings
1213         of 13th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming 2008, Salt
1214         Lake City, Utah. On the Web at <a href="" class="link" target="_new"></a>.</p>
1215<p class="bibliomixed" id="CASCON08">[Cameron, Herdy and Lin 2008] Cameron, Robert D.,
1216         Kenneth S Herdy, and Dan Lin. 2008. High Performance XML Parsing Using Parallel Bit Stream
1217         Technology. Proceedings of CASCON 2008. 13th ACM SIGPLAN Symposium on Principles and
1218         Practice of Parallel Programming 2008, Toronto.</p>
1219<p class="bibliomixed" id="SVGOpen08">[Herdy, Burggraf and Cameron 2008] Herdy, Kenneth
1220         S., Robert D. Cameron and David S. Burggraf. 2008. High Performance GML to SVG
1221         Transformation for the Visual Presentation of Geographic Data in Web-Based Mapping Systems.
1222         Proceedings of SVG Open 6th International Conference on Scalable Vector Graphics,
1223         Nuremburg. On the Web at
1224            <a href="" class="link" target="_new"></a>.</p>
1225<p class="bibliomixed" id="Ross06">[Ross 2006] Ross, Kenneth A. 2006. Efficient hash
1226         probes on modern processors. Proceedings of ICDE, 2006. ICDE 2006, Atlanta. On the Web at
1227            <a href="" class="link" target="_new"></a>.</p>
1228<p class="bibliomixed" id="ASPLOS09">[Cameron and Lin 2009] Cameron, Robert D. and Dan
1229         Lin. 2009. Architectural Support for SWAR Text Processing with Parallel Bit Streams: The
1230         Inductive Doubling Principle. Proceedings of ASPLOS 2009, Washington, DC.</p>
1231<p class="bibliomixed" id="Wu08">[Wu et al. 2008] Wu, Yu, Qi Zhang, Zhiqiang Yu and
1232         Jianhui Li. 2008. A Hybrid Parallel Processing for XML Parsing and Schema Validation.
1233         Proceedings of Balisage 2008, Montréal. On the Web at
1234            <a href="" class="link" target="_new"></a>.</p>
1235<p class="bibliomixed" id="u8u16">[Cameron 2008] u8u16 - A High-Speed UTF-8 to UTF-16
1236         Transcoder Using Parallel Bit Streams Technical Report 2007-18. 2007. School of Computing
1237         Science Simon Fraser University, June 21 2007.</p>
1238<p class="bibliomixed" id="XML10">[XML 1.0] Extensible Markup Language (XML) 1.0 (Fifth
1239         Edition) W3C Recommendation 26 November 2008. On the Web at
1240            <a href="" class="link" target="_new"></a>.</p>
1241<p class="bibliomixed" id="Unicode">[Unicode] The Unicode Consortium. 2009. On the Web at
1242            <a href="" class="link" target="_new"></a>.</p>
1243<p class="bibliomixed" id="Pex06">[Hilewitz and Lee 2006]  Hilewitz, Y. and Ruby B. Lee.
1244         2006. Fast Bit Compression and Expansion with Parallel Extract and Parallel Deposit
1245         Instructions. Proceedings of the IEEE 17th International Conference on Application-Specific
1246         Systems, Architectures and Processors (ASAP), pp. 65-72, September 11-13, 2006.</p>
1247<p class="bibliomixed" id="InfoSet">[XML Infoset] XML Information Set (Second Edition) W3C
1248         Recommendation 4 February 2004. On the Web at
1249         <a href="" class="link" target="_new"></a>.</p>
1250<p class="bibliomixed" id="Saxon">[Saxon] SAXON The XSLT and XQuery Processor. On the Web
1251         at <a href="" class="link" target="_new"></a>.</p>
1252<p class="bibliomixed" id="Kay08">[Kay 2008]  Kay, Michael Y. 2008. Ten Reasons Why Saxon
1253         XQuery is Fast, IEEE Data Engineering Bulletin, December 2008.</p>
1254<p class="bibliomixed" id="AElfred">[Ælfred]  The Ælfred XML Parser. On the Web at
1255            <a href="" class="link" target="_new"></a>.</p>
1256<p class="bibliomixed" id="JNI">[Hitchens 2002] Hitchens, Ron. Java NIO. O'Reilly, 2002.</p>
1257<p class="bibliomixed" id="Expat">[Expat] The Expat XML Parser.
1258            <a href="" class="link" target="_new"></a>.</p>
1261<div id="balisage-footer"><h3 style="font-family: serif; margin:0.25em; font-style: italic">Balisage Series on Markup Technologies</h3></div>
1265<div id="balisage-footer"><h3 style="font-family: serif; margin:0.25em">
1266<i>Balisage:</i> <small>The Markup Conference</small>
Note: See TracBrowser for help on using the repository browser.