Changeset 3047 for docs


Ignore:
Timestamp:
Apr 19, 2013, 2:39:33 PM (6 years ago)
Author:
cameron
Message:

Xerces function and XML document characteristics tables

Location:
docs/Balisage13/Bal2013came0601
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • docs/Balisage13/Bal2013came0601/Bal2013came0601.html

    r3045 r3047  
    1 <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
    21<html lang="en">
    32<head>
    43<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
    54<title></title>
    6 <link rel="stylesheet" href="balisage-proceedings.css" type="text/css">
     5<link rel="stylesheet" href="balisage-plain.css" type="text/css">
    76<meta name="keywords" content="">
    87</head>
     
    1110<i>Balisage:</i> <small>The Markup Conference</small>
    1211</h1></div>
    13 <html lang="en">
    14 <head>
    15 <title></title>
    16 <link rel="stylesheet" href="balisage-proceedings.css" type="text/css">
    17 <meta name="generator" content="Balisage Conference Proceedings XSLT (v1.2)">
    18 <script type="text/javascript">
    19     function toggle(folderID) {
    20       folder = document.getElementById("folder-"+folderID);
    21       icon = document.getElementById("icon-"+folderID)
    22       // need to:
    23       //   switch folder.style.display between 'none' and 'block'
    24       //   switch between collapse and expand icons
    25       if (folder.style.display != "block") {
    26         folder.style.display = "block";
    27         icon.src = "minus.png" ;
    28         icon.alt = "collapse" ;
    29       }
    30       else {
    31         folder.style.display = "none";
    32         icon.src = "plus.png" ;
    33         icon.alt = "expand" ;
    34       };
    35       return;
    36     }
    37 
    38    function hidecite(citeID) {
    39      cite = document.getElementById(citeID);
    40      cite.style.display = "none";
    41      return;
    42    }
    43    
    44    function showcite(citeID,anchorID) {
    45      cite = document.getElementById(citeID);
    46 
    47      citeLeft = cite.style.left;
    48      citeTop = cite.style.top;
    49      
    50      if (citeLeft != (getLeft(anchorID)+"px") ||
    51          citeTop != (getTop(anchorID)+"px")) {
    52        cite.style.display = "none";
    53      }
    54      
    55      if (cite.style.display != "table-cell") {
    56         movebox(citeID, anchorID);
    57         cite.style.display = "table-cell";
    58      }
    59      else {
    60        cite.style.display = "none";
    61      };
    62      return;
    63    }
    64 
    65    function movebox(citeID, anchorID) {
    66 
    67      cite = document.getElementById(citeID);
    68      
    69      // alert(cite.offsetWidth + " by " + cite.offsetHeight)
    70      
    71      horizontalOffset = getLeft(anchorID);
    72      // horizontalOffset = (inMain(anchorID)) ?
    73      // (horizontalOffset - 260) : (horizontalOffset + 20)
    74      // (horizontalOffset - (20 + cite.offsetWidth)) : (horizontalOffset + 20)
    75 
    76      verticalOffset = getTop(anchorID);
    77      // verticalOffset = (inMain(anchorID)) ?
    78      // (verticalOffset - 20) : (verticalOffset + 20)
    79      // (verticalOffset - (20 + cite.offsetHeight)) : (verticalOffset + 20)
    80 
    81      /*
    82      horizontalOffset = getAbsoluteLeft(anchorID) - getScrollLeft(anchorID) + 20;
    83      if (inMain(anchorID)) {
    84        horizontalOffset = horizontalOffset - 300;
    85      }
    86      verticalOffset = getAbsoluteTop(anchorID) - getScrollTop(anchorID) - 40;
    87      if (inMain(anchorID)) {
    88        verticalOffset = verticalOffset - 300;
    89      }
    90      */
    91      
    92      cite.style.left = horizontalOffset + "px";
    93      cite.style.top = verticalOffset + "px";
    94    }
    95    
    96    function getLeft(objectID) {
    97      var left = getAbsoluteLeft(objectID) - getScrollLeft(objectID);
    98      left = (inMain(objectID)) ? (left - 260) : (left + 20)   
    99      return left;
    100    }
    101    
    102    function getTop(objectID) {
    103      var top = getAbsoluteTop(objectID) - getScrollTop(objectID);
    104      top = (inMain(objectID)) ? (top - 50) : (top + 20)
    105      return top;     
    106    }
    107    
    108    function getAbsoluteLeft(objectId) {
    109    // Get an object left position from the upper left viewport corner
    110      o = document.getElementById(objectId)
    111      oLeft = o.offsetLeft            // Get left position from the parent object
    112      while(o.offsetParent!=null) {   // Parse the parent hierarchy up to the document element
    113        oParent = o.offsetParent    // Get parent object reference
    114        oLeft += oParent.offsetLeft // Add parent left position
    115        o = oParent
    116       }
    117     return oLeft
    118     }
    119 
    120     function getAbsoluteTop(objectId) {
    121     // Get an object top position from the upper left viewport corner
    122       o = document.getElementById(objectId)
    123       oTop = o.offsetTop            // Get top position from the parent object
    124       while(o.offsetParent!=null) { // Parse the parent hierarchy up to the document element
    125         oParent = o.offsetParent  // Get parent object reference
    126         oTop += oParent.offsetTop // Add parent top position
    127         o = oParent
    128       }
    129     return oTop
    130     }
    131 
    132    function getScrollLeft(objectId) {
    133      // Get a left scroll position
    134      o = document.getElementById(objectId)
    135      oLeft = o.scrollLeft            // Get left position from the parent object
    136      while(o.offsetParent!=null) {   // Parse the parent hierarchy up to the document element
    137        oParent = o.offsetParent    // Get parent object reference
    138        oLeft += oParent.scrollLeft // Add parent left position
    139        o = oParent
    140       }
    141     return oLeft
    142     }
    143 
    144     function getScrollTop(objectId) {
    145     // Get a right scroll position
    146       o = document.getElementById(objectId)
    147       oTop = o.scrollTop            // Get top position from the parent object
    148       while(o.offsetParent!=null) { // Parse the parent hierarchy up to the document element
    149         oParent = o.offsetParent  // Get parent object reference
    150         oTop += oParent.scrollTop // Add parent top position
    151         o = oParent
    152       }
    153     return oTop
    154     }
    155 
    156     function inMain(objectId) {
    157     // returns true if in div#main
    158       o = document.getElementById(objectId)
    159       while(o.parentNode != null) { // Parse the parent hierarchy up to div#main
    160         oParent = o.parentNode
    161         if (o.id == "main") { return true; }
    162         o = oParent;
    163       }
    164     return false;
    165     }
    166 
    167 
    168    /*
    169    function showcite(citeID) {
    170       cite = document.getElementById(citeID);
    171       if (cite.style.display != "table-cell") {
    172         cite.style.display = "table-cell";
    173       }
    174       else {
    175         cite.style.display = "none";
    176       };
    177       return;
    178     }
    179     */
    180 
    181       </script>
    182 </head>
    183 <body>
    184 <div class="inline-citation" id="cite-XMLChip09" style="display:none;width: 240px">
    185 <a class="quiet" href="javascript:hidecite('cite-XMLChip09')" style="font-size:90%"><img src="eks.png" alt="[x]" style="float:right;clear:both;margin:1px"></a><p style="margin:0ex">Leventhal, Michael and
    186          Eric Lemoine 2009. The XML chip at 6 years. Proceedings of International Symposium on
    187          Processing XML Efficiently 2009, Montréal.</p>
    188 </div>
    189 <div class="inline-citation" id="cite-Datapower09" style="display:none;width: 240px">
    190 <a class="quiet" href="javascript:hidecite('cite-Datapower09')" style="font-size:90%"><img src="eks.png" alt="[x]" style="float:right;clear:both;margin:1px"></a><p style="margin:0ex">Salz, Richard,
    191          Heather Achilles, and David Maze. 2009. Hardware and software trade-offs in the IBM
    192          DataPower XML XG4 processor card. Proceedings of International Symposium on Processing XML
    193          Efficiently 2009, Montréal.</p>
    194 </div>
    195 <div class="inline-citation" id="cite-PPoPP08" style="display:none;width: 240px">
    196 <a class="quiet" href="javascript:hidecite('cite-PPoPP08')" style="font-size:90%"><img src="eks.png" alt="[x]" style="float:right;clear:both;margin:1px"></a><p style="margin:0ex">Cameron, Robert D. 2007. A Case Study
    197          in SIMD Text Processing with Parallel Bit Streams UTF-8 to UTF-16 Transcoding. Proceedings
    198          of 13th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming 2008, Salt
    199          Lake City, Utah. On the Web at <a href="http://research.ihost.com/ppopp08/" class="link" target="_new">http://research.ihost.com/ppopp08/</a>.</p>
    200 </div>
    201 <div class="inline-citation" id="cite-CASCON08" style="display:none;width: 240px">
    202 <a class="quiet" href="javascript:hidecite('cite-CASCON08')" style="font-size:90%"><img src="eks.png" alt="[x]" style="float:right;clear:both;margin:1px"></a><p style="margin:0ex">Cameron, Robert D.,
    203          Kenneth S Herdy, and Dan Lin. 2008. High Performance XML Parsing Using Parallel Bit Stream
    204          Technology. Proceedings of CASCON 2008. 13th ACM SIGPLAN Symposium on Principles and
    205          Practice of Parallel Programming 2008, Toronto.</p>
    206 </div>
    207 <div class="inline-citation" id="cite-SVGOpen08" style="display:none;width: 240px">
    208 <a class="quiet" href="javascript:hidecite('cite-SVGOpen08')" style="font-size:90%"><img src="eks.png" alt="[x]" style="float:right;clear:both;margin:1px"></a><p style="margin:0ex">Herdy, Kenneth
    209          S., Robert D. Cameron and David S. Burggraf. 2008. High Performance GML to SVG
    210          Transformation for the Visual Presentation of Geographic Data in Web-Based Mapping Systems.
    211          Proceedings of SVG Open 6th International Conference on Scalable Vector Graphics,
    212          Nuremburg. On the Web at
    213             <a href="http://www.svgopen.org/2008/papers/74-HighPerformance_GML_to_SVG_Transformation_for_the_Visual_Presentation_of_Geographic_Data_in_WebBased_Mapping_Systems/" class="link" target="_new">http://www.svgopen.org/2008/papers/74-HighPerformance_GML_to_SVG_Transformation_for_the_Visual_Presentation_of_Geographic_Data_in_WebBased_Mapping_Systems/</a>.</p>
    214 </div>
    215 <div class="inline-citation" id="cite-Ross06" style="display:none;width: 240px">
    216 <a class="quiet" href="javascript:hidecite('cite-Ross06')" style="font-size:90%"><img src="eks.png" alt="[x]" style="float:right;clear:both;margin:1px"></a><p style="margin:0ex">Ross, Kenneth A. 2006. Efficient hash
    217          probes on modern processors. Proceedings of ICDE, 2006. ICDE 2006, Atlanta. On the Web at
    218             <a href="www.cs.columbia.edu/~kar/pubsk/icde2007.pdf" class="link" target="_new">www.cs.columbia.edu/~kar/pubsk/icde2007.pdf</a>.</p>
    219 </div>
    220 <div class="inline-citation" id="cite-ASPLOS09" style="display:none;width: 240px">
    221 <a class="quiet" href="javascript:hidecite('cite-ASPLOS09')" style="font-size:90%"><img src="eks.png" alt="[x]" style="float:right;clear:both;margin:1px"></a><p style="margin:0ex">Cameron, Robert D. and Dan
    222          Lin. 2009. Architectural Support for SWAR Text Processing with Parallel Bit Streams: The
    223          Inductive Doubling Principle. Proceedings of ASPLOS 2009, Washington, DC.</p>
    224 </div>
    225 <div class="inline-citation" id="cite-Wu08" style="display:none;width: 240px">
    226 <a class="quiet" href="javascript:hidecite('cite-Wu08')" style="font-size:90%"><img src="eks.png" alt="[x]" style="float:right;clear:both;margin:1px"></a><p style="margin:0ex">Wu, Yu, Qi Zhang, Zhiqiang Yu and
    227          Jianhui Li. 2008. A Hybrid Parallel Processing for XML Parsing and Schema Validation.
    228          Proceedings of Balisage 2008, Montréal. On the Web at
    229             <a href="http://www.balisage.net/Proceedings/vol1/html/Wu01/BalisageVol1-Wu01.html" class="link" target="_new">http://www.balisage.net/Proceedings/vol1/html/Wu01/BalisageVol1-Wu01.html</a>.</p>
    230 </div>
    231 <div class="inline-citation" id="cite-u8u16" style="display:none;width: 240px">
    232 <a class="quiet" href="javascript:hidecite('cite-u8u16')" style="font-size:90%"><img src="eks.png" alt="[x]" style="float:right;clear:both;margin:1px"></a><p style="margin:0ex">u8u16 - A High-Speed UTF-8 to UTF-16
    233          Transcoder Using Parallel Bit Streams Technical Report 2007-18. 2007. School of Computing
    234          Science Simon Fraser University, June 21 2007.</p>
    235 </div>
    236 <div class="inline-citation" id="cite-XML10" style="display:none;width: 240px">
    237 <a class="quiet" href="javascript:hidecite('cite-XML10')" style="font-size:90%"><img src="eks.png" alt="[x]" style="float:right;clear:both;margin:1px"></a><p style="margin:0ex">Extensible Markup Language (XML) 1.0 (Fifth
    238          Edition) W3C Recommendation 26 November 2008. On the Web at
    239             <a href="http://www.w3.org/TR/REC-xml/" class="link" target="_new">http://www.w3.org/TR/REC-xml/</a>.</p>
    240 </div>
    241 <div class="inline-citation" id="cite-Unicode" style="display:none;width: 240px">
    242 <a class="quiet" href="javascript:hidecite('cite-Unicode')" style="font-size:90%"><img src="eks.png" alt="[x]" style="float:right;clear:both;margin:1px"></a><p style="margin:0ex">The Unicode Consortium. 2009. On the Web at
    243             <a href="http://unicode.org/" class="link" target="_new">http://unicode.org/</a>.</p>
    244 </div>
    245 <div class="inline-citation" id="cite-Pex06" style="display:none;width: 240px">
    246 <a class="quiet" href="javascript:hidecite('cite-Pex06')" style="font-size:90%"><img src="eks.png" alt="[x]" style="float:right;clear:both;margin:1px"></a><p style="margin:0ex"> Hilewitz, Y. and Ruby B. Lee.
    247          2006. Fast Bit Compression and Expansion with Parallel Extract and Parallel Deposit
    248          Instructions. Proceedings of the IEEE 17th International Conference on Application-Specific
    249          Systems, Architectures and Processors (ASAP), pp. 65-72, September 11-13, 2006.</p>
    250 </div>
    251 <div class="inline-citation" id="cite-InfoSet" style="display:none;width: 240px">
    252 <a class="quiet" href="javascript:hidecite('cite-InfoSet')" style="font-size:90%"><img src="eks.png" alt="[x]" style="float:right;clear:both;margin:1px"></a><p style="margin:0ex">XML Information Set (Second Edition) W3C
    253          Recommendation 4 February 2004. On the Web at
    254          <a href="http://www.w3.org/TR/xml-infoset/" class="link" target="_new">http://www.w3.org/TR/xml-infoset/</a>.</p>
    255 </div>
    256 <div class="inline-citation" id="cite-Saxon" style="display:none;width: 240px">
    257 <a class="quiet" href="javascript:hidecite('cite-Saxon')" style="font-size:90%"><img src="eks.png" alt="[x]" style="float:right;clear:both;margin:1px"></a><p style="margin:0ex">SAXON The XSLT and XQuery Processor. On the Web
    258          at <a href="http://saxon.sourceforge.net/" class="link" target="_new">http://saxon.sourceforge.net/</a>.</p>
    259 </div>
    260 <div class="inline-citation" id="cite-Kay08" style="display:none;width: 240px">
    261 <a class="quiet" href="javascript:hidecite('cite-Kay08')" style="font-size:90%"><img src="eks.png" alt="[x]" style="float:right;clear:both;margin:1px"></a><p style="margin:0ex"> Kay, Michael Y. 2008. Ten Reasons Why Saxon
    262          XQuery is Fast, IEEE Data Engineering Bulletin, December 2008.</p>
    263 </div>
    264 <div class="inline-citation" id="cite-AElfred" style="display:none;width: 240px">
    265 <a class="quiet" href="javascript:hidecite('cite-AElfred')" style="font-size:90%"><img src="eks.png" alt="[x]" style="float:right;clear:both;margin:1px"></a><p style="margin:0ex"> The Ælfred XML Parser. On the Web at
    266             <a href="http://saxon.sourceforge.net/aelfred.html" class="link" target="_new">http://saxon.sourceforge.net/aelfred.html</a>.</p>
    267 </div>
    268 <div class="inline-citation" id="cite-JNI" style="display:none;width: 240px">
    269 <a class="quiet" href="javascript:hidecite('cite-JNI')" style="font-size:90%"><img src="eks.png" alt="[x]" style="float:right;clear:both;margin:1px"></a><p style="margin:0ex">Hitchens, Ron. Java NIO. O'Reilly, 2002.</p>
    270 </div>
    271 <div class="inline-citation" id="cite-Expat" style="display:none;width: 240px">
    272 <a class="quiet" href="javascript:hidecite('cite-Expat')" style="font-size:90%"><img src="eks.png" alt="[x]" style="float:right;clear:both;margin:1px"></a><p style="margin:0ex">The Expat XML Parser.
    273             <a href="http://expat.sourceforge.net/" class="link" target="_new">http://expat.sourceforge.net/</a>.</p>
    274 </div>
    275 <div id="mast"><div class="content">
    276 <h2 class="article-title" id="idp543912"></h2>
     12<div lang="en" class="article">
     13<div class="titlepage">
     14<h2 class="article-title" id="idp75552"></h2>
    27715<div class="author">
    27816<h3 class="author">Nigel Medforth</h3>
     
    32361<h5 class="author-email"><code class="email">&lt;<a class="email" href="mailto:"></a>&gt;</code></h5>
    32462</div>
    325 <div class="mast-box">
    326 <p class="title"><a href="javascript:toggle('idp544272')" class="quiet"><img class="toc-icon" src="plus.png" alt="expand" id="icon-idp544272"></a> <span onclick="javascript:toggle('idp544272');return true">Abstract</span></p>
    327 <div class="folder" id="folder-idp544272" style="display:none"><p id="idp544728">Prior research on the acceleration of XML processing
    328 using SIMD and multi-core parallelism has lead to
    329 a number of interesting research prototypes.  This work
    330 investigates the extent to which the techniques underlying
    331 these prototypes could result in systematic performance
    332 benefits when fully integrated into a commercial XML parser.
    333 The widely used Xerces-C++ parser of the Apache Software
    334 Foundation was chosen as the foundation for the study.
    335 A systematic restructuring of the parser was undertaken,
    336 while maintaining the existing API for application programmers.
    337 Using SIMD techniques alone, an increase in parsing speed
    338 of at least 50% was observed in a range of applications.
    339 When coupled with pipeline parallelism on dual core processors,
    340 improvements of 2x and beyond were realized.
    341 </p></div>
     63<div class="abstract">
     64<p class="title"><b>Abstract</b></p>
     65<p id="idp76976">Prior research on the acceleration of XML processing using SIMD and multi-core
     66            parallelism has lead to a number of interesting research prototypes. This work
     67            investigates the extent to which the techniques underlying these prototypes could result
     68            in systematic performance benefits when fully integrated into a commercial XML parser.
     69            The widely used Xerces-C++ parser of the Apache Software Foundation was chosen as the
     70            foundation for the study. A systematic restructuring of the parser was undertaken, while
     71            maintaining the existing API for application programmers. Using SIMD techniques alone,
     72            an increase in parsing speed of at least 50% was observed in a range of applications.
     73            When coupled with pipeline parallelism on dual core processors, improvements of 2x and
     74            beyond were realized. </p>
     75</div>
     76<hr>
    34277</div>
    34378<div class="toc">
    34479<p><b>Table of Contents</b></p>
    34580<dl>
    346 <dt><span class="section"><a href="#idp531824" class="toc">Introduction</a></span></dt>
    347 <dt><span class="section"><a href="#idp532720" class="toc">Background</a></span></dt>
     81<dt><span class="section"><a href="#idp285120" class="toc">Introduction</a></span></dt>
     82<dt><span class="section"><a href="#idp286912" class="toc">Background</a></span></dt>
    34883<dd><dl>
    349 <dt><span class="section"><a href="#idp533040" class="toc">Xerces C++ Structure</a></span></dt>
    350 <dt><span class="section"><a href="#idp539296" class="toc">The Parabix Framework</a></span></dt>
    351 <dt><span class="section"><a href="#idp690352" class="toc">Sequential vs. Parallel Paradigm</a></span></dt>
     84<dt><span class="section"><a href="#idp287552" class="toc">Xerces C++ Structure</a></span></dt>
     85<dt><span class="section"><a href="#idp331216" class="toc">The Parabix Framework</a></span></dt>
     86<dt><span class="section"><a href="#idp40512" class="toc">Sequential vs. Parallel Paradigm</a></span></dt>
    35287</dl></dd>
    353 <dt><span class="section"><a href="#idp693304" class="toc">Architecture</a></span></dt>
     88<dt><span class="section"><a href="#idp44272" class="toc">Architecture</a></span></dt>
    35489<dd><dl>
    355 <dt><span class="section"><a href="#idp693624" class="toc">Overview</a></span></dt>
    356 <dt><span class="section"><a href="#idp706920" class="toc">Character Set Adapters</a></span></dt>
    357 <dt><span class="section"><a href="#idp509944" class="toc">Combined Parallel Filtering</a></span></dt>
    358 <dt><span class="section"><a href="#idp733168" class="toc">Content Stream</a></span></dt>
    359 <dt><span class="section"><a href="#idp740320" class="toc">Namespace Handling</a></span></dt>
    360 <dt><span class="section"><a href="#idp751920" class="toc">Error Handling</a></span></dt>
     90<dt><span class="section"><a href="#idp366784" class="toc">Overview</a></span></dt>
     91<dt><span class="section"><a href="#idp390240" class="toc">Character Set Adapters</a></span></dt>
     92<dt><span class="section"><a href="#idp398192" class="toc">Combined Parallel Filtering</a></span></dt>
     93<dt><span class="section"><a href="#idp416848" class="toc">Content Stream</a></span></dt>
     94<dt><span class="section"><a href="#idp427456" class="toc">Namespace Handling</a></span></dt>
     95<dt><span class="section"><a href="#idp447600" class="toc">Error Handling</a></span></dt>
    36196</dl></dd>
    362 <dt><span class="section"><a href="#idp758960" class="toc">Multithreading with Pipeline Parallelism</a></span></dt>
    363 <dt><span class="section"><a href="#idp769752" class="toc">Performance</a></span></dt>
     97<dt><span class="section"><a href="#idp458640" class="toc">Multithreading with Pipeline Parallelism</a></span></dt>
     98<dt><span class="section"><a href="#idp475920" class="toc">Performance</a></span></dt>
    36499<dd><dl>
    365 <dt><span class="section"><a href="#idp771632" class="toc">Xerces C++ SAXCount</a></span></dt>
    366 <dt><span class="section"><a href="#idp776016" class="toc">GML2SVG</a></span></dt>
     100<dt><span class="section"><a href="#idp478640" class="toc">Xerces C++ SAXCount</a></span></dt>
     101<dt><span class="section"><a href="#idp499872" class="toc">GML2SVG</a></span></dt>
    367102</dl></dd>
    368 <dt><span class="section"><a href="#idp776592" class="toc">Conclusion and Future Work</a></span></dt>
     103<dt><span class="section"><a href="#idp501024" class="toc">Conclusion and Future Work</a></span></dt>
    369104</dl>
    370105</div>
    371 <div class="mast-box">
    372 <p class="title"><a href="javascript:toggle('idp545808')" class="linkbox"><img class="toc-icon" src="plus.png" alt="expand" id="icon-idp545808"></a> <span onclick="javascript:toggle('idp545808');return true">Nigel Medforth</span></p>
    373 <div class="folder" id="folder-idp545808" style="display:none">
    374 <h5 class="author-email"><code class="email">&lt;<a class="email" href="mailto:nmedfort@sfu.ca">nmedfort@sfu.ca</a>&gt;</code></h5>
    375 <div class="affiliation">
    376 <p class="jobtitle">Developer</p>
    377 <p class="orgname">International Characters Inc.</p>
    378 </div>
    379 <div class="affiliation">
    380 <p class="jobtitle">Graduate Student, School of Computing Science</p>
    381 <p class="orgname">Simon Fraser University </p>
    382 </div>
    383 <div class="personblurb">
    384 <p id="idp546720">Nigel Medforth is a M.Sc. student at Simon Fraser University and the lead developer of icXML.
    385              He earned a Bachelor of Technology in Information Technology at Kwantlen Polytechnic University in 2009
    386              and was awarded the Dean’s Medal for Outstanding Achievement.</p>
    387 <p id="idp547496">Nigel is currently researching ways to leverage both the Parabix framework and stream-processing models
    388              to further accelerate XML parsing within icXML.</p>
    389 </div>
    390 </div>
    391 </div>
    392 <div class="mast-box">
    393 <p class="title"><a href="javascript:toggle('idp525576')" class="linkbox"><img class="toc-icon" src="plus.png" alt="expand" id="icon-idp525576"></a> <span onclick="javascript:toggle('idp525576');return true">Kenneth Herdy</span></p>
    394 <div class="folder" id="folder-idp525576" style="display:none">
    395 <h5 class="author-email"><code class="email">&lt;<a class="email" href="mailto:ksherdy@sfu.ca">ksherdy@sfu.ca</a>&gt;</code></h5>
    396 <div class="affiliation">
    397 <p class="jobtitle">Graduate Student, School of Computing Science</p>
    398 <p class="orgname">Simon Fraser University </p>
    399 </div>
    400 <div class="personblurb">
    401 <p id="idp526440"> Ken Herdy completed an Advanced Diploma of Technology in Geographical Information
    402                Systems at the British Columbia Institute of Technology in 2003 and earned a Bachelor
    403                of Science in Computing Science with a Certificate in Spatial Information Systems at
    404                Simon Fraser University in 2005.
    405                                                 </p>
    406 <p id="idp526976"> Ken is currently pursuing PhD studies in Computing Science at Simon Fraser
    407                University with industrial scholarship support from the Natural Sciences and
    408                Engineering Research Council of Canada, the Mathematics of Information Technology and
    409                Complex Systems NCE, and the BC Innovation Council. His research focus is an analysis
    410                of the principal techniques that may be used to improve XML processing performance in
    411                the context of the Geography Markup Language (GML).
    412                                                 </p>
    413 </div>
    414 </div>
    415 </div>
    416 <div class="mast-box">
    417 <p class="title"><a href="javascript:toggle('idp528720')" class="linkbox"><img class="toc-icon" src="plus.png" alt="expand" id="icon-idp528720"></a> <span onclick="javascript:toggle('idp528720');return true">Rob Cameron</span></p>
    418 <div class="folder" id="folder-idp528720" style="display:none">
    419 <h5 class="author-email"><code class="email">&lt;<a class="email" href="mailto:cameron@cs.sfu.ca">cameron@cs.sfu.ca</a>&gt;</code></h5>
    420 <div class="affiliation">
    421 <p class="jobtitle">Professor of Computing Science</p>
    422 <p class="orgname">Simon Fraser University</p>
    423 </div>
    424 <div class="affiliation">
    425 <p class="jobtitle">Chief Technology Officer</p>
    426 <p class="orgname">International Characters, Inc.</p>
    427 </div>
    428 <div class="personblurb"><p id="idp1352">Dr. Rob Cameron is Professor of Computing Science and Associate Dean of
    429              Applied Sciences at Simon Fraser
    430                 University.   His research interests include programming language
    431                 and software system technology, with a specific focus on high performance
    432                 text processing using SIMD and multicore parallelism.  He is the developer
    433                 of the REX XML shallow parser as well as the parallel bit stream (Parabix)
    434                 framework for SIMD text processing. 
    435               </p></div>
    436 </div>
    437 </div>
    438 </div></div>
    439 <div id="navbar"></div>
    440 <div id="balisage-header" style="background-color: #6699CC">
    441 <a class="quiet" href="http://www.balisage.net"><img style="float:right;border:none" alt="Balisage logo" height="130" src="http://balisage.net/Logo/BalisageSeries-logo.png"></a><h2 class="page-header">Balisage: The Markup Conference</h2>
    442 <h1 class="page-header">Proceedings preview</h1>
    443 </div>
    444 <div id="main">
    445 <div class="article">
    446 <h2 class="article-title" id="idp543912"></h2>
    447 <div class="section" id="idp531824">
     106<div class="section" id="idp285120">
    448107<h2 class="title" style="clear: both">Introduction</h2>
    449 <p id="idp532144"></p>
    450 <p id="idp532272"></p>
    451 <p id="idp532400"></p>
    452 <p id="idp532528"></p>
    453 </div>
    454 <div class="section" id="idp532720">
     108<p id="idp285760"></p>
     109<p id="idp286016"></p>
     110<p id="idp286272"></p>
     111<p id="idp286528"></p>
     112</div>
     113<div class="section" id="idp286912">
    455114<h2 class="title" style="clear: both">Background</h2>
    456 <div class="section" id="idp533040">
     115<div class="section" id="idp287552">
    457116<h3 class="title" style="clear: both">Xerces C++ Structure</h3>
    458 <p id="idp533360">
    459 The Xerces C++ parser
    460 
    461 
    462 
    463 
    464 features comprehensive support for a variety of character encodings
    465 both commonplace (e.g., UTF-8, UTF-16) and rarely used (e.g., EBCDIC), support for
    466 multiple XML vocabularies through the XML namespace
    467 mechanism, as well as complete implementations
    468 of structure and data validation through multiple grammars
    469 declared using either legacy DTDs (document type
    470 definitions) or modern XML Schema facilities.
    471 Xerces also supports several APIs for accessing
    472 parser services, including event-based parsing
    473 using either pull parsing or SAX/SAX2 push-style
    474 parsing as well as a DOM tree-based parsing interface.
     117<p id="idp288192"> The Xerces C++ parser
     118           
     119           
     120             features comprehensive support for a variety of character encodings both
     121            commonplace (e.g., UTF-8, UTF-16) and rarely used (e.g., EBCDIC), support for multiple
     122            XML vocabularies through the XML namespace mechanism, as well as complete
     123            implementations of structure and data validation through multiple grammars declared
     124            using either legacy DTDs (document type definitions) or modern XML Schema facilities.
     125            Xerces also supports several APIs for accessing parser services, including event-based
     126            parsing using either pull parsing or SAX/SAX2 push-style parsing as well as a DOM
     127            tree-based parsing interface. </p>
     128<p id="idp290320">
     129           
     130           
     131             Xerces,
     132            like all traditional parsers, processes XML documents sequentially a byte-at-a-time from
     133            the first to the last byte of input data. Each byte passes through several processing
     134            layers and is classified and eventually validated within the context of the document
     135            state. This introduces implicit dependencies between the various tasks within the
     136            application that make it difficult to optimize for performance. As a complex software
     137            system, no one feature dominates the overall parsing performance. Figure
     138            \ref{fig:xerces-profile} shows the execution time profile of the top ten functions in a
     139            typical run. Even if it were possible, Amdahl's Law dictates that tackling any one of
     140            these functions for parallelization in isolation would only produce a minute improvement
     141            in performance. Unfortunately, early investigation into these functions found that
     142            incorporating speculation-free thread-level parallelization was impossible and they were
     143            already performing well in their given tasks; thus only trivial enhancements were
     144            attainable. In order to obtain a systematic acceleration of Xerces, it should be
     145            expected that a comprehensive restructuring is required, involving all aspects of the
     146            parser. </p>
     147<div class="table-wrapper" id="idp293248">
     148<p class="title">Table I</p>
     149<div class="caption"><p id="idp8080">Execution Time of Top 10 Xerces Functions</p></div>
     150<table class="table">
     151<colgroup span="1">
     152<col align="left" valign="top" span="1">
     153<col align="left" valign="top" span="1">
     154</colgroup>
     155<thead><tr>
     156<th>Time (%) </th>
     157<th> Function Name </th>
     158</tr></thead>
     159<tbody>
     160<tr valign="top">
     161<td>13.29       </td>
     162<td>XMLUTF8Transcoder::transcodeFrom </td>
     163</tr>
     164<tr valign="top">
     165<td>7.45        </td>
     166<td>IGXMLScanner::scanCharData </td>
     167</tr>
     168<tr valign="top">
     169<td>6.83        </td>
     170<td>memcpy </td>
     171</tr>
     172<tr valign="top">
     173<td>5.83        </td>
     174<td>XMLReader::getNCName </td>
     175</tr>
     176<tr valign="top">
     177<td>4.67        </td>
     178<td>IGXMLScanner::buildAttList </td>
     179</tr>
     180<tr valign="top">
     181<td>4.54        </td>
     182<td>RefHashTableO&lt;&gt;::findBucketElem </td>
     183</tr>
     184<tr valign="top">
     185<td>4.20        </td>
     186<td>IGXMLScanner::scanStartTagNS </td>
     187</tr>
     188<tr valign="top">
     189<td>3.75        </td>
     190<td>ElemStack::mapPrefixToURI </td>
     191</tr>
     192<tr valign="top">
     193<td>3.58        </td>
     194<td>ReaderMgr::getNextChar </td>
     195</tr>
     196<tr valign="top">
     197<td>3.20        </td>
     198<td>IGXMLScanner::basicAttrValueScan </td>
     199</tr>
     200</tbody>
     201</table>
     202</div>
     203</div>
     204<div class="section" id="idp331216">
     205<h3 class="title" style="clear: both">The Parabix Framework</h3>
     206<p id="idp331888"> The Parabix (parallel bit stream) framework is a transformative approach to XML
     207            parsing (and other forms of text processing.) The key idea is to exploit the
     208            availability of wide SIMD registers (e.g., 128-bit) in commodity processors to represent
     209            data from long blocks of input data by using one register bit per single input byte. To
     210            facilitate this, the input data is first transposed into a set of basis bit streams. In
     211           
     212            Boolean-logic operations\footnote{∧, \√ and ¬ denote the
     213            boolean AND, OR and NOT operators.} are used to classify the input bits into a set of
     214               <span class="ital">character-class bit streams</span>, which identify key
     215            characters (or groups of characters) with a <code class="code">1</code>. For example, one of the
     216            fundamental characters in XML is a left-angle bracket. A character is an
     217               <code class="code">'&lt;' if and only if
     218               Â¬(b<sub>0</sub> √ b<sub>1</sub>)
     219               âˆ§ (b<sub>2</sub> ∧ b<sub>3</sub>)
     220               âˆ§ (b<sub>4</sub> ∧ b<sub>5</sub>)
     221               âˆ§ ¬ (b<sub>6</sub> √
     222               b<sub>7</sub>) = 1</code>. Similarly, a character is numeric, <code class="code">[0-9]
     223               if and only if ¬(b<sub>0</sub> √
     224               b<sub>1</sub>) ∧ (b<sub>2</sub> ∧
     225                  b<sub>3</sub>) ∧ ¬(b<sub>4</sub>
     226               âˆ§ (b<sub>5</sub> √
     227            b<sub>6</sub>))</code>. An important observation here is that ranges of
     228            characters may require fewer operations than individual characters and
     229             multiple
     230            classes can share the classification cost. </p>
     231<p id="idp343456">
     232           
     233         </p>
     234<p id="idp347392"> Consider, for example, the XML source data stream shown in the first line of
     235            . The remaining lines of this figure show
     236            several parallel bit streams that are computed in Parabix-style parsing, with each bit
     237            of each stream in one-to-one correspondence to the source character code units of the
     238            input stream. For clarity, 1 bits are denoted with 1 in each stream and 0 bits are
     239            represented as underscores. The first bit stream shown is that for the opening angle
     240            brackets that represent tag openers in XML. The second and third streams show a
     241            partition of the tag openers into start tag marks and end tag marks depending on the
     242            character immediately following the opener (i.e., <code class="code">"/"</code>) or
     243            not. The remaining three lines show streams that can be computed in subsequent parsing
     244            (using the technique of bitstream addition \cite{cameron-EuroPar2011}), namely streams
     245            marking the element names, attribute names and attribute values of tags. </p>
     246<p id="idp37152"> Two intuitions may help explain how the Parabix approach can lead to improved XML
     247            parsing performance. The first is that the use of the full register width offers a
     248            considerable information advantage over sequential byte-at-a-time parsing. That is,
     249            sequential processing of bytes uses just 8 bits of each register, greatly limiting the
     250            processor resources that are effectively being used at any one time. The second is that
     251            byte-at-a-time loop scanning loops are actually often just computing a single bit of
     252            information per iteration: is the scan complete yet? Rather than computing these
     253            individual decision-bits, an approach that computes many of them in parallel (e.g., 128
     254            bytes at a time using 128-bit registers) should provide substantial benefit. </p>
     255<p id="idp38400"> Previous studies have shown that the Parabix approach improves many aspects of XML
     256            processing, including transcoding \cite{Cameron2008}, character classification and
     257            validation, tag parsing and well-formedness checking. The first Parabix parser used
     258            processor bit scan instructions to considerably accelerate sequential scanning loops for
     259            individual characters \cite{CameronHerdyLin2008}. Recent work has incorporated a method
     260            of parallel scanning using bitstream addition \cite{cameron-EuroPar2011}, as well as
     261            combining SIMD methods with 4-stage pipeline parallelism to further improve throughput
     262            \cite{HPCA2012}. Although these research prototypes handled the full syntax of
     263            schema-less XML documents, they lacked the functionality required by full XML parsers. </p>
     264<p id="idp39664"> Commercial XML processors support transcoding of multiple character sets and can
     265            parse and validate against multiple document vocabularies. Additionally, they provide
     266            API facilities beyond those found in research prototypes, including the widely used SAX,
     267            SAX2 and DOM interfaces. </p>
     268</div>
     269<div class="section" id="idp40512">
     270<h3 class="title" style="clear: both">Sequential vs. Parallel Paradigm</h3>
     271<p id="idp41152"> Xerces—like all traditional XML parsers—processes XML documents
     272            sequentially. Each character is examined to distinguish between the XML-specific markup,
     273            such as a left angle bracket <code class="code">&lt;</code>, and the content held within the
     274            document. As the parser progresses through the document, it alternates between markup
     275            scanning, validation and content processing modes. </p>
     276<p id="idp42720"> In other words, Xerces belongs to an equivalent class applications termed FSM
     277            applications\footnote{ Herein FSM applications are considered software systems whose
     278            behaviour is defined by the inputs, current state and the events associated with
     279            transitions of states.}. Each state transition indicates the processing context of
     280            subsequent characters. Unfortunately, textual data tends to be unpredictable and any
     281            character could induce a state transition. </p>
     282<p id="idp43632"> Parabix-style XML parsers utilize a concept of layered processing. A block of source
     283            text is transformed into a set of lexical bitstreams, which undergo a series of
     284            operations that can be grouped into logical layers, e.g., transposition, character
     285            classification, and lexical analysis. Each layer is pipeline parallel and require
     286            neither speculation nor pre-parsing stages\cite{HPCA2012}. To meet the API requirements
     287            of the document-ordered Xerces output, the results of the Parabix processing layers must
     288            be interleaved to produce the equivalent behaviour. </p>
     289</div>
     290</div>
     291<div class="section" id="idp44272">
     292<h2 class="title" style="clear: both">Architecture</h2>
     293<div class="section" id="idp366784">
     294<h3 class="title" style="clear: both">Overview</h3>
     295<p id="idp367680"> icXML is more than an optimized version of Xerces. Many components were grouped,
     296            restructured and rearchitected with pipeline parallelism in mind. In this section, we
     297            highlight the core differences between the two systems. As shown in Figure
     298            \ref{fig:xerces-arch}, Xerces is comprised of five main modules: the transcoder, reader,
     299            scanner, namespace binder, and validator. The <span class="ital">Transcoder</span> converts source data into UTF-16 before Xerces parses it as XML;
     300            the majority of the character set encoding validation is performed as a byproduct of
     301            this process. The <span class="ital">Reader</span> is responsible for the
     302            streaming and buffering of all raw and transcoded (UTF-16) text. It tracks the current
     303            line/column position,
     304           
     305            performs line-break normalization and validates context-specific character set issues,
     306            such as tokenization of qualified-names. The <span class="ital">Scanner</span>
     307            pulls data through the reader and constructs the intermediate representation (IR) of the
     308            document; it deals with all issues related to entity expansion, validates the XML
     309            well-formedness constraints and any character set encoding issues that cannot be
     310            completely handled by the reader or transcoder (e.g., surrogate characters, validation
     311            and normalization of character references, etc.) The <span class="ital">Namespace
     312               Binder</span> is a core piece of the element stack. It handles namespace scoping
     313            issues between different XML vocabularies. This allows the scanner to properly select
     314            the correct schema grammar structures. The <span class="ital">Validator</span>
     315            takes the IR produced by the Scanner (and potentially annotated by the Namespace Binder)
     316            and assesses whether the final output matches the user-defined DTD and schema grammar(s)
     317            before passing it to the end-user. </p>
     318<p id="idp373536">
     319           
     320         </p>
     321<p id="idp374400"> In icXML functions are grouped into logical components. As shown in Figure
     322            \ref{fig:icxml-arch}, two major categories exist: (1) the Parabix Subsystem and (2) the
     323            Markup Processor. All tasks in (1) use the Parabix Framework \cite{HPCA2012}, which
     324            represents data as a set of parallel bitstreams. The <span class="ital">Character Set
     325               Adapter</span>, discussed in Section \ref{arch:character-set-adapter}, mirrors
     326            Xerces's Transcoder duties; however instead of producing UTF-16 it produces a set of
     327            lexical bitstreams, similar to those shown in Figure \ref{fig:parabix1}. These lexical
     328            bitstreams are later transformed into UTF-16 in the Content Stream Generator, after
     329            additional processing is performed. The first precursor to producing UTF-16 is the
     330               <span class="ital">Parallel Markup Parser</span> phase. It takes the lexical
     331            streams and produces a set of marker bitstreams in which a 1-bit identifies significant
     332            positions within the input data. One bitstream for each of the critical piece of
     333            information is created, such as the beginning and ending of start tags, end tags,
     334            element names, attribute names, attribute values and content. Intra-element
     335            well-formedness validation is performed as an artifact of this process. Like Xerces,
     336            icXML must provide the Line and Column position of each error. The <span class="ital">Line-Column Tracker</span> uses the lexical information to keep track of the
     337            document position(s) through the use of an optimized population count algorithm,
     338            described in Section \ref{section:arch:errorhandling}. From here, two data-independent
     339            branches exist: the Symbol Resolver and Content Preparation Unit. </p>
     340<p id="idp379008"> A typical XML file contains few unique element and attribute names—but
     341            each of them will occur frequently. icXML stores these as distinct data structures,
     342            called symbols, each with their own global identifier (GID). Using the symbol marker
     343            streams produced by the Parallel Markup Parser, the <span class="ital">Symbol
     344               Resolver</span> scans through the raw data to produce a sequence of GIDs, called
     345            the <span class="ital">symbol stream</span>. </p>
     346<p id="idp381504"> The final components of the Parabix Subsystem are the <span class="ital">Content
     347               Preparation Unit</span> and <span class="ital">Content Stream
     348            Generator</span>. The former takes the (transposed) basis bitstreams and selectively
     349            filters them, according to the information provided by the Parallel Markup Parser, and
     350            the latter transforms the filtered streams into the tagged UTF-16 <span class="ital">content stream</span>, discussed in Section \ref{section:arch:contentstream}. </p>
     351<p id="idp384416"> Combined, the symbol and content stream form icXML's compressed IR of the XML
     352            document. The <span class="ital">Markup Processor</span>~parses the IR to
     353            validate and produce the sequential output for the end user. The <span class="ital">Final WF checker</span> performs inter-element well-formedness validation that
     354            would be too costly to perform in bit space, such as ensuring every start tag has a
     355            matching end tag. Xerces's namespace binding functionality is replaced by the <span class="ital">Namespace Processor</span>. Unlike Xerces, it is a discrete phase
     356            that produces a series of URI identifiers (URI IDs), the <span class="ital">URI
     357               stream</span>, which are associated with each symbol occurrence. This is
     358            discussed in Section \ref{section:arch:namespacehandling}. Finally, the <span class="ital">Validation</span> layer implements the Xerces's validator. However,
     359            preprocessing associated with each symbol greatly reduces the work of this stage. </p>
     360<p id="idp389200">
     361           
     362         </p>
     363</div>
     364<div class="section" id="idp390240">
     365<h3 class="title" style="clear: both">Character Set Adapters</h3>
     366<p id="idp390880"> In Xerces, all input is transcoded into UTF-16 to simplify the parsing costs of
     367            Xerces itself and provide the end-consumer with a single encoding format. In the
     368            important case of UTF-8 to UTF-16 transcoding, the transcoding costs can be significant,
     369            because of the need to decode and classify each byte of input, mapping variable-length
     370            UTF-8 byte sequences into 16-bit UTF-16 code units with bit manipulation operations. In
     371            other cases, transcoding may involve table look-up operations for each byte of input. In
     372            any case, transcoding imposes at least a cost of buffer copying. </p>
     373<p id="idp391936"> In icXML, however, the concept of Character Set Adapters (CSAs) is used to minimize
     374            transcoding costs. Given a specified input encoding, a CSA is responsible for checking
     375            that input code units represent valid characters, mapping the characters of the encoding
     376            into the appropriate bitstreams for XML parsing actions (i.e., producing the lexical
     377            item streams), as well as supporting ultimate transcoding requirements. All of this work
     378            is performed using the parallel bitstream representation of the source input. </p>
     379<p id="idp392912"> An important observation is that many character sets are an extension to the legacy
     380            7-bit ASCII character set. This includes the various ISO Latin character sets, UTF-8,
     381            UTF-16 and many others. Furthermore, all significant characters for parsing XML are
     382            confined to the ASCII repertoire. Thus, a single common set of lexical item calculations
     383            serves to compute lexical item streams for all such ASCII-based character sets. </p>
     384<p id="idp393792"> A second observation is that—regardless of which character set is
     385            used—quite often all of the characters in a particular block of input will be
     386            within the ASCII range. This is a very simple test to perform using the bitstream
     387            representation, simply confirming that the bit 0 stream is zero for the entire block.
     388            For blocks satisfying this test, all logic dealing with non-ASCII characters can simply
     389            be skipped. Transcoding to UTF-16 becomes trivial as the high eight bitstreams of the
     390            UTF-16 form are each set to zero in this case. </p>
     391<p id="idp395712"> A third observation is that repeated transcoding of the names of XML elements,
     392            attributes and so on can be avoided by using a look-up mechanism. That is, the first
     393            occurrence of each symbol is stored in a look-up table mapping the input encoding to a
     394            numeric symbol ID. Transcoding of the symbol is applied at this time. Subsequent look-up
     395            operations can avoid transcoding by simply retrieving the stored representation. As
     396            symbol look up is required to apply various XML validation rules, there is achieves the
     397            effect of transcoding each occurrence without additional cost. </p>
     398<p id="idp396768"> The cost of individual character transcoding is avoided whenever a block of input is
     399            confined to the ASCII subset and for all but the first occurrence of any XML element or
     400            attribute name. Furthermore, when transcoding is required, the parallel bitstream
     401            representation supports efficient transcoding operations. In the important case of UTF-8
     402            to UTF-16 transcoding, the corresponding UTF-16 bitstreams can be calculated in bit
     403            parallel fashion based on UTF-8 streams \cite{Cameron2008}, and all but the final bytes
     404            of multi-byte sequences can be marked for deletion as discussed in the following
     405            subsection. In other cases, transcoding within a block only need be applied for
     406            non-ASCII bytes, which are conveniently identified by iterating through the bit 0 stream
     407            using bit scan operations. </p>
     408</div>
     409<div class="section" id="idp398192">
     410<h3 class="title" style="clear: both">Combined Parallel Filtering</h3>
     411<p id="idp398880"> As just mentioned, UTF-8 to UTF-16 transcoding involves marking all but the last
     412            bytes of multi-byte UTF-8 sequences as positions for deletion. For example, the two
     413            Chinese characters <code class="code">䜠奜</code> are represented as two
     414            three-byte UTF-8 sequences <code class="code">E4 BD A0</code> and <code class="code">E5 A5 BD</code> while the
     415            UTF-16 representation must be compressed down to the two code units <code class="code">4F60</code>
     416            and <code class="code">597D</code>. In the bit parallel representation, this corresponds to a
     417            reduction from six bit positions representing UTF-8 code units (bytes) down to just two
     418            bit positions representing UTF-16 code units (double bytes). This compression may be
     419            achieved by arranging to calculate the correct UTF-16 bits at the final position of each
     420            sequence and creating a deletion mask to mark the first two bytes of each 3-byte
     421            sequence for deletion. In this case, the portion of the mask corresponding to these
     422            input bytes is the bit sequence <code class="code">110110</code>. Using this approach, transcoding
     423            may then be completed by applying parallel deletion and inverse transposition of the
     424            UTF-16 bitstreams\cite{Cameron2008}. </p>
     425<p id="idp403040">
     426           
     427           
     428           
     429           
     430           
     431           
     432           
     433           
     434           
     435         </p>
     436<p id="idp406976"> Rather than immediately paying the costs of deletion and transposition just for
     437            transcoding, however, icXML defers these steps so that the deletion masks for several
     438            stages of processing may be combined. In particular, this includes core XML requirements
     439            to normalize line breaks and to replace character reference and entity references by
     440            their corresponding text. In the case of line break normalization, all forms of line
     441            breaks, including bare carriage returns (CR), line feeds (LF) and CR-LF combinations
     442            must be normalized to a single LF character in each case. In icXML, this is achieved by
     443            first marking CR positions, performing two bit parallel operations to transform the
     444            marked CRs into LFs, and then marking for deletion any LF that is found immediately
     445            after the marked CR as shown by the Pablo source code in Figure
     446            \ref{fig:LBnormalization}.
     447           
     448         </p>
     449<p id="idp409616"> In essence, the deletion masks for transcoding and for line break normalization each
     450            represent a bitwise filter; these filters can be combined using bitwise-or so that the
     451            parallel deletion algorithm need only be applied once. </p>
     452<p id="idp410800"> A further application of combined filtering is the processing of XML character and
     453            entity references. Consider, for example, the references <code class="code">&amp;</code> or
     454               <code class="code">&lt;</code>. which must be replaced in XML processing with the single
     455               <code class="code">&amp;</code> and <code class="code">&lt;</code> characters, respectively. The
     456            approach in icXML is to mark all but the first character positions of each reference for
     457            deletion, leaving a single character position unmodified. Thus, for the references
     458               <code class="code">&amp;</code> or <code class="code">&lt;</code> the masks <code class="code">01111</code> and
     459               <code class="code">011111</code> are formed and combined into the overall deletion mask. After the
     460            deletion and inverse transposition operations are finally applied, a post-processing
     461            step inserts the proper character at these positions. One note about this process is
     462            that it is speculative; references are assumed to generally be replaced by a single
     463            UTF-16 code unit. In the case, that this is not true, it is addressed in
     464            post-processing. </p>
     465<p id="idp415520"> The final step of combined filtering occurs during the process of reducing markup
     466            data to tag bytes preceding each significant XML transition as described in
     467            section~\ref{section:arch:contentstream}. Overall, icXML avoids separate buffer copying
     468            operations for each of the these filtering steps, paying the cost of parallel deletion
     469            and inverse transposition only once. Currently, icXML employs the parallel-prefix
     470            compress algorithm of Steele~\cite{HackersDelight} Performance is independent of the
     471            number of positions deleted. Future versions of icXML are expected to take advantage of
     472            the parallel extract operation~\cite{HilewitzLee2006} that Intel is now providing in its
     473            Haswell architecture. </p>
     474</div>
     475<div class="section" id="idp416848">
     476<h3 class="title" style="clear: both">Content Stream</h3>
     477<p id="idp417520"> A relatively-unique concept for icXML is the use of a filtered content stream.
     478            Rather that parsing an XML document in its original format, the input is transformed
     479            into one that is easier for the parser to iterate through and produce the sequential
     480            output. In , the source data
     481           
     482            is transformed into
     483           
     484            through the parallel filtering algorithm, described in section \ref{sec:parfilter}. </p>
     485<p id="idp419920"> Combined with the symbol stream, the parser traverses the content stream to
     486            effectively reconstructs the input document in its output form. The initial <span class="ital">0</span> indicates an empty content string. The following
     487               <code class="code">&gt;</code> indicates that a start tag without any attributes is the first
     488            element in this text and the first unused symbol, <code class="code">document</code>, is the element
     489            name. Succeeding that is the content string <code class="code">fee</code>, which is null-terminated
     490            in accordance with the Xerces API specification. Unlike Xerces, no memory-copy
     491            operations are required to produce these strings, which as
     492            Figure~\ref{fig:xerces-profile} shows accounts for 6.83% of Xerces's execution time.
     493            Additionally, it is cheap to locate the terminal character of each string: using the
     494            String End bitstream, the Parabix Subsystem can effectively calculate the offset of each
     495            null character in the content stream in parallel, which in turn means the parser can
     496            directly jump to the end of every string without scanning for it. </p>
     497<p id="idp423312"> Following <code class="code">'fee'</code> is a <code class="code">=</code>, which marks the
     498            existence of an attribute. Because all of the intra-element was performed in the Parabix
     499            Subsystem, this must be a legal attribute. Since attributes can only occur within start
     500            tags and must be accompanied by a textual value, the next symbol in the symbol stream
     501            must be the element name of a start tag, and the following one must be the name of the
     502            attribute and the string that follows the <code class="code">=</code> must be its value. However, the
     503            subsequent <code class="code">=</code> is not treated as an independent attribute because the parser
     504            has yet to read a <code class="code">&gt;</code>, which marks the end of a start tag. Thus only
     505            one symbol is taken from the symbol stream and it (along with the string value) is added
     506            to the element. Eventually the parser reaches a <code class="code">/</code>, which marks the
     507            existence of an end tag. Every end tag requires an element name, which means they
     508            require a symbol. Inter-element validation whenever an empty tag is detected to ensure
     509            that the appropriate scope-nesting rules have been applied. </p>
     510</div>
     511<div class="section" id="idp427456">
     512<h3 class="title" style="clear: both">Namespace Handling</h3>
     513<p id="idp428544"> In XML, namespaces prevents naming conflicts when multiple vocabularies are used
     514            together. It is especially important when a vocabulary application-dependant meaning,
     515            such as when XML or SVG documents are embedded within XHTML files. Namespaces are bound
     516            to uniform resource identifiers (URIs), which are strings used to identify specific
     517            names or resources. On line 1 of Figure \ref{fig:namespace1}, the <code class="code">xmlns</code>
     518            attribute instructs the XML processor to bind the prefix <code class="code">p</code> to the URI
     519               '<code class="code">pub.net</code>' and the default (empty) prefix to
     520               <code class="code">book.org</code>. Thus to the XML processor, the <code class="code">title</code> on line 2
     521            and <code class="code">price</code> on line 4 both read as
     522            <code class="code">"book.org":title</code> and
     523               <code class="code">"book.org":price</code> respectively, whereas on line 3 and
     524            5, <code class="code">p:name</code> and <code class="code">price</code> are seen as
     525               <code class="code">"pub.net":name</code> and
     526               <code class="code">"pub.net":price</code>. Even though the actual element name
     527               <code class="code">price</code>, due to namespace scoping rules they are viewed as two
     528            uniquely-named items because the current vocabulary is determined by the namespace(s)
     529            that are in-scope. </p>
     530<p id="idp435712">
     531           
     532         </p>
     533<p id="idp436256"> In both Xerces and icXML, every URI has a one-to-one mapping to a URI ID. These
     534            persist for the lifetime of the application through the use of a global URI pool. Xerces
     535            maintains a stack of namespace scopes that is pushed (popped) every time a start tag
     536            (end tag) occurs in the document. Because a namespace declaration affects the entire
     537            element, it must be processed prior to grammar validation. This is a costly process
     538            considering that a typical namespaced XML document only comes in one of two forms: (1)
     539            those that declare a set of namespaces upfront and never change them, and (2) those that
     540            repeatedly modify the namespaces in predictable patterns. </p>
     541<p id="idp438720"> For that reason, icXML contains an independent namespace stack and utilizes bit
     542            vectors to cheaply perform
     543             When a prefix is
     544            declared (e.g., <code class="code">xmlns:p="pub.net"</code>), a namespace binding
     545            is created that maps the prefix (which are assigned Prefix IDs in the symbol resolution
     546            process) to the URI. Each unique namespace binding has a unique namespace id (NSID) and
     547            every prefix contains a bit vector marking every NSID that has ever been associated with
     548            it within the document. For example, in Table \ref{tbl:namespace1}, the prefix binding
     549            set of <code class="code">p</code> and <code class="code">xmlns</code> would be <code class="code">01</code> and
     550            <code class="code">11</code> respectively. To resolve the in-scope namespace binding for each prefix,
     551            a bit vector of the currently visible namespaces is maintained by the system. By ANDing
     552            the prefix bit vector with the currently visible namespaces, the in-scope NSID can be
     553            found using a bit-scan intrinsic. A namespace binding table, similar to Table
     554            \ref{tbl:namespace1}, provides the actual URI ID. </p>
     555<p id="idp443152">
     556           
     557         </p>
     558<p id="idp443696">
     559           
     560           
     561           
     562           
     563         </p>
     564<p id="idp446144"> To ensure that scoping rules are adhered to, whenever a start tag is encountered,
     565            any modification to the currently visible namespaces is calculated and stored within a
     566            stack of bit vectors denoting the locally modified namespace bindings. When an end tag
     567            is found, the currently visible namespaces is XORed with the vector at the top of the
     568            stack. This allows any number of changes to be performed at each scope-level with a
     569            constant time.
     570           
     571         </p>
     572</div>
     573<div class="section" id="idp447600">
     574<h3 class="title" style="clear: both">Error Handling</h3>
     575<p id="idp448272">
     576           
     577            Xerces outputs error messages in two ways: through the programmer API and as thrown
     578            objects for fatal errors. As Xerces parses a file, it uses context-dependant logic to
     579            assess whether the next character is legal; if not, the current state determines the
     580            type and severity of the error. icXML emits errors in the similar manner—but
     581            how it discovers them is substantially different. Recall that in Figure
     582            \ref{fig:icxml-arch}, icXML is divided into two sections: the Parabix Subsystem and
     583            Markup Processor, each with its own system for detecting and producing error messages. </p>
     584<p id="idp449904"> Within the Parabix Subsystem, all computations are performed in parallel, a block at
     585            a time. Errors are derived as artifacts of bitstream calculations, with a 1-bit marking
     586            the byte-position of an error within a block, and the type of error is determined by the
     587            equation that discovered it. The difficulty of error processing in this section is that
     588            in Xerces the line and column number must be given with every error production. Two
     589            major issues exist because of this: (1) line position adheres to XML white-normalization
     590            rules; as such, some sequences of characters, e.g., a carriage return followed by a line
     591            feed, are counted as a single new line character. (2) column position is counted in
     592            characters, not bytes or code units; thus multi-code-unit code-points and surrogate
     593            character pairs are all counted as a single column position. Note that typical XML
     594            documents are error-free but the calculation of the line/column position is a constant
     595            overhead in Xerces.  To
     596            reduce this, icXML pushes the bulk cost of the line/column calculation to the occurrence
     597            of the error and performs the minimal amount of book-keeping necessary to facilitate it.
     598            icXML leverages the byproducts of the Character Set Adapter (CSA) module and amalgamates
     599            the information within the Line Column Tracker (LCT). One of the CSA's major
     600            responsibilities is transcoding an input text.
     601             During this process,
     602            white-space normalization rules are applied and multi-code-unit and surrogate characters
     603            are detected and validated. A <span class="ital">line-feed bitstream</span>,
     604            which marks the positions of the normalized new lines characters, is a natural
     605            derivative of this process. Using an optimized population count algorithm, the line
     606            count can be summarized cheaply for each valid block of text.
     607             Column position is more
     608            difficult to calculate. It is possible to scan backwards through the bitstream of new
     609            line characters to determine the distance (in code-units) between the position between
     610            which an error was detected and the last line feed. However, this distance may exceed
     611            than the actual character position for the reasons discussed in (2). To handle this, the
     612            CSA generates a <span class="ital">skip mask</span> bitstream by ORing together
     613            many relevant bitstreams, such as all trailing multi-code-unit and surrogate characters,
     614            and any characters that were removed during the normalization process. When an error is
     615            detected, the sum of those skipped positions is subtracted from the distance to
     616            determine the actual column number. </p>
     617<p id="idp455392"> The Markup Processor is a state-driven machine. As such, error detection within it
     618            is very similar to Xerces. However, reporting the correct line/column is a much more
     619            difficult problem. The Markup Processor parses the content stream, which is a series of
     620            tagged UTF-16 strings. Each string is normalized in accordance with the XML
     621            specification. All symbol data and unnecessary whitespace is eliminated from the stream;
     622            thus its impossible to derive the current location using only the content stream. To
     623            calculate the location, the Markup Processor borrows three additional pieces of
     624            information from the Parabix Subsystem: the line-feed, skip mask, and a <span class="ital">deletion mask stream</span>, which is a bitstream denoting the
     625            (code-unit) position of every datum that was suppressed from the source during the
     626            production of the content stream. Armed with these, it is possible to calculate the
     627            actual line/column using the same system as the Parabix Subsystem until the sum of the
     628            negated deletion mask stream is equal to the current position. </p>
     629</div>
     630</div>
     631<div class="section" id="idp458640">
     632<h2 class="title" style="clear: both">Multithreading with Pipeline Parallelism</h2>
     633<p id="idp459280"> As discussed in section \ref{background:xerces}, Xerces can be considered a FSM
     634         application. These are "embarrassingly
     635         sequential."\cite{Asanovic:EECS-2006-183} and notoriously difficult to
     636         parallelize. However, icXML is designed to organize processing into logical layers. In
     637         particular, layers within the Parabix Subsystem are designed to operate over significant
     638         segments of input data before passing their outputs on for subsequent processing. This fits
     639         well into the general model of pipeline parallelism, in which each thread is in charge of a
     640         single module or group of modules. </p>
     641<p id="idp461136"> The most straightforward division of work in icXML is to separate the Parabix Subsystem
     642         and the Markup Processor into distinct logical layers into two separate stages. The
     643         resultant application, <span class="ital">icXML-p</span>, is a course-grained
     644         software-pipeline application. In this case, the Parabix Subsystem thread
     645               <code class="code">T<sub>1</sub></code> reads 16k of XML input <code class="code">I</code> at a
     646         time and produces the content, symbol and URI streams, then stores them in a pre-allocated
     647         shared data structure <code class="code">S</code>. The Markup Processor thread
     648            <code class="code">T<sub>2</sub></code> consumes <code class="code">S</code>, performs well-formedness
     649         and grammar-based validation, and the provides parsed XML data to the application through
     650         the Xerces API. The shared data structure is implemented using a ring buffer, where every
     651         entry contains an independent set of data streams. In the examples of Figure
     652         \ref{threads_timeline1} and \ref{threads_timeline2}, the ring buffer has four entries. A
     653         lock-free mechanism is applied to ensure that each entry can only be read or written by one
     654         thread at the same time. In Figure \ref{threads_timeline1} the processing time of
     655               <code class="code">T<sub>1</sub></code> is longer than
     656         <code class="code">T<sub>2</sub></code>; thus <code class="code">T<sub>2</sub></code> always
     657         waits for <code class="code">T<sub>1</sub></code> to write to the shared memory. Figure
     658         \ref{threads_timeline2} illustrates the scenario in which
     659         <code class="code">T<sub>1</sub></code> is faster and must wait for
     660            <code class="code">T<sub>2</sub></code> to finish reading the shared data before it can
     661         reuse the memory space. </p>
     662<p id="idp470208">
     663         
     664      </p>
     665<p id="idp470752"> Overall, our design is intended to benefit a range of applications. Conceptually, we
     666         consider two design points. The first, the parsing performed by the Parabix Subsystem
     667         dominates at 67% of the overall cost, with the cost of application processing (including
     668         the driver logic within the Markup Processor) at 33%. The second is almost the opposite
     669         scenario, the cost of application processing dominates at 60%, while the cost of XML
     670         parsing represents an overhead of 40%. </p>
     671<p id="idp472672"> Our design is predicated on a goal of using the Parabix framework to achieve a 50% to
     672         100% improvement in the parsing engine itself. In a best case scenario, a 100% improvement
     673         of the Parabix Subsystem for the design point in which XML parsing dominates at 67% of the
     674         total application cost. In this case, the single-threaded icXML should achieve a 1.5x
     675         speedup over Xerces so that the total application cost reduces to 67% of the original.
     676         However, in icXML-p, our ideal scenario gives us two well-balanced threads each performing
     677         about 33% of the original work. In this case, Amdahl's law predicts that we could expect up
     678         to a 3x speedup at best. </p>
     679<p id="idp473792"> At the other extreme of our design range, we consider an application in which core
     680         parsing cost is 40%. Assuming the 2x speedup of the Parabix Subsystem over the
     681         corresponding Xerces core, single-threaded icXML delivers a 25% speedup. However, the most
     682         significant aspect of our two-stage multi-threaded design then becomes the ability to hide
     683         the entire latency of parsing within the serial time required by the application. In this
     684         case, we achieve an overall speedup in processing time by 1.67x. </p>
     685<p id="idp474736"> Although the structure of the Parabix Subsystem allows division of the work into
     686         several pipeline stages and has been demonstrated to be effective for four pipeline stages
     687         in a research prototype \cite{HPCA2012}, our analysis here suggests that the further
     688         pipelining of work within the Parabix Subsystem is not worthwhile if the cost of
     689         application logic is little as 33% of the end-to-end cost using Xerces. To achieve benefits
     690         of further parallelization with multi-core technology, there would need to be reductions in
     691         the cost of application logic that could match reductions in core parsing cost. </p>
     692</div>
     693<div class="section" id="idp475920">
     694<h2 class="title" style="clear: both">Performance</h2>
     695<p id="idp476592"> We evaluate Xerces-C++ 3.1.1, icXML, icXML-p against two benchmarking applications: the
     696         Xerces C++ SAXCount sample application, and a real world GML to SVG transformation
     697         application. We investigated XML parser performance using an Intel Core i7 quad-core (Sandy
     698         Bridge) processor (3.40GHz, 4 physical cores, 8 threads (2 per core), 32+32 kB (per core)
     699         L1 cache, 256 kB (per core) L2 cache, 8 MB L3 cache) running the 64-bit version of Ubuntu
     700         12.04 (Linux). </p>
     701<p id="idp477504"> We analyzed the execution profiles of each XML parser using the performance counters
     702         found in the processor. We chose several key hardware events that provide insight into the
     703         profile of each application and indicate if the processor is doing useful work. The set of
     704         events included in our study are: processor cycles, branch instructions, branch
     705         mispredictions, and cache misses. The Performance Application Programming Interface (PAPI)
     706         Version 5.5.0 \cite{papi} toolkit was installed on the test system to facilitate the
     707         collection of hardware performance monitoring statistics. In addition, we used the Linux
     708         perf \cite{perf} utility to collect per core hardware events. </p>
     709<div class="section" id="idp478640">
     710<h3 class="title" style="clear: both">Xerces C++ SAXCount</h3>
     711<p id="idp479312"> Xerces comes with sample applications that demonstrate salient features of the
     712            parser. SAXCount is the simplest such application: it counts the elements, attributes
     713            and characters of a given XML file using the (event based) SAX API and prints out the
     714            totals. </p>
     715<p id="idp480016"> Table \ref{XMLDocChars} shows the document characteristics of the XML input files
     716            selected for the Xerces C++ SAXCount benchmark. The jaw.xml represents document-oriented
     717            XML inputs and contains the three-byte and four-byte UTF-8 sequence required for the
     718            UTF-8 encoding of Japanese characters. The remaining data files are data-oriented XML
     719            documents and consist entirely of single byte encoded ASCII characters.
     720  <div class="table-wrapper" id="idp480752">
     721<p class="title">Table II</p>
     722<div class="caption"><p id="idp481264">XML Document Characteristics</p></div>
     723<table class="table">
     724<colgroup span="1">
     725<col align="left" valign="top" span="1">
     726<col align="left" valign="top" span="1">
     727<col align="left" valign="top" span="1">
     728<col align="left" valign="top" span="1">
     729<col align="left" valign="top" span="1">
     730</colgroup>
     731<tbody>
     732<tr>
     733<td>File Name           </td>
     734<td> jaw.xml            </td>
     735<td> road.gml   </td>
     736<td> po.xml     </td>
     737<td> soap.xml </td>
     738</tr>
     739<tr>
     740<td>File Type           </td>
     741<td> document           </td>
     742<td> data               </td>
     743<td> data               </td>
     744<td> data        </td>
     745</tr>
     746<tr>
     747<td>File Size (kB)              </td>
     748<td> 7343                       </td>
     749<td> 11584      </td>
     750<td> 76450              </td>
     751<td> 2717 </td>
     752</tr>
     753<tr>
     754<td>Markup Item Count   </td>
     755<td> 74882              </td>
     756<td> 280724     </td>
     757<td> 4634110    </td>
     758<td> 18004 </td>
     759</tr>
     760<tr>
     761<td>Markup Density              </td>
     762<td> 0.13                       </td>
     763<td> 0.57       </td>
     764<td> 0.76               </td>
     765<td> 0.87       </td>
     766</tr>
     767</tbody>
     768</table>
     769</div>           
    475770</p>
    476 <p id="idp534960">
    477 
    478 
    479 
    480 Xerces, like all traditional parsers, processes XML documents sequentially a byte-at-a-time from the
    481 first to the last byte of input data. Each byte passes through several processing layers and is
    482 classified and eventually validated within the context of the document state.
    483 This introduces implicit dependencies between the various tasks within the application that make it
    484 difficult to optimize for performance.
    485 As a complex software system, no one feature dominates the overall parsing performance.
    486 Figure \ref{fig:xerces-profile} shows the execution time profile of the top ten functions in a typical run.
    487 Even if it were possible, Amdahl's Law dictates that tackling any one of these functions for
    488 parallelization in isolation would only produce a minute improvement in performance.
    489 Unfortunately, early investigation into these functions found
    490 that incorporating speculation-free thread-level parallelization was impossible
    491 and they were already performing well in their given tasks;
    492 thus only trivial enhancements were attainable.
    493 In order to obtain a systematic acceleration of Xerces,
    494 it should be expected that a comprehensive restructuring
    495 is required, involving all aspects of the parser.
    496 </p>
    497 <p id="idp538344">
    498 
    499 
    500 
    501 
    502 </p>
    503 </div>
    504 <div class="section" id="idp539296">
    505 <h3 class="title" style="clear: both">The Parabix Framework</h3>
    506 <p id="idp539616">
    507 The Parabix (parallel bit stream) framework is a transformative approach to XML parsing
    508 (and other forms of text processing.) The key idea is to exploit the availability of wide
    509 SIMD registers (e.g., 128-bit) in commodity processors to represent data from long blocks
    510 of input data by using one register bit per single input byte.
    511 To facilitate this, the input data is first transposed into a set of basis bit streams.
    512 In
    513 
    514 Boolean-logic operations\footnote{∧, \√ and ¬ denote the boolean AND, OR and NOT operators.}
    515 are used to classify the input bits into a set of <span class="ital">character-class bit streams</span>, which identify key
    516 characters (or groups of characters) with a <code class="code">1</code>.
    517 For example, one of the fundamental characters in XML is a left-angle bracket.
    518 A character is an <code class="code">'&lt;' if and only if ¬(b<sub>0</sub> √ b<sub>1</sub>) ∧ (b<sub>2</sub> ∧
    519 b<sub>3</sub>) ∧ (b<sub>4</sub> ∧ b<sub>5</sub>) ∧ ¬ (b<sub>6</sub> √ b<sub>7</sub>) = 1</code>.
    520 Similarly, a character is numeric,
    521 <code class="code">[0-9] if and only if ¬(b<sub>0</sub> √ b<sub>1</sub>) ∧ (b<sub>2</sub> ∧ b<sub>3</sub>) ∧ ¬(b<sub>4</sub> ∧ (b<sub>5</sub> √ b<sub>6</sub>))</code>.
    522 An important observation here is that ranges of characters may
    523 require fewer operations than individual characters and
    524 
    525 multiple classes can share the classification cost.
    526 </p>
    527 <p id="idp682296">
    528 
    529 </p>
    530 <p id="idp683752">
    531 Consider, for example, the XML source data stream shown in the first line of .
    532 The remaining lines of this figure show several parallel bit streams that are computed in Parabix-style
    533 parsing, with each bit of each stream in one-to-one correspondence to the source character code units
    534 of the input stream.
    535 For clarity, 1 bits are denoted with 1 in each stream and 0 bits are represented as underscores.
    536 The first bit stream shown is that for the opening
    537 angle brackets that represent tag openers in XML.
    538 The second and third streams show a partition of the
    539 tag openers into start tag marks and end tag marks
    540 depending on the character immediately following the
    541 opener (i.e., <code class="code">"/"</code>) or not.  The remaining three
    542 lines show streams that can be computed in subsequent
    543 parsing (using the technique
    544 of bitstream addition \cite{cameron-EuroPar2011}), namely streams marking the element names,
    545 attribute names and attribute values of tags. 
    546 </p>
    547 <p id="idp687088">
    548 Two intuitions may help explain how the Parabix approach can lead
    549 to improved XML parsing performance. The first is that
    550 the use of the full register width offers a considerable
    551 information advantage over sequential byte-at-a-time
    552 parsing.  That is, sequential processing of bytes
    553 uses just 8 bits of each register, greatly limiting the
    554 processor resources that are effectively being used at any one time.
    555 The second is that byte-at-a-time loop scanning loops are actually
    556 often just computing a single bit of information per iteration:
    557 is the scan complete yet?
    558 Rather than computing these individual decision-bits, an approach that computes
    559 many of them in parallel (e.g., 128 bytes at a time using 128-bit registers)
    560 should provide substantial benefit.
    561 </p>
    562 <p id="idp688048">
    563 Previous studies have shown that the Parabix approach improves many aspects of XML processing,
    564 including transcoding \cite{Cameron2008}, character classification and validation,
    565 tag parsing and well-formedness checking. 
    566 The first Parabix parser used processor bit scan instructions to considerably accelerate
    567 sequential scanning loops for individual characters \cite{CameronHerdyLin2008}.
    568 Recent work has incorporated a method of parallel
    569 scanning using bitstream addition \cite{cameron-EuroPar2011}, as
    570 well as combining SIMD methods with 4-stage pipeline parallelism to further improve
    571 throughput \cite{HPCA2012}.
    572 Although these research prototypes handled the full syntax of schema-less XML documents,
    573 they lacked the functionality required by full XML parsers.
    574 </p>
    575 <p id="idp689800">
    576 Commercial XML processors support transcoding of multiple character sets and can parse and
    577 validate against multiple document vocabularies.
    578 Additionally, they provide API facilities beyond those found in research prototypes,
    579 including the widely used SAX, SAX2 and DOM interfaces.
    580 </p>
    581 </div>
    582 <div class="section" id="idp690352">
    583 <h3 class="title" style="clear: both">Sequential vs. Parallel Paradigm</h3>
    584 <p id="idp690672">
    585 Xerces—like all traditional XML parsers—processes XML documents sequentially.
    586 Each character is examined to distinguish between the
    587 XML-specific markup, such as a left angle bracket <code class="code">&lt;</code>, and the
    588 content held within the document. 
    589 As the parser progresses through the document, it alternates between markup scanning,
    590 validation and content processing modes.
    591 </p>
    592 <p id="idp691744">
    593 In other words, Xerces belongs to an equivalent class applications termed FSM applications\footnote{
    594   Herein FSM applications are considered software systems whose behaviour is defined by the inputs,
    595   current state and the events associated with transitions of states.}.
    596 Each state transition indicates the processing context of subsequent characters.
    597 Unfortunately, textual data tends to be unpredictable and any character could induce a state transition.
    598 </p>
    599 <p id="idp692408">
    600 Parabix-style XML parsers utilize a concept of layered processing.
    601 A block of source text is transformed into a set of lexical bitstreams,
    602 which undergo a series of operations that can be grouped into logical layers,
    603 e.g., transposition, character classification, and lexical analysis.
    604 Each layer is pipeline parallel and require neither speculation nor pre-parsing stages\cite{HPCA2012}.
    605 To meet the API requirements of the document-ordered Xerces output,
    606 the results of the Parabix processing layers must be interleaved to produce the equivalent behaviour.
    607 </p>
    608 </div>
    609 </div>
    610 <div class="section" id="idp693304">
    611 <h2 class="title" style="clear: both">Architecture</h2>
    612 <div class="section" id="idp693624">
    613 <h3 class="title" style="clear: both">Overview</h3>
    614 <p id="idp694072">
    615 icXML is more than an optimized version of Xerces. Many components were grouped, restructured and
    616 rearchitected with pipeline parallelism in mind.
    617 In this section, we highlight the core differences between the two systems.
    618 As shown in Figure \ref{fig:xerces-arch}, Xerces
    619 is comprised of five main modules: the transcoder, reader, scanner, namespace binder, and validator.
    620 The <span class="ital">Transcoder</span> converts source data into UTF-16 before Xerces parses it as XML;
    621 the majority of the character set encoding validation is performed as a byproduct of this process.
    622 The <span class="ital">Reader</span> is responsible for the streaming and buffering of all raw and transcoded (UTF-16) text.
    623 It tracks the current line/column position,
    624 
    625 performs line-break normalization and validates context-specific character set issues,
    626 such as tokenization of qualified-names.
    627 The <span class="ital">Scanner</span> pulls data through the reader and constructs the intermediate representation (IR)
    628 of the document; it deals with all issues related to entity expansion, validates
    629 the XML well-formedness constraints and any character set encoding issues that cannot
    630 be completely handled by the reader or transcoder (e.g., surrogate characters, validation
    631 and normalization of character references, etc.)
    632 The <span class="ital">Namespace Binder</span> is a core piece of the element stack.
    633 It handles namespace scoping issues between different XML vocabularies.
    634 This allows the scanner to properly select the correct schema grammar structures.
    635 The <span class="ital">Validator</span> takes the IR produced by the Scanner (and
    636 potentially annotated by the Namespace Binder) and assesses whether the final output matches
    637 the user-defined DTD and schema grammar(s) before passing it to the end-user.
    638 </p>
    639 <p id="idp697696">
    640 
    641 </p>
    642 <p id="idp697952">
    643 In icXML functions are grouped into logical components.
    644 As shown in Figure \ref{fig:icxml-arch}, two major categories exist: (1) the Parabix Subsystem and (2) the Markup Processor.
    645 All tasks in (1) use the Parabix Framework \cite{HPCA2012}, which represents data as a set of parallel bitstreams.
    646 The <span class="ital">Character Set Adapter</span>, discussed in Section \ref{arch:character-set-adapter},
    647 mirrors Xerces's Transcoder duties; however instead of producing UTF-16 it produces a
    648 set of lexical bitstreams, similar to those shown in Figure \ref{fig:parabix1}.
    649 These lexical bitstreams are later transformed into UTF-16 in the Content Stream Generator,
    650 after additional processing is performed.
    651 The first precursor to producing UTF-16 is the <span class="ital">Parallel Markup Parser</span> phase.
    652 It takes the lexical streams and produces a set of marker bitstreams in which a 1-bit identifies
    653 significant positions within the input data. One bitstream for each of the critical piece of information is created, such as
    654 the beginning and ending of start tags, end tags, element names, attribute names, attribute values and content.
    655 Intra-element well-formedness validation is performed as an artifact of this process.
    656 Like Xerces, icXML must provide the Line and Column position of each error.
    657 The <span class="ital">Line-Column Tracker</span> uses the lexical information to keep track of the document position(s) through the use of an
    658 optimized population count algorithm, described in Section \ref{section:arch:errorhandling}.
    659 From here, two data-independent branches exist: the Symbol Resolver and Content Preparation Unit.
    660 </p>
    661 <p id="idp700912">
    662 A typical XML file contains few unique element and attribute names—but each of them will occur frequently.
    663 icXML stores these as distinct data structures, called symbols, each with their own global identifier (GID).
    664 Using the symbol marker streams produced by the Parallel Markup Parser, the <span class="ital">Symbol Resolver</span> scans through
    665 the raw data to produce a sequence of GIDs, called the <span class="ital">symbol stream</span>.
    666 </p>
    667 <p id="idp702552">
    668 The final components of the Parabix Subsystem are the <span class="ital">Content Preparation Unit</span> and <span class="ital">Content Stream Generator</span>.
    669 The former takes the (transposed) basis bitstreams and selectively filters them, according to the
    670 information provided by the Parallel Markup Parser, and the latter transforms the
    671 filtered streams into the tagged UTF-16 <span class="ital">content stream</span>, discussed in Section \ref{section:arch:contentstream}.
    672 </p>
    673 <p id="idp704040">
    674 Combined, the symbol and content stream form icXML's compressed IR of the XML document.
    675 The <span class="ital">Markup Processor</span>~parses the IR to validate and produce the sequential output for the end user.
    676 The <span class="ital">Final WF checker</span> performs inter-element well-formedness validation that would be too costly
    677 to perform in bit space, such as ensuring every start tag has a matching end tag.
    678 Xerces's namespace binding functionality is replaced by the <span class="ital">Namespace Processor</span>. Unlike Xerces,
    679 it is a discrete phase that produces a series of URI identifiers (URI IDs), the <span class="ital">URI stream</span>, which are
    680 associated with each symbol occurrence.
    681 This is discussed in Section \ref{section:arch:namespacehandling}.
    682 Finally, the <span class="ital">Validation</span> layer implements the Xerces's validator.
    683 However, preprocessing associated with each symbol greatly reduces the work of this stage.
    684 </p>
    685 <p id="idp706600">
    686 
    687 </p>
    688 </div>
    689 <div class="section" id="idp706920">
    690 <h3 class="title" style="clear: both">Character Set Adapters</h3>
    691 <p id="idp707568">
    692 In Xerces, all input is transcoded into UTF-16 to simplify the parsing costs of Xerces itself and
    693 provide the end-consumer with a single encoding format.
    694 In the important case of UTF-8 to UTF-16 transcoding, the transcoding costs can be significant,
    695 because of the need to decode and classify each byte of input, mapping variable-length UTF-8
    696 byte sequences into 16-bit UTF-16 code units with bit manipulation operations.   
    697 In other cases, transcoding may involve table look-up operations for each byte of input.  In any case,
    698 transcoding imposes at least a cost of buffer copying.
    699 </p>
    700 <p id="idp708352">
    701 In icXML, however,  the concept of Character Set Adapters (CSAs) is used to minimize transcoding costs.
    702 Given a specified input encoding, a CSA is responsible for checking that
    703 input code units represent valid characters, mapping the characters of the encoding into
    704 the appropriate bitstreams for XML parsing actions (i.e., producing the lexical item
    705 streams), as well as supporting ultimate transcoding requirements.   All of this work
    706 is performed using the parallel bitstream representation of the source input.
    707 </p>
    708 <p id="idp709072">
    709 An important observation is that many character sets are an
    710 extension to the legacy 7-bit ASCII character set.  This includes the
    711 various ISO Latin character sets, UTF-8, UTF-16 and many others.
    712 Furthermore, all significant characters for parsing XML are confined to the
    713 ASCII repertoire.   Thus, a single common set of lexical item calculations
    714 serves to compute lexical item streams for all such ASCII-based character sets.
    715 </p>
    716 <p id="idp709704">
    717 A second observation is that—regardless of which character set is used—quite
    718 often all of the characters in a particular block of input will be within the ASCII range.
    719 This is a very simple test to perform using the bitstream representation, simply confirming that the
    720 bit 0 stream is zero for the entire block.   For blocks satisfying this test,
    721 all logic dealing with non-ASCII characters can simply be skipped.
    722 Transcoding to UTF-16 becomes trivial as the high eight bitstreams of the
    723 UTF-16 form are each set to zero in this case.
    724 </p>
    725 <p id="idp711256">
    726 A third observation is that repeated transcoding of the names of XML
    727 elements, attributes and so on can be avoided by using a look-up mechanism.
    728 That is, the first occurrence of each symbol is stored in a look-up
    729 table mapping the input encoding to a numeric symbol ID.   Transcoding
    730 of the symbol is applied at this time.  Subsequent look-up operations
    731 can avoid transcoding by simply retrieving the stored representation.
    732 As symbol look up is required to apply various XML validation rules,
    733 there is achieves the effect of transcoding each occurrence without
    734 additional cost.
    735 </p>
    736 <p id="idp712040">
    737 The cost of individual character transcoding is avoided whenever a block of input is
    738 confined to the ASCII subset and for all but the first occurrence of any XML element or attribute name.
    739 Furthermore, when transcoding is required, the parallel bitstream representation
    740 supports efficient transcoding operations.   
    741 In the important case of UTF-8 to UTF-16 transcoding, the corresponding UTF-16 bitstreams
    742 can be calculated in bit parallel fashion based on UTF-8 streams \cite{Cameron2008},
    743 and all but the final bytes of multi-byte sequences can be marked for deletion as
    744 discussed in the following subsection.
    745 In other cases, transcoding within a block only need be applied for non-ASCII
    746 bytes, which are conveniently identified by iterating through the bit 0 stream
    747 using bit scan operations.
    748 </p>
    749 </div>
    750 <div class="section" id="idp509944">
    751 <h3 class="title" style="clear: both">Combined Parallel Filtering</h3>
    752 <p id="idp510296">
    753 As just mentioned, UTF-8 to UTF-16 transcoding involves marking
    754 all but the last bytes of multi-byte UTF-8 sequences as
    755 positions for deletion.   For example, the two
    756 Chinese characters <code class="code">䜠奜</code>
    757 are represented as two three-byte UTF-8 sequences <code class="code">E4 BD A0</code>
    758 and <code class="code">E5 A5 BD</code> while the UTF-16 representation must be
    759 compressed down to the two code units <code class="code">4F60</code> and <code class="code">597D</code>.
    760 In the bit parallel representation, this corresponds to a reduction
    761 from six bit positions representing UTF-8 code units (bytes)
    762 down to just two bit positions representing UTF-16 code units
    763 (double bytes).   This compression may be achieved by
    764 arranging to calculate the correct UTF-16 bits at the
    765 final position of each sequence and creating a deletion
    766 mask to mark the first two bytes of each 3-byte sequence
    767 for deletion.   In this case, the portion of the mask
    768 corresponding to these input bytes is the bit sequence
    769 <code class="code">110110</code>.  Using this approach, transcoding may then be
    770 completed by applying parallel deletion and inverse transposition of the
    771 UTF-16 bitstreams\cite{Cameron2008}.
    772 </p>
    773 <p id="idp512784">
    774 
    775 
    776 
    777 
    778 
    779 
    780 
    781 
    782 
    783 </p>
    784 <p id="idp515240">
    785 Rather than immediately paying the
    786 costs of deletion and transposition just for transcoding,
    787 however, icXML defers these steps so that the deletion
    788 masks for several stages of processing may be combined.
    789 In particular, this includes core XML requirements
    790 to normalize line breaks and to replace character
    791 reference and entity references by their corresponding
    792 text.   In the case of line break normalization,
    793 all forms of line breaks, including bare carriage
    794 returns (CR), line feeds (LF) and CR-LF combinations
    795 must be normalized to a single LF character in
    796 each case.   In icXML, this is achieved by
    797 first marking CR positions, performing two
    798 bit parallel operations to transform the marked
    799 CRs into LFs, and then marking for deletion any
    800 LF that is found immediately after the marked CR
    801 as shown by the Pablo source code in Figure \ref{fig:LBnormalization}.
    802 
    803 </p>
    804 <p id="idp517744">
    805 In essence, the deletion masks for transcoding and
    806 for line break normalization each represent a bitwise
    807 filter; these filters can be combined using bitwise-or
    808 so that the parallel deletion algorithm need only be
    809 applied once.
    810 </p>
    811 <p id="idp517936">
    812 A further application of combined filtering
    813 is the processing of XML character and entity
    814 references.   Consider, for example, the references <code class="code">&amp;</code> or <code class="code">&lt;</code>.
    815 which must be replaced in XML processing with 
    816 the single <code class="code">&amp;</code> and <code class="code">&lt;</code> characters, respectively.
    817 The approach in icXML is to mark all but the first character
    818 positions of each reference for deletion, leaving a
    819 single character position unmodified.  Thus, for the
    820 references <code class="code">&amp;</code> or <code class="code">&lt;</code> the
    821 masks <code class="code">01111</code> and <code class="code">011111</code> are formed and
    822 combined into the overall deletion mask.   After the
    823 deletion and inverse transposition operations are finally
    824 applied, a post-processing step inserts the proper character
    825 at these positions.   One note about this process is
    826 that it is speculative; references are assumed to generally
    827 be replaced by a single UTF-16 code unit.   In the case,
    828 that this is not true, it is addressed in post-processing.
    829 </p>
    830 <p id="idp732200">
    831 The final step of combined filtering occurs during
    832 the process of reducing markup data to tag bytes
    833 preceding each significant XML transition as described
    834 in section~\ref{section:arch:contentstream}.  Overall, icXML
    835 avoids separate buffer copying operations for each of the
    836 these filtering steps, paying the cost of parallel
    837 deletion and inverse transposition only once. 
    838 Currently, icXML employs the parallel-prefix compress algorithm
    839 of Steele~\cite{HackersDelight}  Performance
    840 is independent of the number of positions deleted.
    841 Future versions of icXML are expected to
    842 take advantage of the parallel extract operation~\cite{HilewitzLee2006}
    843 that Intel is now providing in its Haswell architecture.
    844 </p>
    845 </div>
    846 <div class="section" id="idp733168">
    847 <h3 class="title" style="clear: both">Content Stream</h3>
    848 <p id="idp733512">
    849 A relatively-unique concept for icXML is the use of a filtered content stream.
    850 Rather that parsing an XML document in its original format, the input is transformed
    851 into one that is easier for the parser to iterate through and produce the sequential
    852 output.
    853 In , the source data
    854 
    855 is transformed into
    856 
    857 
    858 through the parallel filtering algorithm, described in section \ref{sec:parfilter}.
    859 </p>
    860 <p id="idp735224">
    861 Combined with the symbol stream, the parser traverses the content stream to effectively
    862 reconstructs the input document in its output form.
    863 The initial <span class="ital">0</span> indicates an empty content string. The following <code class="code">&gt;</code>
    864 indicates that a start tag without any attributes is the first element in this text and
    865 the first unused symbol, <code class="code">document</code>, is the element name.
    866 Succeeding that is the content string <code class="code">fee</code>, which is null-terminated in accordance
    867 with the Xerces API specification. Unlike Xerces, no memory-copy operations
    868 are required to produce these strings, which as Figure~\ref{fig:xerces-profile} shows
    869 accounts for 6.83% of Xerces's execution time.
    870 Additionally, it is cheap to locate the terminal character of each string:
    871 using the String End bitstream, the Parabix Subsystem can effectively calculate the offset of each
    872 null character in the content stream in parallel, which in turn means the parser can
    873 directly jump to the end of every string without scanning for it.
    874 </p>
    875 <p id="idp737880">
    876 Following <code class="code">'fee'</code> is a <code class="code">=</code>, which marks the existence of an attribute.
    877 Because all of the intra-element was performed in the Parabix Subsystem, this must be a legal attribute.
    878 Since attributes can only occur within start tags and must be accompanied by a textual value,
    879 the next symbol in the symbol stream must be the element name of a start tag,
    880 and the following one must be the name of the attribute and the string that follows the <code class="code">=</code> must be its value.
    881 However, the subsequent <code class="code">=</code> is not treated as an independent attribute because the parser has yet to
    882 read a <code class="code">&gt;</code>, which marks the end of a start tag. Thus only one symbol is taken from the symbol stream and
    883 it (along with the string value) is added to the element.
    884 Eventually the parser reaches a <code class="code">/</code>, which marks the existence of an end tag. Every end tag requires an
    885 element name, which means they require a symbol. Inter-element validation whenever an empty tag is detected to
    886 ensure that the appropriate scope-nesting rules have been applied.
    887 </p>
    888 </div>
    889 <div class="section" id="idp740320">
    890 <h3 class="title" style="clear: both">Namespace Handling</h3>
    891 <p id="idp740968">
    892 In XML, namespaces prevents naming conflicts when multiple vocabularies are used together.
    893 It is especially important when a vocabulary application-dependant meaning, such as when
    894 XML or SVG documents are embedded within XHTML files.
    895 Namespaces are bound to uniform resource identifiers (URIs), which are strings used to identify
    896 specific names or resources.
    897 On line 1 of Figure \ref{fig:namespace1}, the <code class="code">xmlns</code> attribute instructs the XML
    898 processor to bind the prefix <code class="code">p</code> to the URI '<code class="code">pub.net</code>' and the default (empty)
    899 prefix to <code class="code">book.org</code>. Thus to the XML processor, the <code class="code">title</code> on line 2 and
    900 <code class="code">price</code> on line 4 both read as <code class="code">"book.org":title</code> and <code class="code">"book.org":price</code>
    901 respectively, whereas on line 3 and 5, <code class="code">p:name</code> and <code class="code">price</code> are seen as
    902 <code class="code">"pub.net":name</code> and <code class="code">"pub.net":price</code>. Even though the actual element name
    903 <code class="code">price</code>, due to namespace scoping rules they are viewed as two uniquely-named items
    904 because the current vocabulary is determined by the namespace(s) that are in-scope.
    905 </p>
    906 <p id="idp744856">
    907 
    908 </p>
    909 <p id="idp745128">
    910 In both Xerces and icXML, every URI has a one-to-one mapping to a URI ID.
    911 These persist for the lifetime of the application through the use of a global URI pool.
    912 Xerces maintains a stack of namespace scopes that is pushed (popped) every time a start tag (end tag) occurs
    913 in the document. Because a namespace declaration affects the entire element, it must be processed prior to
    914 grammar validation. This is a costly process considering that a typical namespaced XML document only comes
    915 in one of two forms:
    916 (1) those that declare a set of namespaces upfront and never change them, and
    917 (2) those that repeatedly modify the namespaces in predictable patterns.
    918 </p>
    919 <p id="idp745320">
    920 For that reason, icXML contains an independent namespace stack and utilizes bit vectors to cheaply perform
    921 
    922 
    923 When a prefix is declared (e.g., <code class="code">xmlns:p="pub.net"</code>), a namespace binding is created that maps
    924 the prefix (which are assigned Prefix IDs in the symbol resolution process) to the URI.
    925 Each unique namespace binding has a unique namespace id (NSID) and every prefix contains a bit vector marking every
    926 NSID that has ever been associated with it within the document. For example, in Table \ref{tbl:namespace1}, the
    927 prefix binding set of <code class="code">p</code> and <code class="code">xmlns</code> would be <code class="code">01</code> and <code class="code">11</code> respectively.
    928 To resolve the in-scope namespace binding for each prefix, a bit vector of the currently visible namespaces is
    929 maintained by the system. By ANDing the prefix bit vector with the currently visible namespaces, the in-scope
    930 NSID can be found using a bit-scan intrinsic.
    931 A namespace binding table, similar to Table \ref{tbl:namespace1}, provides the actual URI ID.
    932 </p>
    933 <p id="idp749096">
    934 
    935 </p>
    936 <p id="idp749368">
    937 
    938 
    939 
    940 
    941 </p>
    942 <p id="idp750936">
    943 To ensure that scoping rules are adhered to,
    944 whenever a start tag is encountered, any modification to the currently visible namespaces is calculated and stored
    945 within a stack of bit vectors denoting the locally modified namespace bindings. When an end tag is found, the
    946 currently visible namespaces is XORed with the vector at the top of the stack.
    947 This allows any number of changes to be performed at each scope-level with a constant time.
    948 
    949 </p>
    950 </div>
    951 <div class="section" id="idp751920">
    952 <h3 class="title" style="clear: both">Error Handling</h3>
    953 <p id="idp752264">
    954 
    955 Xerces outputs error messages in two ways: through the programmer API and as thrown objects for fatal errors.
    956 As Xerces parses a file, it uses context-dependant logic to assess whether the next character is legal;
    957 if not, the current state determines the type and severity of the error.
    958 icXML emits errors in the similar manner—but how it discovers them is substantially different.
    959 Recall that in Figure \ref{fig:icxml-arch}, icXML is divided into two sections: the Parabix Subsystem and Markup Processor,
    960 each with its own system for detecting and producing error messages.
    961 </p>
    962 <p id="idp753400">
    963 Within the Parabix Subsystem, all computations are performed in parallel, a block at a time.
    964 Errors are derived as artifacts of bitstream calculations, with a 1-bit marking the byte-position of an error within a block,
    965 and the type of error is determined by the equation that discovered it.
    966 The difficulty of error processing in this section is that in Xerces the line and column number must be given
    967 with every error production. Two major issues exist because of this:
    968 (1) line position adheres to XML white-normalization rules; as such, some sequences of characters, e.g., a carriage return
    969 followed by a line feed, are counted as a single new line character.
    970 (2) column position is counted in characters, not bytes or code units;
    971 thus multi-code-unit code-points and surrogate character pairs are all counted as a single column position.
    972 Note that typical XML documents are error-free but the calculation of the
    973 line/column position is a constant overhead in Xerces.
    974 To reduce this, icXML pushes the bulk cost of the line/column calculation to the occurrence of the error and
    975 performs the minimal amount of book-keeping necessary to facilitate it.
    976 icXML leverages the byproducts of the Character Set Adapter (CSA) module and amalgamates the information
    977 within the Line Column Tracker (LCT).
    978 One of the CSA's major responsibilities is transcoding an input text.
    979 During this process, white-space normalization rules are applied and multi-code-unit and surrogate characters are detected
    980 and validated.
    981 A <span class="ital">line-feed bitstream</span>, which marks the positions of the normalized new lines characters, is a natural derivative of
    982 this process.
    983 Using an optimized population count algorithm, the line count can be summarized cheaply for each valid block of text.
    984 
    985 Column position is more difficult to calculate.
    986 It is possible to scan backwards through the bitstream of new line characters to determine the distance (in code-units)
    987 between the position between which an error was detected and the last line feed. However, this distance may exceed
    988 than the actual character position for the reasons discussed in (2).
    989 To handle this, the CSA generates a <span class="ital">skip mask</span> bitstream by ORing together many relevant bitstreams,
    990 such as all trailing multi-code-unit and surrogate characters, and any characters that were removed during the
    991 normalization process.
    992 When an error is detected, the sum of those skipped positions is subtracted from the distance to determine the actual
    993 column number.
    994 </p>
    995 <p id="idp757256">
    996 The Markup Processor is a state-driven machine. As such, error detection within it is very similar to Xerces.
    997 However, reporting the correct line/column is a much more difficult problem.
    998 The Markup Processor parses the content stream, which is a series of tagged UTF-16 strings.
    999 Each string is normalized in accordance with the XML specification.
    1000 All symbol data and unnecessary whitespace is eliminated from the stream;
    1001 thus its impossible to derive the current location using only the content stream.
    1002 To calculate the location, the Markup Processor borrows three additional pieces of information from the Parabix Subsystem:
    1003 the line-feed, skip mask, and a <span class="ital">deletion mask stream</span>, which is a bitstream denoting the (code-unit) position of every
    1004 datum that was suppressed from the source during the production of the content stream.
    1005 Armed with these, it is possible to calculate the actual line/column using
    1006 the same system as the Parabix Subsystem until the sum of the negated deletion mask stream is equal to the current position.
    1007 </p>
    1008 </div>
    1009 </div>
    1010 <div class="section" id="idp758960">
    1011 <h2 class="title" style="clear: both">Multithreading with Pipeline Parallelism</h2>
    1012 <p id="idp759352">
    1013 As discussed in section \ref{background:xerces}, Xerces can be considered a FSM application.
    1014 These are "embarrassingly sequential."\cite{Asanovic:EECS-2006-183} and notoriously difficult to parallelize.
    1015 However, icXML is designed to organize processing into logical layers.   
    1016 In particular, layers within the Parabix Subsystem are designed to operate
    1017 over significant segments of input data before passing their outputs on for
    1018 subsequent processing.  This fits well into the general model of pipeline
    1019 parallelism, in which each thread is in charge of a single module or group
    1020 of modules.
    1021 </p>
    1022 <p id="idp760888">
    1023 The most straightforward division of work in icXML is to separate
    1024 the Parabix Subsystem and the Markup Processor into distinct logical layers into two separate stages.
    1025 The resultant application, <span class="ital">icXML-p</span>, is a course-grained software-pipeline application.
    1026 In this case, the Parabix Subsystem thread <code class="code">T<sub>1</sub></code> reads 16k of XML input <code class="code">I</code> at a time and produces the
    1027 content, symbol and URI streams, then stores them in a pre-allocated shared data structure <code class="code">S</code>.
    1028 The Markup Processor thread <code class="code">T<sub>2</sub></code> consumes <code class="code">S</code>, performs well-formedness and grammar-based validation,
    1029 and the provides parsed XML data to the application through the Xerces API. 
    1030 The shared data structure is implemented using a ring buffer,
    1031 where every entry contains an independent set of data streams.
    1032 In the examples of Figure \ref{threads_timeline1} and \ref{threads_timeline2}, the ring buffer has four entries.
    1033 A lock-free mechanism is applied to ensure that each entry can only be read or written by one thread at the same time.
    1034 In Figure \ref{threads_timeline1} the processing time of <code class="code">T<sub>1</sub></code> is longer than <code class="code">T<sub>2</sub></code>;
    1035 thus <code class="code">T<sub>2</sub></code> always waits for <code class="code">T<sub>1</sub></code> to write to the shared memory.
    1036 Figure \ref{threads_timeline2} illustrates the scenario in which <code class="code">T<sub>1</sub></code> is faster
    1037 and must wait for <code class="code">T<sub>2</sub></code> to finish reading the shared data before it can reuse the memory space.
    1038 </p>
    1039 <p id="idp765984">
    1040 
    1041 </p>
    1042 <p id="idp766256">
    1043 Overall, our design is intended to benefit a range of applications.
    1044 Conceptually, we consider two design points.
    1045 The first, the parsing performed by the Parabix Subsystem dominates at 67% of the overall cost,
    1046 with the cost of application processing (including the driver logic within the Markup Processor) at 33%.   
    1047 The second is almost the opposite scenario, the cost of application processing dominates at 60%,
    1048 while the cost of XML parsing represents an overhead of 40%.
    1049 </p>
    1050 <p id="idp766448">
    1051 Our design is predicated on a goal of using the Parabix
    1052 framework to achieve a 50% to 100% improvement in the parsing engine itself.   
    1053 In a best case scenario,
    1054 a 100% improvement of the Parabix Subsystem for the design point in which
    1055 XML parsing dominates at 67% of the total application cost.
    1056 In this case, the single-threaded icXML should achieve a 1.5x speedup over Xerces
    1057 so that the total application cost reduces to 67% of the original. 
    1058 However, in icXML-p, our ideal scenario gives us two well-balanced threads
    1059 each performing about 33% of the original work.   
    1060 In this case, Amdahl's law predicts that we could expect up to a 3x speedup at best.
    1061 </p>
    1062 <p id="idp768176">
    1063 At the other extreme of our design range, we consider an application
    1064 in which core parsing cost is 40%.  Assuming the 2x speedup of
    1065 the Parabix Subsystem over the corresponding Xerces core, single-threaded
    1066 icXML delivers a 25% speedup.  However, the most significant
    1067 aspect of our two-stage multi-threaded design then becomes the
    1068 ability to hide the entire latency of parsing within the serial time
    1069 required by the application.  In this case, we achieve
    1070 an overall speedup in processing time by 1.67x.
    1071 </p>
    1072 <p id="idp768880">
    1073 Although the structure of the Parabix Subsystem allows division of the work into
    1074 several pipeline stages and has been demonstrated to be effective
    1075 for four pipeline stages in a research prototype \cite{HPCA2012},
    1076 our analysis here suggests that the further pipelining of work within
    1077 the Parabix Subsystem is not worthwhile if the cost of application logic is little as
    1078 33% of the end-to-end cost using Xerces.  To achieve benefits of
    1079 further parallelization with multi-core technology, there would
    1080 need to be reductions in the cost of application logic that
    1081 could match reductions in core parsing cost.
    1082 </p>
    1083 </div>
    1084 <div class="section" id="idp769752">
    1085 <h2 class="title" style="clear: both">Performance</h2>
    1086 <p id="idp770088">
    1087 We evaluate Xerces-C++ 3.1.1, icXML, icXML-p against two benchmarking applications:
    1088 the Xerces C++ SAXCount sample application,
    1089 and a real world GML to SVG transformation application.
    1090 We investigated XML parser performance using an Intel Core i7 quad-core
    1091 (Sandy Bridge) processor (3.40GHz, 4 physical cores, 8 threads (2 per core),
    1092 32+32 kB (per core) L1 cache,
    1093 256 kB (per core) L2 cache,
    1094 8 MB L3 cache) running the 64-bit version of Ubuntu 12.04 (Linux).
    1095 </p>
    1096 <p id="idp770752">
    1097 We analyzed the execution profiles of each XML parser
    1098 using the performance counters found in the processor.
    1099 We chose several key hardware events that provide insight into the profile of each
    1100 application and indicate if the processor is doing useful work. 
    1101 The set of events included in our study are:
    1102 processor cycles, branch instructions, branch mispredictions,
    1103 and cache misses. The Performance Application Programming Interface
    1104 (PAPI) Version 5.5.0 \cite{papi} toolkit
    1105 was installed on the test system to facilitate the
    1106 collection of hardware performance monitoring
    1107 statistics. In addition, we used the Linux perf \cite{perf} utility
    1108 to collect per core hardware events.
    1109 </p>
    1110 <div class="section" id="idp771632">
    1111 <h3 class="title" style="clear: both">Xerces C++ SAXCount</h3>
    1112 <p id="idp771976">
    1113 Xerces comes with sample applications that demonstrate salient features of the parser.
    1114 SAXCount is the simplest such application:
    1115 it counts the elements, attributes and characters of a given XML file using the (event based) SAX API
    1116 and prints out the totals.
    1117 </p>
    1118 <p id="idp772440">
    1119 
    1120 </p>
    1121 <p id="idp772712">
    1122 Table \ref{XMLDocChars} shows the document characteristics of the XML input
    1123 files selected for the Xerces C++ SAXCount benchmark. The jaw.xml
    1124 represents document-oriented XML inputs and contains the three-byte and four-byte UTF-8 sequence
    1125 required for the UTF-8 encoding of Japanese characters. The remaining data files are data-oriented
    1126 XML documents and consist entirely of single byte encoded ASCII characters.
    1127 </p>
    1128 <p id="idp772904">
    1129 A key predictor of the overall parsing performance of an XML file is markup density\footnote{
    1130   Markup Density: the ratio of markup bytes used to define the structure of the document vs. its file size.}.
    1131 This metric has substantial influence on the performance of traditional recursive descent XML parsers
    1132 because it directly corresponds to the number of state transitions that occur when parsing a document.
    1133 We use a mixture of document-oriented and
    1134 data-oriented XML files to analyze performance over a spectrum
    1135 of markup densities.
    1136 </p>
    1137 <p id="idp774656">
    1138 Figure \ref{perf_SAX} compares the performance of Xerces, icXML and pipelined icXML in terms of
    1139 CPU cycles per byte for the SAXCount application.
    1140 The speedup for icXML over Xerces is 1.3x to 1.8x.
    1141 With two threads on the multicore machine, icXML-p can achieve speedup up to 2.7x.
    1142 Xerces is substantially slowed by dense markup
    1143 but icXML is less affected through a reduction in branches and the use of parallel-processing techniques.
    1144 icXML-p performs better as markup-density increases because the work performed by each stage is
    1145 well balanced in this application.
    1146 </p>
    1147 <p id="idp775424">
    1148 
    1149 </p>
    1150 </div>
    1151 <div class="section" id="idp776016">
     771<p id="idp496800"> A key predictor of the overall parsing performance of an XML file is markup
     772            density\footnote{ Markup Density: the ratio of markup bytes used to define the structure
     773            of the document vs. its file size.}. This metric has substantial influence on the
     774            performance of traditional recursive descent XML parsers because it directly corresponds
     775            to the number of state transitions that occur when parsing a document. We use a mixture
     776            of document-oriented and data-oriented XML files to analyze performance over a spectrum
     777            of markup densities. </p>
     778<p id="idp497808"> Figure \ref{perf_SAX} compares the performance of Xerces, icXML and pipelined icXML
     779            in terms of CPU cycles per byte for the SAXCount application. The speedup for icXML over
     780            Xerces is 1.3x to 1.8x. With two threads on the multicore machine, icXML-p can achieve
     781            speedup up to 2.7x. Xerces is substantially slowed by dense markup but icXML is less
     782            affected through a reduction in branches and the use of parallel-processing techniques.
     783            icXML-p performs better as markup-density increases because the work performed by each
     784            stage is well balanced in this application. </p>
     785<p id="idp498848">
     786           
     787         </p>
     788</div>
     789<div class="section" id="idp499872">
    1152790<h3 class="title" style="clear: both">GML2SVG</h3>
    1153 <p id="idp776336"></p>
    1154 </div>
    1155 </div>
    1156 <div class="section" id="idp776592">
     791<p id="idp500512"></p>
     792</div>
     793</div>
     794<div class="section" id="idp501024">
    1157795<h2 class="title" style="clear: both">Conclusion and Future Work</h2>
    1158 <p id="idp776944">
    1159 This paper is the first case study documenting the significant
    1160 performance benefits that may be realized through the integration
    1161 of parallel bitstream technology into existing widely-used software libraries.
    1162 In the case of the Xerces-C++ XML parser, the
    1163 combined integration of SIMD and multicore parallelism was
    1164 shown capable of dramatic producing dramatic increases in
    1165 throughput and reductions in branch mispredictions and cache misses.
    1166 The modified parser, going under the name icXML is designed
    1167 to provide the full functionality of the original Xerces library
    1168 with complete compatibility of APIs.  Although substantial
    1169 re-engineering was required to realize the
    1170 performance potential of parallel technologies, this
    1171 is an important case study demonstrating the general
    1172 feasibility of these techniques.
    1173 </p>
    1174 <p id="idp778768">
    1175 The further development of icXML to move beyond 2-stage
    1176 pipeline parallelism is ongoing, with realistic prospects for
    1177 four reasonably balanced stages within the library.  For
    1178 applications such as GML2SVG which are dominated by time
    1179 spent on XML parsing, such a multistage pipelined parsing
    1180 library should offer substantial benefits. 
    1181 </p>
    1182 <p id="idp779304">
    1183 The example of XML parsing may be considered prototypical
    1184 of finite-state machines applications which have sometimes
    1185 been considered "embarassingly sequential" and so
    1186 difficult to parallelize that "nothing works."  So the
    1187 case study presented here should be considered an important
    1188 data point in making the case that parallelization can
    1189 indeed be helpful across a broad array of application types.
    1190 </p>
    1191 <p id="idp780416">
    1192 To overcome the software engineering challenges in applying
    1193 parallel bitstream technology to existing software systems,
    1194 it is clear that better library and tool support is needed.
    1195 The techniques used in the implementation of icXML and
    1196 documented in this paper could well be generalized for
    1197 applications in other contexts and automated through
    1198 the creation of compiler technology specifically supporting
    1199 parallel bitstream programming.
    1200 </p>
    1201 </div>
    1202 <div class="bibliography" id="idp781376">
     796<p id="idp501712"> This paper is the first case study documenting the significant performance benefits
     797         that may be realized through the integration of parallel bitstream technology into existing
     798         widely-used software libraries. In the case of the Xerces-C++ XML parser, the combined
     799         integration of SIMD and multicore parallelism was shown capable of dramatic producing
     800         dramatic increases in throughput and reductions in branch mispredictions and cache misses.
     801         The modified parser, going under the name icXML is designed to provide the full
     802         functionality of the original Xerces library with complete compatibility of APIs. Although
     803         substantial re-engineering was required to realize the performance potential of parallel
     804         technologies, this is an important case study demonstrating the general feasibility of
     805         these techniques. </p>
     806<p id="idp502992"> The further development of icXML to move beyond 2-stage pipeline parallelism is
     807         ongoing, with realistic prospects for four reasonably balanced stages within the library.
     808         For applications such as GML2SVG which are dominated by time spent on XML parsing, such a
     809         multistage pipelined parsing library should offer substantial benefits. </p>
     810<p id="idp503760"> The example of XML parsing may be considered prototypical of finite-state machines
     811         applications which have sometimes been considered "embarassingly
     812         sequential" and so difficult to parallelize that "nothing
     813         works." So the case study presented here should be considered an important data
     814         point in making the case that parallelization can indeed be helpful across a broad array of
     815         application types. </p>
     816<p id="idp505136"> To overcome the software engineering challenges in applying parallel bitstream
     817         technology to existing software systems, it is clear that better library and tool support
     818         is needed. The techniques used in the implementation of icXML and documented in this paper
     819         could well be generalized for applications in other contexts and automated through the
     820         creation of compiler technology specifically supporting parallel bitstream programming.
     821      </p>
     822</div>
     823<div class="bibliography" id="idp506624">
    1203824<h2 class="title" style="clear:both">Bibliography</h2>
    1204825<p class="bibliomixed" id="XMLChip09">[Leventhal and Lemoine 2009] Leventhal, Michael and
     
    1259880</div>
    1260881</div>
    1261 <div id="balisage-footer"><h3 style="font-family: serif; margin:0.25em; font-style: italic">Balisage Series on Markup Technologies</h3></div>
    1262 </div>
    1263 </body>
    1264 </html>
    1265882<div id="balisage-footer"><h3 style="font-family: serif; margin:0.25em">
    1266883<i>Balisage:</i> <small>The Markup Conference</small>
  • docs/Balisage13/Bal2013came0601/Bal2013came0601.xml

    r3044 r3047  
    179179            expected that a comprehensive restructuring is required, involving all aspects of the
    180180            parser. </para>
    181          <para>
    182             <!-- In order to obtain systematic acceleration of the Xerces parser,-->
    183             <!-- it should be expected that a comprehensive restructuring-->
    184             <!-- is required, involving all aspects of the parser.-->
    185             <!-- FIGURE
    186 \begin{figure}[h]
    187 \begin{tabular}{r|l}
    188 Time (\%) & Function Name \\
    189 \hline
    190 13.29   &       XMLUTF8Transcoder::transcodeFrom \\
    191 7.45    &       IGXMLScanner::scanCharData \\
    192 6.83    &       memcpy \\
    193 5.83    &       XMLReader::getNCName \\
    194 4.67    &       IGXMLScanner::buildAttList \\
    195 4.54    &       RefHashTableOf\verb|<>|::findBucketElem \\
    196 4.20    &       IGXMLScanner::scanStartTagNS \\
    197 3.75    &       ElemStack::mapPrefixToURI \\
    198 3.58    &       ReaderMgr::getNextChar \\
    199 3.20    &       IGXMLScanner::basicAttrValueScan \\
    200 \end{tabular}
    201 \caption{Execution Time of Top 10 Xerces Functions}
    202 \label {fig:xerces-profile}
    203 \end{figure}
    204 -->
    205          </para>
     181             <table>
     182                  <caption>
     183                     <para>Execution Time of Top 10 Xerces Functions</para>
     184                  </caption>
     185                  <colgroup>
     186                     <col align="left" valign="top"/>
     187                     <col align="left" valign="top"/>
     188                  </colgroup>
     189                  <thead><tr><th>Time (%) </th><th> Function Name </th></tr></thead>
     190                  <tbody>
     191<tr valign="top"><td>13.29      </td>   <td>XMLUTF8Transcoder::transcodeFrom </td></tr>
     192<tr valign="top"><td>7.45       </td>   <td>IGXMLScanner::scanCharData </td></tr>
     193<tr valign="top"><td>6.83       </td>   <td>memcpy </td></tr>
     194<tr valign="top"><td>5.83       </td>   <td>XMLReader::getNCName </td></tr>
     195<tr valign="top"><td>4.67       </td>   <td>IGXMLScanner::buildAttList </td></tr>
     196<tr valign="top"><td>4.54       </td>   <td>RefHashTableO&lt;&gt;::findBucketElem </td></tr>
     197<tr valign="top"><td>4.20       </td>   <td>IGXMLScanner::scanStartTagNS </td></tr>
     198<tr valign="top"><td>3.75       </td>   <td>ElemStack::mapPrefixToURI </td></tr>
     199<tr valign="top"><td>3.58       </td>   <td>ReaderMgr::getNextChar </td></tr>
     200<tr valign="top"><td>3.20       </td>   <td>IGXMLScanner::basicAttrValueScan </td></tr>
     201                  </tbody>
     202               </table>
    206203      </section>
    207204      <section>
     
    844841            and characters of a given XML file using the (event based) SAX API and prints out the
    845842            totals. </para>
    846          <para>
    847             <!-- TABLE
    848 \begin{table}
    849 \begin{center}
    850 {
    851 \footnotesize
    852 \begin{tabular}{|l||l|l|l|l|l|}
    853 \hline
    854 File Name               & jaw.xml               & road.gml      & po.xml        & soap.xml \\ \hline   
    855 File Type               & document              & data          & data          & data   \\ \hline     
    856 File Size (kB)          & 7343                  & 11584         & 76450         & 2717 \\ \hline
    857 Markup Item Count       & 74882                 & 280724        & 4634110       & 18004 \\ \hline
    858 Markup Density          & 0.13                  & 0.57          & 0.76          & 0.87  \\ \hline
    859 \end{tabular}
    860 }
    861 \end{center}
    862 \caption{XML Document Characteristics}
    863 \label{XMLDocChars}
    864 \end{table}
    865 -->
    866          </para>
     843
    867844         <para> Table \ref{XMLDocChars} shows the document characteristics of the XML input files
    868845            selected for the Xerces C++ SAXCount benchmark. The jaw.xml represents document-oriented
    869846            XML inputs and contains the three-byte and four-byte UTF-8 sequence required for the
    870847            UTF-8 encoding of Japanese characters. The remaining data files are data-oriented XML
    871             documents and consist entirely of single byte encoded ASCII characters. </para>
     848            documents and consist entirely of single byte encoded ASCII characters.
     849  <table>
     850                  <caption>
     851                     <para>XML Document Characteristics</para>
     852                  </caption>
     853                  <colgroup>
     854                     <col align="left" valign="top"/>
     855                     <col align="left" valign="top"/>
     856                     <col align="left" valign="top"/>
     857                     <col align="left" valign="top"/>
     858                     <col align="left" valign="top"/>
     859                  </colgroup>
     860                  <tbody>
     861 <tr><td>File Name              </td><td> jaw.xml               </td><td> road.gml      </td><td> po.xml        </td><td> soap.xml </td></tr>
     862<tr><td>File Type               </td><td> document              </td><td> data          </td><td> data          </td><td> data   </td></tr>     
     863<tr><td>File Size (kB)          </td><td> 7343                  </td><td> 11584         </td><td> 76450         </td><td> 2717 </td></tr>
     864<tr><td>Markup Item Count       </td><td> 74882                 </td><td> 280724        </td><td> 4634110       </td><td> 18004 </td></tr>
     865  <tr><td>Markup Density                </td><td> 0.13                  </td><td> 0.57          </td><td> 0.76          </td><td> 0.87  </td></tr>
     866                  </tbody>
     867               </table>           
     868</para>     
    872869         <para> A key predictor of the overall parsing performance of an XML file is markup
    873870            density\footnote{ Markup Density: the ratio of markup bytes used to define the structure
Note: See TracChangeset for help on using the changeset viewer.