Apr 19, 2013, 1:21:43 AM (6 years ago)

Added other xslt script to DO_XSTL script.

1 edited


  • docs/Balisage13/Bal2013came0601/Bal2013came0601.html

    r3041 r3043  
     1<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
    12<html lang="en">
    34<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
    5 <link rel="stylesheet" href="balisage-plain.css" type="text/css">
     6<link rel="stylesheet" href="balisage-proceedings.css" type="text/css">
    67<meta name="keywords" content="">
    1011<i>Balisage:</i> <small>The Markup Conference</small>
    12 <div lang="en" class="article">
    13 <div class="titlepage">
    14 <h2 class="article-title" id="idp26800"></h2>
     13<html lang="en">
     16<link rel="stylesheet" href="balisage-proceedings.css" type="text/css">
     17<meta name="generator" content="Balisage Conference Proceedings XSLT (v1.2)">
     18<script type="text/javascript">
     19    function toggle(folderID) {
     20      folder = document.getElementById("folder-"+folderID);
     21      icon = document.getElementById("icon-"+folderID)
     22      // need to:
     23      //   switch folder.style.display between 'none' and 'block'
     24      //   switch between collapse and expand icons
     25      if (folder.style.display != "block") {
     26        folder.style.display = "block";
     27        icon.src = "minus.png" ;
     28        icon.alt = "collapse" ;
     29      }
     30      else {
     31        folder.style.display = "none";
     32        icon.src = "plus.png" ;
     33        icon.alt = "expand" ;
     34      };
     35      return;
     36    }
     38   function hidecite(citeID) {
     39     cite = document.getElementById(citeID);
     40     cite.style.display = "none";
     41     return;
     42   }
     44   function showcite(citeID,anchorID) {
     45     cite = document.getElementById(citeID);
     47     citeLeft = cite.style.left;
     48     citeTop = cite.style.top;
     50     if (citeLeft != (getLeft(anchorID)+"px") ||
     51         citeTop != (getTop(anchorID)+"px")) {
     52       cite.style.display = "none";
     53     }
     55     if (cite.style.display != "table-cell") {
     56        movebox(citeID, anchorID);
     57        cite.style.display = "table-cell";
     58     }
     59     else {
     60       cite.style.display = "none";
     61     };
     62     return;
     63   }
     65   function movebox(citeID, anchorID) {
     67     cite = document.getElementById(citeID);
     69     // alert(cite.offsetWidth + " by " + cite.offsetHeight)
     71     horizontalOffset = getLeft(anchorID);
     72     // horizontalOffset = (inMain(anchorID)) ?
     73     // (horizontalOffset - 260) : (horizontalOffset + 20)
     74     // (horizontalOffset - (20 + cite.offsetWidth)) : (horizontalOffset + 20)
     76     verticalOffset = getTop(anchorID);
     77     // verticalOffset = (inMain(anchorID)) ?
     78     // (verticalOffset - 20) : (verticalOffset + 20)
     79     // (verticalOffset - (20 + cite.offsetHeight)) : (verticalOffset + 20)
     81     /*
     82     horizontalOffset = getAbsoluteLeft(anchorID) - getScrollLeft(anchorID) + 20;
     83     if (inMain(anchorID)) {
     84       horizontalOffset = horizontalOffset - 300;
     85     }
     86     verticalOffset = getAbsoluteTop(anchorID) - getScrollTop(anchorID) - 40;
     87     if (inMain(anchorID)) {
     88       verticalOffset = verticalOffset - 300;
     89     }
     90     */
     92     cite.style.left = horizontalOffset + "px";
     93     cite.style.top = verticalOffset + "px";
     94   }
     96   function getLeft(objectID) {
     97     var left = getAbsoluteLeft(objectID) - getScrollLeft(objectID);
     98     left = (inMain(objectID)) ? (left - 260) : (left + 20)   
     99     return left;
     100   }
     102   function getTop(objectID) {
     103     var top = getAbsoluteTop(objectID) - getScrollTop(objectID);
     104     top = (inMain(objectID)) ? (top - 50) : (top + 20)
     105     return top;     
     106   }
     108   function getAbsoluteLeft(objectId) {
     109   // Get an object left position from the upper left viewport corner
     110     o = document.getElementById(objectId)
     111     oLeft = o.offsetLeft            // Get left position from the parent object
     112     while(o.offsetParent!=null) {   // Parse the parent hierarchy up to the document element
     113       oParent = o.offsetParent    // Get parent object reference
     114       oLeft += oParent.offsetLeft // Add parent left position
     115       o = oParent
     116      }
     117    return oLeft
     118    }
     120    function getAbsoluteTop(objectId) {
     121    // Get an object top position from the upper left viewport corner
     122      o = document.getElementById(objectId)
     123      oTop = o.offsetTop            // Get top position from the parent object
     124      while(o.offsetParent!=null) { // Parse the parent hierarchy up to the document element
     125        oParent = o.offsetParent  // Get parent object reference
     126        oTop += oParent.offsetTop // Add parent top position
     127        o = oParent
     128      }
     129    return oTop
     130    }
     132   function getScrollLeft(objectId) {
     133     // Get a left scroll position
     134     o = document.getElementById(objectId)
     135     oLeft = o.scrollLeft            // Get left position from the parent object
     136     while(o.offsetParent!=null) {   // Parse the parent hierarchy up to the document element
     137       oParent = o.offsetParent    // Get parent object reference
     138       oLeft += oParent.scrollLeft // Add parent left position
     139       o = oParent
     140      }
     141    return oLeft
     142    }
     144    function getScrollTop(objectId) {
     145    // Get a top scroll position
     146      o = document.getElementById(objectId)
     147      oTop = o.scrollTop            // Get top position from the parent object
     148      while(o.offsetParent!=null) { // Parse the parent hierarchy up to the document element
     149        oParent = o.offsetParent  // Get parent object reference
     150        oTop += oParent.scrollTop // Add parent top position
     151        o = oParent
     152      }
     153    return oTop
     154    }
     156    function inMain(objectId) {
     157    // returns true if in div#main
     158      o = document.getElementById(objectId)
     159      while(o.parentNode != null) { // Parse the parent hierarchy up to div#main
     160        oParent = o.parentNode
     161        if (o.id == "main") { return true; }
     162        o = oParent;
     163      }
     164    return false;
     165    }
     168   /*
     169   function showcite(citeID) {
     170      cite = document.getElementById(citeID);
     171      if (cite.style.display != "table-cell") {
     172        cite.style.display = "table-cell";
     173      }
     174      else {
     175        cite.style.display = "none";
     176      };
     177      return;
     178    }
     179    */
     181      </script>
     184<div class="inline-citation" id="cite-XMLChip09" style="display:none;width: 240px">
     185<a class="quiet" href="javascript:hidecite('cite-XMLChip09')" style="font-size:90%"><img src="eks.png" alt="[x]" style="float:right;clear:both;margin:1px"></a><p style="margin:0ex">Leventhal, Michael and
     186         Eric Lemoine 2009. The XML chip at 6 years. Proceedings of International Symposium on
     187         Processing XML Efficiently 2009, Montréal.</p>
     189<div class="inline-citation" id="cite-Datapower09" style="display:none;width: 240px">
     190<a class="quiet" href="javascript:hidecite('cite-Datapower09')" style="font-size:90%"><img src="eks.png" alt="[x]" style="float:right;clear:both;margin:1px"></a><p style="margin:0ex">Salz, Richard,
     191         Heather Achilles, and David Maze. 2009. Hardware and software trade-offs in the IBM
     192         DataPower XML XG4 processor card. Proceedings of International Symposium on Processing XML
     193         Efficiently 2009, Montréal.</p>
     195<div class="inline-citation" id="cite-PPoPP08" style="display:none;width: 240px">
     196<a class="quiet" href="javascript:hidecite('cite-PPoPP08')" style="font-size:90%"><img src="eks.png" alt="[x]" style="float:right;clear:both;margin:1px"></a><p style="margin:0ex">Cameron, Robert D. 2007. A Case Study
     197         in SIMD Text Processing with Parallel Bit Streams UTF-8 to UTF-16 Transcoding. Proceedings
     198         of 13th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming 2008, Salt
     199         Lake City, Utah. On the Web at <a href="http://research.ihost.com/ppopp08/" class="link" target="_new">http://research.ihost.com/ppopp08/</a>.</p>
     201<div class="inline-citation" id="cite-CASCON08" style="display:none;width: 240px">
     202<a class="quiet" href="javascript:hidecite('cite-CASCON08')" style="font-size:90%"><img src="eks.png" alt="[x]" style="float:right;clear:both;margin:1px"></a><p style="margin:0ex">Cameron, Robert D.,
     203         Kenneth S Herdy, and Dan Lin. 2008. High Performance XML Parsing Using Parallel Bit Stream
     204         Technology. Proceedings of CASCON 2008. 13th ACM SIGPLAN Symposium on Principles and
     205         Practice of Parallel Programming 2008, Toronto.</p>
     207<div class="inline-citation" id="cite-SVGOpen08" style="display:none;width: 240px">
     208<a class="quiet" href="javascript:hidecite('cite-SVGOpen08')" style="font-size:90%"><img src="eks.png" alt="[x]" style="float:right;clear:both;margin:1px"></a><p style="margin:0ex">Herdy, Kenneth
     209         S., Robert D. Cameron and David S. Burggraf. 2008. High Performance GML to SVG
     210         Transformation for the Visual Presentation of Geographic Data in Web-Based Mapping Systems.
     211         Proceedings of SVG Open 6th International Conference on Scalable Vector Graphics,
     212         Nuremberg. On the Web at
     213            <a href="http://www.svgopen.org/2008/papers/74-HighPerformance_GML_to_SVG_Transformation_for_the_Visual_Presentation_of_Geographic_Data_in_WebBased_Mapping_Systems/" class="link" target="_new">http://www.svgopen.org/2008/papers/74-HighPerformance_GML_to_SVG_Transformation_for_the_Visual_Presentation_of_Geographic_Data_in_WebBased_Mapping_Systems/</a>.</p>
     215<div class="inline-citation" id="cite-Ross06" style="display:none;width: 240px">
     216<a class="quiet" href="javascript:hidecite('cite-Ross06')" style="font-size:90%"><img src="eks.png" alt="[x]" style="float:right;clear:both;margin:1px"></a><p style="margin:0ex">Ross, Kenneth A. 2006. Efficient hash
     217         probes on modern processors. Proceedings of ICDE, 2006. ICDE 2006, Atlanta. On the Web at
     218            <a href="http://www.cs.columbia.edu/~kar/pubsk/icde2007.pdf" class="link" target="_new">www.cs.columbia.edu/~kar/pubsk/icde2007.pdf</a>.</p>
     220<div class="inline-citation" id="cite-ASPLOS09" style="display:none;width: 240px">
     221<a class="quiet" href="javascript:hidecite('cite-ASPLOS09')" style="font-size:90%"><img src="eks.png" alt="[x]" style="float:right;clear:both;margin:1px"></a><p style="margin:0ex">Cameron, Robert D. and Dan
     222         Lin. 2009. Architectural Support for SWAR Text Processing with Parallel Bit Streams: The
     223         Inductive Doubling Principle. Proceedings of ASPLOS 2009, Washington, DC.</p>
     225<div class="inline-citation" id="cite-Wu08" style="display:none;width: 240px">
     226<a class="quiet" href="javascript:hidecite('cite-Wu08')" style="font-size:90%"><img src="eks.png" alt="[x]" style="float:right;clear:both;margin:1px"></a><p style="margin:0ex">Wu, Yu, Qi Zhang, Zhiqiang Yu and
     227         Jianhui Li. 2008. A Hybrid Parallel Processing for XML Parsing and Schema Validation.
     228         Proceedings of Balisage 2008, Montréal. On the Web at
     229            <a href="http://www.balisage.net/Proceedings/vol1/html/Wu01/BalisageVol1-Wu01.html" class="link" target="_new">http://www.balisage.net/Proceedings/vol1/html/Wu01/BalisageVol1-Wu01.html</a>.</p>
     231<div class="inline-citation" id="cite-u8u16" style="display:none;width: 240px">
     232<a class="quiet" href="javascript:hidecite('cite-u8u16')" style="font-size:90%"><img src="eks.png" alt="[x]" style="float:right;clear:both;margin:1px"></a><p style="margin:0ex">u8u16 - A High-Speed UTF-8 to UTF-16
     233         Transcoder Using Parallel Bit Streams Technical Report 2007-18. 2007. School of Computing
     234         Science Simon Fraser University, June 21 2007.</p>
     236<div class="inline-citation" id="cite-XML10" style="display:none;width: 240px">
     237<a class="quiet" href="javascript:hidecite('cite-XML10')" style="font-size:90%"><img src="eks.png" alt="[x]" style="float:right;clear:both;margin:1px"></a><p style="margin:0ex">Extensible Markup Language (XML) 1.0 (Fifth
     238         Edition) W3C Recommendation 26 November 2008. On the Web at
     239            <a href="http://www.w3.org/TR/REC-xml/" class="link" target="_new">http://www.w3.org/TR/REC-xml/</a>.</p>
     241<div class="inline-citation" id="cite-Unicode" style="display:none;width: 240px">
     242<a class="quiet" href="javascript:hidecite('cite-Unicode')" style="font-size:90%"><img src="eks.png" alt="[x]" style="float:right;clear:both;margin:1px"></a><p style="margin:0ex">The Unicode Consortium. 2009. On the Web at
     243            <a href="http://unicode.org/" class="link" target="_new">http://unicode.org/</a>.</p>
     245<div class="inline-citation" id="cite-Pex06" style="display:none;width: 240px">
     246<a class="quiet" href="javascript:hidecite('cite-Pex06')" style="font-size:90%"><img src="eks.png" alt="[x]" style="float:right;clear:both;margin:1px"></a><p style="margin:0ex"> Hilewitz, Y. and Ruby B. Lee.
     247         2006. Fast Bit Compression and Expansion with Parallel Extract and Parallel Deposit
     248         Instructions. Proceedings of the IEEE 17th International Conference on Application-Specific
     249         Systems, Architectures and Processors (ASAP), pp. 65-72, September 11-13, 2006.</p>
     251<div class="inline-citation" id="cite-InfoSet" style="display:none;width: 240px">
     252<a class="quiet" href="javascript:hidecite('cite-InfoSet')" style="font-size:90%"><img src="eks.png" alt="[x]" style="float:right;clear:both;margin:1px"></a><p style="margin:0ex">XML Information Set (Second Edition) W3C
     253         Recommendation 4 February 2004. On the Web at
     254         <a href="http://www.w3.org/TR/xml-infoset/" class="link" target="_new">http://www.w3.org/TR/xml-infoset/</a>.</p>
     256<div class="inline-citation" id="cite-Saxon" style="display:none;width: 240px">
     257<a class="quiet" href="javascript:hidecite('cite-Saxon')" style="font-size:90%"><img src="eks.png" alt="[x]" style="float:right;clear:both;margin:1px"></a><p style="margin:0ex">SAXON The XSLT and XQuery Processor. On the Web
     258         at <a href="http://saxon.sourceforge.net/" class="link" target="_new">http://saxon.sourceforge.net/</a>.</p>
     260<div class="inline-citation" id="cite-Kay08" style="display:none;width: 240px">
     261<a class="quiet" href="javascript:hidecite('cite-Kay08')" style="font-size:90%"><img src="eks.png" alt="[x]" style="float:right;clear:both;margin:1px"></a><p style="margin:0ex"> Kay, Michael Y. 2008. Ten Reasons Why Saxon
     262         XQuery is Fast, IEEE Data Engineering Bulletin, December 2008.</p>
     264<div class="inline-citation" id="cite-AElfred" style="display:none;width: 240px">
     265<a class="quiet" href="javascript:hidecite('cite-AElfred')" style="font-size:90%"><img src="eks.png" alt="[x]" style="float:right;clear:both;margin:1px"></a><p style="margin:0ex"> The Ælfred XML Parser. On the Web at
     266            <a href="http://saxon.sourceforge.net/aelfred.html" class="link" target="_new">http://saxon.sourceforge.net/aelfred.html</a>.</p>
     268<div class="inline-citation" id="cite-JNI" style="display:none;width: 240px">
     269<a class="quiet" href="javascript:hidecite('cite-JNI')" style="font-size:90%"><img src="eks.png" alt="[x]" style="float:right;clear:both;margin:1px"></a><p style="margin:0ex">Hitchens, Ron. Java NIO. O'Reilly, 2002.</p>
     271<div class="inline-citation" id="cite-Expat" style="display:none;width: 240px">
     272<a class="quiet" href="javascript:hidecite('cite-Expat')" style="font-size:90%"><img src="eks.png" alt="[x]" style="float:right;clear:both;margin:1px"></a><p style="margin:0ex">The Expat XML Parser.
     273            <a href="http://expat.sourceforge.net/" class="link" target="_new">http://expat.sourceforge.net/</a>.</p>
     275<div id="mast"><div class="content">
     276<h2 class="article-title" id="idp543912"></h2>
    15277<div class="author">
    16278<h3 class="author">Nigel Medforth</h3>
    61323<h5 class="author-email"><code class="email">&lt;<a class="email" href="mailto:"></a>&gt;</code></h5>
    63 <div class="abstract">
    64 <p class="title"><b>Abstract</b></p>
    65 <p id="idp28432">Prior research on the acceleration of XML processing
     325<div class="mast-box">
     326<p class="title"><a href="javascript:toggle('idp544272')" class="quiet"><img class="toc-icon" src="plus.png" alt="expand" id="icon-idp544272"></a> <span onclick="javascript:toggle('idp544272');return true">Abstract</span></p>
     327<div class="folder" id="folder-idp544272" style="display:none"><p id="idp544728">Prior research on the acceleration of XML processing
    66328using SIMD and multi-core parallelism has led to
    67329a number of interesting research prototypes.  This work
    77339When coupled with pipeline parallelism on dual core processors,
    78340improvements of 2x and beyond were realized.
    79 </p>
    80 </div>
    81 <hr>
    83343<div class="toc">
    84344<p><b>Table of Contents</b></p>
    86 <dt><span class="section"><a href="#idp15032" class="toc">Introduction</a></span></dt>
    87 <dt><span class="section"><a href="#idp15928" class="toc">Background</a></span></dt>
     346<dt><span class="section"><a href="#idp531824" class="toc">Introduction</a></span></dt>
     347<dt><span class="section"><a href="#idp532720" class="toc">Background</a></span></dt>
    89 <dt><span class="section"><a href="#idp16248" class="toc">Xerces C++ Structure</a></span></dt>
    90 <dt><span class="section"><a href="#idp22392" class="toc">The Parabix Framework</a></span></dt>
    91 <dt><span class="section"><a href="#idp178296" class="toc">Sequential vs. Parallel Paradigm</a></span></dt>
     349<dt><span class="section"><a href="#idp533040" class="toc">Xerces C++ Structure</a></span></dt>
     350<dt><span class="section"><a href="#idp539296" class="toc">The Parabix Framework</a></span></dt>
     351<dt><span class="section"><a href="#idp690352" class="toc">Sequential vs. Parallel Paradigm</a></span></dt>
    93 <dt><span class="section"><a href="#idp181248" class="toc">Architecture</a></span></dt>
     353<dt><span class="section"><a href="#idp693304" class="toc">Architecture</a></span></dt>
    95 <dt><span class="section"><a href="#idp181568" class="toc">Overview</a></span></dt>
    96 <dt><span class="section"><a href="#idp195064" class="toc">Character Set Adapters</a></span></dt>
    97 <dt><span class="section"><a href="#idm9304" class="toc">Combined Parallel Filtering</a></span></dt>
    98 <dt><span class="section"><a href="#idp221352" class="toc">Content Stream</a></span></dt>
    99 <dt><span class="section"><a href="#idp228528" class="toc">Namespace Handling</a></span></dt>
    100 <dt><span class="section"><a href="#idp240160" class="toc">Error Handling</a></span></dt>
     355<dt><span class="section"><a href="#idp693624" class="toc">Overview</a></span></dt>
     356<dt><span class="section"><a href="#idp706920" class="toc">Character Set Adapters</a></span></dt>
     357<dt><span class="section"><a href="#idp509944" class="toc">Combined Parallel Filtering</a></span></dt>
     358<dt><span class="section"><a href="#idp733168" class="toc">Content Stream</a></span></dt>
     359<dt><span class="section"><a href="#idp740320" class="toc">Namespace Handling</a></span></dt>
     360<dt><span class="section"><a href="#idp751920" class="toc">Error Handling</a></span></dt>
    102 <dt><span class="section"><a href="#idp247200" class="toc">Multithreading with Pipeline Parallelism</a></span></dt>
    103 <dt><span class="section"><a href="#idp257920" class="toc">Performance</a></span></dt>
     362<dt><span class="section"><a href="#idp758960" class="toc">Multithreading with Pipeline Parallelism</a></span></dt>
     363<dt><span class="section"><a href="#idp769752" class="toc">Performance</a></span></dt>
    105 <dt><span class="section"><a href="#idp259800" class="toc">Xerces C++ SAXCount</a></span></dt>
    106 <dt><span class="section"><a href="#idp264184" class="toc">GML2SVG</a></span></dt>
     365<dt><span class="section"><a href="#idp771632" class="toc">Xerces C++ SAXCount</a></span></dt>
     366<dt><span class="section"><a href="#idp776016" class="toc">GML2SVG</a></span></dt>
    108 <dt><span class="section"><a href="#idp264760" class="toc">Conclusion and Future Work</a></span></dt>
     368<dt><span class="section"><a href="#idp776592" class="toc">Conclusion and Future Work</a></span></dt>
    111 <div class="section" id="idp15032">
     371<div class="mast-box">
     372<p class="title"><a href="javascript:toggle('idp545808')" class="linkbox"><img class="toc-icon" src="plus.png" alt="expand" id="icon-idp545808"></a> <span onclick="javascript:toggle('idp545808');return true">Nigel Medforth</span></p>
     373<div class="folder" id="folder-idp545808" style="display:none">
     374<h5 class="author-email"><code class="email">&lt;<a class="email" href="mailto:nmedfort@sfu.ca">nmedfort@sfu.ca</a>&gt;</code></h5>
     375<div class="affiliation">
     376<p class="jobtitle">Developer</p>
     377<p class="orgname">International Characters Inc.</p>
     379<div class="affiliation">
     380<p class="jobtitle">Graduate Student, School of Computing Science</p>
     381<p class="orgname">Simon Fraser University </p>
     383<div class="personblurb">
     384<p id="idp546720">Nigel Medforth is an M.Sc. student at Simon Fraser University and the lead developer of icXML.
     385             He earned a Bachelor of Technology in Information Technology at Kwantlen Polytechnic University in 2009
     386             and was awarded the Dean’s Medal for Outstanding Achievement.</p>
     387<p id="idp547496">Nigel is currently researching ways to leverage both the Parabix framework and stream-processing models
     388             to further accelerate XML parsing within icXML.</p>
     392<div class="mast-box">
     393<p class="title"><a href="javascript:toggle('idp525576')" class="linkbox"><img class="toc-icon" src="plus.png" alt="expand" id="icon-idp525576"></a> <span onclick="javascript:toggle('idp525576');return true">Kenneth Herdy</span></p>
     394<div class="folder" id="folder-idp525576" style="display:none">
     395<h5 class="author-email"><code class="email">&lt;<a class="email" href="mailto:ksherdy@sfu.ca">ksherdy@sfu.ca</a>&gt;</code></h5>
     396<div class="affiliation">
     397<p class="jobtitle">Graduate Student, School of Computing Science</p>
     398<p class="orgname">Simon Fraser University </p>
     400<div class="personblurb">
     401<p id="idp526440"> Ken Herdy completed an Advanced Diploma of Technology in Geographical Information
     402               Systems at the British Columbia Institute of Technology in 2003 and earned a Bachelor
     403               of Science in Computing Science with a Certificate in Spatial Information Systems at
     404               Simon Fraser University in 2005.
     405                                                </p>
     406<p id="idp526976"> Ken is currently pursuing PhD studies in Computing Science at Simon Fraser
     407               University with industrial scholarship support from the Natural Sciences and
     408               Engineering Research Council of Canada, the Mathematics of Information Technology and
     409               Complex Systems NCE, and the BC Innovation Council. His research focus is an analysis
     410               of the principal techniques that may be used to improve XML processing performance in
     411               the context of the Geography Markup Language (GML).
     412                                                </p>
     416<div class="mast-box">
     417<p class="title"><a href="javascript:toggle('idp528720')" class="linkbox"><img class="toc-icon" src="plus.png" alt="expand" id="icon-idp528720"></a> <span onclick="javascript:toggle('idp528720');return true">Rob Cameron</span></p>
     418<div class="folder" id="folder-idp528720" style="display:none">
     419<h5 class="author-email"><code class="email">&lt;<a class="email" href="mailto:cameron@cs.sfu.ca">cameron@cs.sfu.ca</a>&gt;</code></h5>
     420<div class="affiliation">
     421<p class="jobtitle">Professor of Computing Science</p>
     422<p class="orgname">Simon Fraser University</p>
     424<div class="affiliation">
     425<p class="jobtitle">Chief Technology Officer</p>
     426<p class="orgname">International Characters, Inc.</p>
     428<div class="personblurb"><p id="idp1352">Dr. Rob Cameron is Professor of Computing Science and Associate Dean of
     429             Applied Sciences at Simon Fraser
     430                University.   His research interests include programming language
     431                and software system technology, with a specific focus on high performance
     432                text processing using SIMD and multicore parallelism.  He is the developer
     433                of the REX XML shallow parser as well as the parallel bit stream (Parabix)
     434                framework for SIMD text processing. 
     435              </p></div>
     439<div id="navbar"></div>
     440<div id="balisage-header" style="background-color: #6699CC">
     441<a class="quiet" href="http://www.balisage.net"><img style="float:right;border:none" alt="Balisage logo" height="130" src="http://balisage.net/Logo/BalisageSeries-logo.png"></a><h2 class="page-header">Balisage: The Markup Conference</h2>
     442<h1 class="page-header">Proceedings preview</h1>
     444<div id="main">
     445<div class="article">
     446<h2 class="article-title" id="idp543912"></h2>
     447<div class="section" id="idp531824">
    112448<h2 class="title" style="clear: both">Introduction</h2>
    113 <p id="idp15352"></p>
    114 <p id="idp15480"></p>
    115 <p id="idp15608"></p>
    116 <p id="idp15736"></p>
    117 </div>
    118 <div class="section" id="idp15928">
     449<p id="idp532144"></p>
     450<p id="idp532272"></p>
     451<p id="idp532400"></p>
     452<p id="idp532528"></p>
     454<div class="section" id="idp532720">
    119455<h2 class="title" style="clear: both">Background</h2>
    120 <div class="section" id="idp16248">
     456<div class="section" id="idp533040">
    121457<h3 class="title" style="clear: both">Xerces C++ Structure</h3>
    122 <p id="idp16568">
     458<p id="idp533360">
    123459The Xerces C++ parser
    138474parsing as well as a DOM tree-based parsing interface.
    140 <p id="idp18112">
     476<p id="idp534960">
    159495is required, involving all aspects of the parser.
    161 <p id="idp21496">
    166 </p>
    167 </div>
    168 <div class="section" id="idp22392">
     497<p id="idp538344">
     504<div class="section" id="idp539296">
    169505<h3 class="title" style="clear: both">The Parabix Framework</h3>
    170 <p id="idp22712">
     506<p id="idp539616">
    171507The Parabix (parallel bit stream) framework is a transformative approach to XML parsing
    172508(and other forms of text processing.) The key idea is to exploit the availability of wide
    189525multiple classes can share the classification cost.
    191 <p id="idp170240">
    193 </p>
    194 <p id="idp171696">
     527<p id="idp682296">
     530<p id="idp683752">
    195531Consider, for example, the XML source data stream shown in the first line of .
    196532The remaining lines of this figure show several parallel bit streams that are computed in Parabix-style
    209545attribute names and attribute values of tags. 
    211 <p id="idp175032">
     547<p id="idp687088">
    212548Two intuitions may help explain how the Parabix approach can lead
    213549to improved XML parsing performance. The first is that
    224560should provide substantial benefit.
    226 <p id="idp175992">
     562<p id="idp688048">
    227563Previous studies have shown that the Parabix approach improves many aspects of XML processing,
    228564including transcoding \cite{Cameron2008}, character classification and validation,
    237573they lacked the functionality required by full XML parsers.
    239 <p id="idp177744">
     575<p id="idp689800">
    240576Commercial XML processors support transcoding of multiple character sets and can parse and
    241577validate against multiple document vocabularies.
    246 <div class="section" id="idp178296">
     582<div class="section" id="idp690352">
    247583<h3 class="title" style="clear: both">Sequential vs. Parallel Paradigm</h3>
    248 <p id="idp178616">
     584<p id="idp690672">
    249585Xerces—like all traditional XML parsers—processes XML documents sequentially.
    250586Each character is examined to distinguish between the
    254590validation and content processing modes.
    256 <p id="idp179688">
     592<p id="idp691744">
    257593In other words, Xerces belongs to an equivalence class of applications termed FSM applications\footnote{
    258594  Herein FSM applications are considered software systems whose behaviour is defined by the inputs,
    261597Unfortunately, textual data tends to be unpredictable and any character could induce a state transition.
    263 <p id="idp180352">
     599<p id="idp692408">
    264600Parabix-style XML parsers utilize a concept of layered processing.
    265601A block of source text is transformed into a set of lexical bitstreams,
    274 <div class="section" id="idp181248">
     610<div class="section" id="idp693304">
    275611<h2 class="title" style="clear: both">Architecture</h2>
    276 <div class="section" id="idp181568">
     612<div class="section" id="idp693624">
    277613<h3 class="title" style="clear: both">Overview</h3>
    278 <p id="idp182016">
     614<p id="idp694072">
    279615icXML is more than an optimized version of Xerces. Many components were grouped, restructured and
    280616rearchitected with pipeline parallelism in mind.
    301637the user-defined DTD and schema grammar(s) before passing it to the end-user.
    303 <p id="idp185640">
    305 </p>
    306 <p id="idp185896">
     639<p id="idp697696">
     642<p id="idp697952">
    307643In icXML functions are grouped into logical components.
    308644As shown in Figure \ref{fig:icxml-arch}, two major categories exist: (1) the Parabix Subsystem and (2) the Markup Processor.
    323659From here, two data-independent branches exist: the Symbol Resolver and Content Preparation Unit.
    325 <p id="idp188856">
     661<p id="idp700912">
    326662A typical XML file contains few unique element and attribute names—but each of them will occur frequently.
    327663icXML stores these as distinct data structures, called symbols, each with their own global identifier (GID).
    329665the raw data to produce a sequence of GIDs, called the <span class="ital">symbol stream</span>.
    331 <p id="idp190496">
     667<p id="idp702552">
    332668The final components of the Parabix Subsystem are the <span class="ital">Content Preparation Unit</span> and <span class="ital">Content Stream Generator</span>.
    333669The former takes the (transposed) basis bitstreams and selectively filters them, according to the
    335671filtered streams into the tagged UTF-16 <span class="ital">content stream</span>, discussed in Section \ref{section:arch:contentstream}.
    337 <p id="idp191984">
     673<p id="idp704040">
    338674Combined, the symbol and content stream form icXML's compressed IR of the XML document.
    339675The <span class="ital">Markup Processor</span> parses the IR to validate and produce the sequential output for the end user.
    347683However, preprocessing associated with each symbol greatly reduces the work of this stage.
    349 <p id="idp194728">
    351 </p>
    352 </div>
    353 <div class="section" id="idp195064">
     685<p id="idp706600">
     689<div class="section" id="idp706920">
    354690<h3 class="title" style="clear: both">Character Set Adapters</h3>
    355 <p id="idp195712">
     691<p id="idp707568">
    356692In Xerces, all input is transcoded into UTF-16 to simplify the parsing costs of Xerces itself and
    357693provide the end-consumer with a single encoding format.
    362698transcoding imposes at least a cost of buffer copying.
    364 <p id="idp196496">
     700<p id="idp708352">
    365701In icXML, however,  the concept of Character Set Adapters (CSAs) is used to minimize transcoding costs.
    366702Given a specified input encoding, a CSA is responsible for checking that
    370706is performed using the parallel bitstream representation of the source input.
    372 <p id="idp197216">
     708<p id="idp709072">
    373709An important observation is that many character sets are an
    374710extension to the legacy 7-bit ASCII character set.  This includes the
    378714serves to compute lexical item streams for all such ASCII-based character sets.
    380 <p id="idp197848">
     716<p id="idp709704">
    381717A second observation is that—regardless of which character set is used—quite
    382718often all of the characters in a particular block of input will be within the ASCII range.
    387723UTF-16 form are each set to zero in this case.
    389 <p id="idp199400">
     725<p id="idp711256">
    390726A third observation is that repeated transcoding of the names of XML
    391727elements, attributes and so on can be avoided by using a look-up mechanism.
    398734additional cost.
    400 <p id="idp200184">
     736<p id="idp712040">
    401737The cost of individual character transcoding is avoided whenever a block of input is
    402738confined to the ASCII subset and for all but the first occurrence of any XML element or attribute name.
    414 <div class="section" id="idm9304">
     750<div class="section" id="idp509944">
    415751<h3 class="title" style="clear: both">Combined Parallel Filtering</h3>
    416 <p id="idm8952">
     752<p id="idp510296">
    417753As just mentioned, UTF-8 to UTF-16 transcoding involves marking
    418754all but the last bytes of multi-byte UTF-8 sequences as
    435771UTF-16 bitstreams\cite{Cameron2008}.
    437 <p id="idm6424">
    447 </p>
    448 <p id="idm3968">
     773<p id="idp512784">
     784<p id="idp515240">
    449785Rather than immediately paying the
    450786costs of deletion and transposition just for transcoding,
    468 <p id="idm1464">
     804<p id="idp517744">
    469805In essence, the deletion masks for transcoding and
    470806for line break normalization each represent a bitwise
    473809applied once.
    475 <p id="idp217752">
     811<p id="idp517936">
    476812A further application of combined filtering
    477813is the processing of XML character and entity
    492828that this is not true, it is addressed in post-processing.
    494 <p id="idp220384">
     830<p id="idp732200">
    495831The final step of combined filtering occurs during
    496832the process of reducing markup data to tag bytes
    510 <div class="section" id="idp221352">
     846<div class="section" id="idp733168">
    511847<h3 class="title" style="clear: both">Content Stream</h3>
    512 <p id="idp221696">
     848<p id="idp733512">
    513849A relatively unique concept for icXML is the use of a filtered content stream.
    514850Rather than parsing an XML document in its original format, the input is transformed
    522858through the parallel filtering algorithm, described in section \ref{sec:parfilter}.
    524 <p id="idp223408">
     860<p id="idp735224">
    525861Combined with the symbol stream, the parser traverses the content stream to effectively
    526862reconstruct the input document in its output form.
    537873directly jump to the end of every string without scanning for it.
    539 <p id="idp226064">
     875<p id="idp737880">
    540876Following <code class="code">'fee'</code> is a <code class="code">=</code>, which marks the existence of an attribute.
    541877Because all of the intra-element validation was performed in the Parabix Subsystem, this must be a legal attribute.
    553 <div class="section" id="idp228528">
     889<div class="section" id="idp740320">
    554890<h3 class="title" style="clear: both">Namespace Handling</h3>
    555 <p id="idp229176">
     891<p id="idp740968">
    556892In XML, namespaces prevent naming conflicts when multiple vocabularies are used together.
    557893It is especially important when a vocabulary has application-dependent meaning, such as when
    568904because the current vocabulary is determined by the namespace(s) that are in-scope.
    570 <p id="idp233096">
    572 </p>
    573 <p id="idp233368">
     906<p id="idp744856">
     909<p id="idp745128">
    574910In both Xerces and icXML, every URI has a one-to-one mapping to a URI ID.
    575911These persist for the lifetime of the application through the use of a global URI pool.
    581917(2) those that repeatedly modify the namespaces in predictable patterns.
    583 <p id="idp233560">
     919<p id="idp745320">
    584920For that reason, icXML contains an independent namespace stack and utilizes bit vectors to cheaply perform
    595931A namespace binding table, similar to Table \ref{tbl:namespace1}, provides the actual URI ID.
    597 <p id="idp237336">
    599 </p>
    600 <p id="idp237608">
    605 </p>
    606 <p id="idp239176">
     933<p id="idp749096">
     936<p id="idp749368">
     942<p id="idp750936">
    607943To ensure that scoping rules are adhered to,
    608944whenever a start tag is encountered, any modification to the currently visible namespaces is calculated and stored
    615 <div class="section" id="idp240160">
     951<div class="section" id="idp751920">
    616952<h3 class="title" style="clear: both">Error Handling</h3>
    617 <p id="idp240504">
     953<p id="idp752264">
    619955Xerces outputs error messages in two ways: through the programmer API and as thrown objects for fatal errors.
    624960each with its own system for detecting and producing error messages.
    626 <p id="idp241640">
     962<p id="idp753400">
    627963Within the Parabix Subsystem, all computations are performed in parallel, a block at a time.
    628964Errors are derived as artifacts of bitstream calculations, with a 1-bit marking the byte-position of an error within a block,
    657993column number.
    659 <p id="idp245496">
     995<p id="idp757256">
    660996The Markup Processor is a state-driven machine. As such, error detection within it is very similar to Xerces.
    661997However, reporting the correct line/column is a much more difficult problem.
    674 <div class="section" id="idp247200">
     1010<div class="section" id="idp758960">
    6751011<h2 class="title" style="clear: both">Multithreading with Pipeline Parallelism</h2>
    676 <p id="idp247568">
     1012<p id="idp759352">
    6771013As discussed in section \ref{background:xerces}, Xerces can be considered a FSM application.
    6781014These are "embarrassingly sequential"\cite{Asanovic:EECS-2006-183} and notoriously difficult to parallelize.
    6841020of modules.
    686 <p id="idp249104">
     1022<p id="idp760888">
    6871023The most straightforward division of work in icXML is to separate
    6881024the Parabix Subsystem and the Markup Processor, as distinct logical layers, into two separate stages.
    7011037and must wait for <code class="code">T<sub>2</sub></code> to finish reading the shared data before it can reuse the memory space.
    703 <p id="idp254152">
    705 </p>
    706 <p id="idp254424">
     1039<p id="idp765984">
     1042<p id="idp766256">
    7071043Overall, our design is intended to benefit a range of applications.
    7081044Conceptually, we consider two design points.
    7121048while the cost of XML parsing represents an overhead of 40%.
    714 <p id="idp254616">
     1050<p id="idp766448">
    7151051Our design is predicated on a goal of using the Parabix
    7161052framework to achieve a 50% to 100% improvement in the parsing engine itself.   
    7241060In this case, Amdahl's law predicts that we could expect up to a 3x speedup at best.
    726 <p id="idp256344">
     1062<p id="idp768176">
    7271063At the other extreme of our design range, we consider an application
    7281064in which core parsing cost is 40%.  Assuming the 2x speedup of
    7341070an overall speedup in processing time by 1.67x.
    736 <p id="idp257048">
     1072<p id="idp768880">
    7371073Although the structure of the Parabix Subsystem allows division of the work into
    7381074several pipeline stages and has been demonstrated to be effective
    748 <div class="section" id="idp257920">
     1084<div class="section" id="idp769752">
    7491085<h2 class="title" style="clear: both">Performance</h2>
    750 <p id="idp258256">
     1086<p id="idp770088">
    7511087We evaluate Xerces-C++ 3.1.1, icXML, and icXML-p against two benchmarking applications:
    7521088the Xerces C++ SAXCount sample application,
    75810948 MB L3 cache) running the 64-bit version of Ubuntu 12.04 (Linux).
    760 <p id="idp258920">
     1096<p id="idp770752">
    7611097We analyzed the execution profiles of each XML parser
    7621098using the performance counters found in the processor.
    7721108to collect per core hardware events.
    774 <div class="section" id="idp259800">
     1110<div class="section" id="idp771632">
    7751111<h3 class="title" style="clear: both">Xerces C++ SAXCount</h3>
    776 <p id="idp260144">
     1112<p id="idp771976">
    7771113Xerces comes with sample applications that demonstrate salient features of the parser.
    7781114SAXCount is the simplest such application:
    7801116and prints out the totals.
    782 <p id="idp260608">
    784 </p>
    785 <p id="idp260880">
     1118<p id="idp772440">
     1121<p id="idp772712">
    7861122Table \ref{XMLDocChars} shows the document characteristics of the XML input
    7871123files selected for the Xerces C++ SAXCount benchmark. The jaw.xml
    7901126XML documents and consist entirely of single byte encoded ASCII characters.
    792 <p id="idp261072">
     1128<p id="idp772904">
    7931129A key predictor of the overall parsing performance of an XML file is markup density\footnote{
    7941130  Markup Density: the ratio of markup bytes used to define the structure of the document vs. its file size.}.
    7991135of markup densities.
    801 <p id="idp262824">
     1137<p id="idp774656">
    8021138Figure \ref{perf_SAX} compares the performance of Xerces, icXML and pipelined icXML in terms of
    8031139CPU cycles per byte for the SAXCount application.
    8091145well balanced in this application.
    811 <p id="idp263592">
    813 </p>
    814 </div>
    815 <div class="section" id="idp264184">
     1147<p id="idp775424">
     1151<div class="section" id="idp776016">
    8161152<h3 class="title" style="clear: both">GML2SVG</h3>
    817 <p id="idp264504"></p>
    818 </div>
    819 </div>
    820 <div class="section" id="idp264760">
     1153<p id="idp776336"></p>
     1156<div class="section" id="idp776592">
    8211157<h2 class="title" style="clear: both">Conclusion and Future Work</h2>
    822 <p id="idp265112">
     1158<p id="idp776944">
    8231159This paper is the first case study documenting the significant
    8241160performance benefits that may be realized through the integration
    8361172feasibility of these techniques.
    838 <p id="idp266936">
     1174<p id="idp778768">
    8391175The further development of icXML to move beyond 2-stage
    8401176pipeline parallelism is ongoing, with realistic prospects for
    8441180library should offer substantial benefits. 
    846 <p id="idp267472">
     1182<p id="idp779304">
    8471183The example of XML parsing may be considered prototypical
    8481184of finite-state machine applications which have sometimes
    8531189indeed be helpful across a broad array of application types.
    855 <p id="idp268584">
     1191<p id="idp780416">
    8561192To overcome the software engineering challenges in applying
    8571193parallel bitstream technology to existing software systems,
    866 <div class="bibliography" id="idp269544">
     1202<div class="bibliography" id="idp781376">
    8671203<h2 class="title" style="clear:both">Bibliography</h2>
    8681204<p class="bibliomixed" id="XMLChip09">[Leventhal and Lemoine 2009] Leventhal, Michael and
     1261<div id="balisage-footer"><h3 style="font-family: serif; margin:0.25em; font-style: italic">Balisage Series on Markup Technologies</h3></div>
    9251265<div id="balisage-footer"><h3 style="font-family: serif; margin:0.25em">
    9261266<i>Balisage:</i> <small>The Markup Conference</small>
Note: See TracChangeset for help on using the changeset viewer.