Changeset 1335 for docs/HPCA2012


Ignore:
Timestamp:
Aug 21, 2011, 4:20:30 PM (8 years ago)
Author:
ashriram
Message:

Working on evaluation. Fixed Figure sizes

Location:
docs/HPCA2012
Files:
10 edited

Legend:

Unmodified
Added
Removed
  • docs/HPCA2012/04-methodology.tex

    r1302 r1335  
    6060\begin{table*}
    6161\begin{center}
     62{
     63\footnotesize
    6264\begin{tabular}{|l||l|l|l|l|l|}
    6365\hline
     
    6870Markup Density          & 0.07                  & 0.13                  & 0.57          & 0.76          & 0.87  \\ \hline
    6971\end{tabular}
     72}
    7073\end{center}
    7174\caption{XML Document Characteristics}
     
    7578\subsection{Workloads}\label{workloads}
    7679
    77 Markup density is defined
    78 as the ratio of the total markup contained within an XML file to the
    79 total XML document size.  This metric has substantial influence
    80 on the performance of traditional recursive descent XML parser implementations. 
    81 We use a mixture of document-oriented and data-oriented XML
    82 files in our study to provide workloads with a full spectrum of
    83 markup densities.
     80Markup density is defined as the ratio of the total markup contained
     81within an XML file to the total XML document size.  This metric has
     82substantial influence on the performance of traditional recursive
     83descent XML parser implementations.  We use a mixture of
     84document-oriented and data-oriented XML files in our study to provide
     85workloads with a full spectrum of markup densities.
    8486
    8587Table \ref{XMLDocChars} shows the document characteristics of the XML
    8688input files selected for this performance study.  The jawiki.xml and
    87 dewiki.xml XML files represent document-oriented XML inputs
    88 and contain the three-byte and four-byte UTF-8 sequence required for the UTF-8 encoding of Japanese and German characters respectively.  The remaining
    89 data files are data-oriented XML documents and consist entirely of single byte $7$-bit encoded ASCII characters. 
     89dewiki.xml XML files represent document-oriented XML inputs and
     90contain the three-byte and four-byte UTF-8 sequences required for the
     91UTF-8 encoding of Japanese and German characters respectively.  The
     92remaining data files are data-oriented XML documents and consist
     93entirely of single byte $7$-bit encoded ASCII characters.
    9094
    9195
     
    9599Intel. Table \ref{core2info} gives the hardware description of the
    96100Intel \CO{} machine.
    97 \begin{table}[h]
    98 \begin{center}
    99 \begin{tabular}{|l||l|}
     101
     102\begin{table*}[h]
     103\footnotesize
     104\begin{tabular}{|l||l|l|l|}
    100105\hline
    101 Processor & Intel Core2 Duo processor 6400  (2.13GHz) \\ \hline
    102 L1 Cache & 32KB I-Cache, 32KB D-Cache \\ \hline
    103 L2 Cache & 2MB \\ \hline
    104 Front Side Bus &  1066 MHz\\ \hline
    105 Memory  & 2GB \\ \hline
    106 Hard disk & 80GB SCSI \\ \hline
    107 Max TDP & 65W \\ \hline
     106Processor & Core2 Duo (2.13GHz) & i3-530 (2.93GHz) & Sandybridge (2.80GHz) \\ \hline
     107L1 D Cache & 32KB & 32KB & 32KB \\ \hline       
     108L2 Cache & Shared 2MB & 256KB/core & 256KB/core \\ \hline
     109L3 Cache & --- & 4MB  & 6MB \\ \hline
     110Bus or QPI &  1066Mhz Bus & 1333Mhz QPI & 1333Mhz QPI \\ \hline
     111Memory  & 2GB & 4GB & 6GB\\ \hline
     112Max TDP & 65W & 73W &  95W \\ \hline
    108113\end{tabular}
    109 \end{center}
    110 \caption{\CO{}}
    111 \label{core2info}
    112 \end{table}
     114\caption{Platform Hardware Specs}
     115\end{table*}
    113116
    114 \paragraph {Intel \CITHREE{}}
    115117Intel \CITHREE\ processor, code name Nehalem, produced by Intel. The
    116118intent of the selection of this processor is to serve as an example of a low end server
    117119processor. Table \ref{i3info} gives the hardware description of the
    118 Intel \CITHREE\ machine.
    119 
    120 \begin{table}[h]
    121 \begin{center}
    122 \begin{tabular}{|l||l|}
    123 \hline
    124 Processor & Intel i3-530 (2.93GHz) \\ \hline
    125 L1 Cache & 32KB I-Cache, 32K D-Cache \\ \hline 
    126 L2 Cache & 256KB \\ \hline
    127 L3 Cache & 4-MB \\ \hline
    128 Front Side Bus & 1333 MHz \\ \hline
    129 Memory  & 4GB \\ \hline
    130 Hard disk & SCSI 1TB \\ \hline
    131 Max TDP & 73W \\ \hline
    132 
    133 \end{tabular}
    134 \end{center}
    135 \caption{\CITHREE{}}
    136 \label{i3info}
    137 \end{table}
    138 
    139 \paragraph{Intel \CIFIVE{}}
    140 Intel \CIFIVE\  processor, code name \SB\, produced by
     120Intel \CITHREE\ machine. Intel \CIFIVE\  processor, code name \SB\, produced by
    141121Intel. Table \ref{sandybridgeinfo} gives the hardware description of the
    142122Intel \CITHREE\ machine.
    143 
    144 \begin{table}[h]
    145 \begin{center}
    146 \begin{tabular}{|l||l|}
    147 \hline
    148 Processor & Intel Sandybridge i5-2300 (2.80GHz) \\ \hline
    149 L1 Cache &  32KB I-Cache, 32K D-Cache \\ \hline
    150 L2 Cache &  4 X 256KB \\ \hline
    151 L3 Cache & 6-MB \\ \hline
    152 Front Side Bus &  1333 MHz\\ \hline
    153 Memory  &  6GB DDDR\\ \hline
    154 Hard disk &  SATA 1TB\\ \hline
    155 Max TDP & 95W \\ \hline
    156 
    157 \end{tabular}
    158 \end{center}
    159 \caption{\SB{}}
    160 \label{sandybridgeinfo}
    161 \end{table}
    162 
    163 \subsection{PMC Hardware Events}\label{events}
    164 
    165 Each of the hardware events selected relates to performance
    166 and energy features associated with
    167 one or more hardware units.   For example, total branch mispredictions
    168 relate to the branch predictor and branch target buffer capacity.
     123Each of the hardware events selected relates to performance and energy
     124features associated with one or more hardware units.  For example,
     125total branch mispredictions relate to the branch predictor and branch
     126target buffer capacity.
    169127
    170128The set of PMC events used in this study is as follows.
    171 \begin{itemize}
    172 \item Processor Cycles
    173 \item Branch Instructions
    174 \item Branch Mispredictions
    175 \item Integer Instructions
    176 \item SIMD Instructions
    177 \item Cache Misses
    178 \end{itemize}
     129Processor Cycles, Branch Instructions, Branch Mispredictions, Integer
     130Instructions, SIMD Instructions and Cache Misses.
    179131
    180132\subsection{Energy Measurement}
  • docs/HPCA2012/05-corei3.tex

    r1302 r1335  
    33%some of the numbers are roughly calculated, needs to be recalculated for final version
    44\subsection{Cache behavior}
    5 \CITHREE\ has a three level cache hierarchy.  The approximate miss penalty for each cache
    6 level is 4, 11, and 36 cycles respectively.  Figure
    7 \ref{corei3_L1DM}, Figure \ref{corei3_L2DM} and Figure
    8 \ref{corei3_L3TM} show the L1, L2 and L3 data cache misses for each of the parsers.  Although XML parsing is non memory intensive
    9 application, cache misses for the Expat and Xerces parsers represent a 0.5 cycle per XML byte cost whereas the performance of the Parabix parsers remains essentially
    10 unaffected by data cache misses.  Cache misses not only consume additional CPU cycles but increase application energy consumption.  L1, L2, and L3 cache misses consume
    11 approximately 8.3nJ, 19nJ, and 40nJ respectively. As such, given a 1GB XML file as input, Expat and Xerces would consume over 0.6J and 0.9J respectively due to cache misses alone.
     5\CITHREE\ has a three level cache hierarchy.  The approximate miss
     6penalty for each cache level is 4, 11, and 36 cycles respectively.
     7Figure \ref{corei3_L1DM}, Figure \ref{corei3_L2DM} and Figure
     8\ref{corei3_L3TM} show the L1, L2 and L3 data cache misses for each of
     9the parsers.  Although XML parsing is not a memory-intensive
     10application, cache misses for the Expat and Xerces parsers represent a
     110.5 cycle per XML byte cost whereas the performance of the Parabix
     12parsers remains essentially unaffected by data cache misses.  Cache
     13misses not only consume additional CPU cycles but increase application
     14energy consumption.  L1, L2, and L3 cache misses consume approximately
     158.3nJ, 19nJ, and 40nJ respectively. As such, given a 1GB XML file as
     16input, Expat and Xerces would consume over 0.6J and 0.9J respectively
     17due to cache misses alone.
    1218%With a 1GB input file, Expat would consume more than 0.6J and Xerces
    1319%would consume 0.9J on cache misses alone.
     
    1521
    1622\begin{figure}
    17 \begin{center}
    18 \includegraphics[width=0.5\textwidth]{plots/corei3_L1DM.pdf}
    19 \end{center}
    20 \caption{\CITHREE\ --- L1 Data Cache Misses (y-axis: Cache Misses per kB)}
     23\subfigure[L1 Misses]{
     24\includegraphics[width=0.32\textwidth]{plots/corei3_L1DM.pdf}
    2125\label{corei3_L1DM}
    22 \end{figure}
    23 
    24 \begin{figure}
    25 \begin{center}
    26 \includegraphics[width=0.5\textwidth]{plots/corei3_L2DM.pdf}
    27 \end{center}
    28 \caption{\CITHREE\ --- L2 Data Cache Misses (y-axis: Cache Misses per kB)}
     26}
     27\subfigure[L2 Misses]{
     28\includegraphics[width=0.32\textwidth]{plots/corei3_L2DM.pdf}
    2929\label{corei3_L2DM}
    30 \end{figure}
    31 
    32 \begin{figure}
    33 \begin{center}
    34 \includegraphics[width=0.5\textwidth]{plots/corei3_L3CM.pdf}
    35 \end{center}
    36 \caption{\CITHREE\ --- L3 Cache Misses (y-axis: Cache Misses per kB)}
    37 \label{corei3_L3TM}
     30}
     31\subfigure[L3 Misses]{
     32\includegraphics[width=0.32\textwidth]{plots/corei3_L3CM.pdf}
     33\label{corei3_L3TM}
     34}
     35\caption{Cache Misses per kB of input data.}
    3836\end{figure}
    3937
    4038\subsection{Branch Mispredictions}
    41 Despite improvements in branch prediction, branch misprediction penalties contribute
    42 significantly to XML parsing performance. On modern commodity processors the cost of a single branch
    43 misprediction is commonly cited as over 10 CPU cycles.  As shown in
    44 Figure \ref{corei3_BM}, the cost of branch mispredictions for the Expat parser
    45 can be over 7 cycles per XML byte---this cost alone is equal to the average total cost for Parabix2 to process each byte of XML.
     39Despite improvements in branch prediction, branch misprediction
     40penalties contribute significantly to XML parsing performance. On
     41modern commodity processors the cost of a single branch misprediction
     42is commonly cited as over 10 CPU cycles.  As shown in Figure
     43\ref{corei3_BM}, the cost of branch mispredictions for the Expat
     44parser can be over 7 cycles per XML byte---this cost alone is equal to
     45the average total cost for Parabix2 to process each byte of XML.
    4646
    47 In general, reducing the branch misprediction rate is difficult in text-based XML parsing
    48 applications. This is due in part to the variable length nature of the syntactic elements contained within XML documents, a data dependent characterstic,
    49 as well as the extensive set of syntax constraints imposed by the XML 1.0 specification. As such, traditional byte-at-a-time XML parsers generate a performance limiting
    50 number of branch mispredictions.  As shown in Figure \ref{corei3_BR}, Xerces averages up to 13
    51 branches per XML byte processed on high density markup.
     47In general, reducing the branch misprediction rate is difficult in
     48text-based XML parsing applications. This is due in part to the
     49variable length nature of the syntactic elements contained within XML
     50documents, a data dependent characteristic, as well as the extensive
     51set of syntax constraints imposed by the XML 1.0 specification. As
     52such, traditional byte-at-a-time XML parsers generate a performance
     53limiting number of branch mispredictions.  As shown in Figure
     54\ref{corei3_BR}, Xerces averages up to 13 branches per XML byte
     55processed on high density markup.
    5256
    53 The performance improvement of Parabix1 in terms of branch mispredictions results from the veritable elimination of conditional branch instructions in scanning. Leveraging the processor built-in {\em bit scan}
    54 operation together with parallel bit stream technology Parabix1 can scan up to 64 bytes of source XML with a single {\em bit scan} instruction. In comparison, a byte-at-a-time parser must
     57The performance improvement of Parabix1 in terms of branch
     58mispredictions results from the veritable elimination of conditional
     59branch instructions in scanning. Leveraging the processor built-in
     60{\em bit scan} operation together with parallel bit stream technology
     61Parabix1 can scan up to 64 bytes of source XML with a single {\em bit
     62  scan} instruction. In comparison, a byte-at-a-time parser must
    5563process a conditional branch instruction per XML byte scanned.
    5664
    57 As shown in Figure \ref{corei3_BR}, Parabix2 processing is almost branch free. Utilizing a new parallel scanning technique based on bit stream addition, Parabix2 exhibits minimal dependence on source XML markup density. Figure \ref{corei3_BR} displays this lack of data dependence via the constant number of branch
    58 mispredictions shown for each of the source XML files.
     65As shown in Figure \ref{corei3_BR}, Parabix2 processing is almost
     66branch free. Utilizing a new parallel scanning technique based on bit
     67stream addition, Parabix2 exhibits minimal dependence on source XML
     68markup density. Figure \ref{corei3_BR} displays this lack of data
     69dependence via the constant number of branch mispredictions shown for
     70each of the source XML files.
    5971% Parabix1 minimize the branches by using parallel bit
    6072% streams.  Parabix1 still have a few branches for each block of 128
     
    6476% dependency on the markup density of the workloads.
    6577
    66 \begin{figure}
    67 \begin{center}
    68 \includegraphics[width=0.5\textwidth]{plots/corei3_BR.pdf}
    69 \end{center}
    70 \caption{\CITHREE\ --- Branch Instructions (y-axis: Branches per kB)}
    71 \label{corei3_BR}
    72 \end{figure}
    7378
    7479\begin{figure}
    75 \begin{center}
    76 \includegraphics[width=0.5\textwidth]{plots/corei3_BM.pdf}
    77 \end{center}
    78 \caption{\CITHREE\ --- Branch Mispredictions (y-axis: Branch Mispredictions per kB)}
     80\subfigure[Branch Instructions]{
     81\includegraphics[width=0.45\textwidth]{plots/corei3_BR.pdf}
     82\label{corei3_BR}
     83}
     84\hfill
     85\subfigure[Branch Misses]{
     86\includegraphics[width=0.42\textwidth]{plots/corei3_BM.pdf}
    7987\label{corei3_BM}
     88}
     89\caption{Branch characteristics on the \CITHREE\ per kB of input data.}
    8090\end{figure}
    8191
    8292\subsection{SIMD Instructions vs. Total Instructions}
    8393
    84 Parabix achieves performance via parallel bit stream technology. In Parabix XML processing, parallel bit streams are
    85 both computed and predominately operated upon using the SIMD instructions of commodity processors.  The ratio of
    86 retired SIMD instructions to total instructions provides insight into\ the relative degree to which Parabix achieves parallelism
    87 over the byte-at-a-time approach.
     94Parabix achieves performance via parallel bit stream technology. In
     95Parabix XML processing, parallel bit streams are both computed and
     96predominately operated upon using the SIMD instructions of commodity
     97processors.  The ratio of retired SIMD instructions to total
     98instructions provides insight into the relative degree to which
     99Parabix achieves parallelism over the byte-at-a-time approach.
    88100
    89 Using the Intel Pin tool, we gather the dynamic instruction mix for each XML workload, and classify instructions as either vector (SIMD) or non-vector instructions.
    90 Figures \ref{corei3_INS_p1} and \ref{corei3_INS_p2} show the
    91 percentage of SIMD instructions for Parabix1 and Parabix2 respectively.
     101Using the Intel Pin tool, we gather the dynamic instruction mix for
     102each XML workload, and classify instructions as either vector (SIMD)
     103or non-vector instructions.  Figures \ref{corei3_INS_p1} and
     104\ref{corei3_INS_p2} show the percentage of SIMD instructions for
     105Parabix1 and Parabix2 respectively.
    92106%(Expat and Xerce do not use any SIMD instructions)
    93107For Parabix1, 18\% to 40\% of the executed instructions are SIMD instructions.  Using
     
    97111Parabix2 is much lower and thus the performance penalty incurred by
    98112increasing the markup density is reduced.
    99 %Expat and Xerce do not use any SIMD instructions and were not included in this portion of the study.
     113%Expat and Xerce do not use any SIMD instructions and were not
     114%included in this portion of the study.
    100115
    101 % Parabix gains its performance by using parallel bitstreams, which are
    102 % mostly generated and calculated by SIMD instructions.  The ratio of
    103 % executed SIMD instructions over total instructions indicates the
     116% Parabix gains its performance by using parallel bitstreams, which
     117% are mostly generated and calculated by SIMD instructions.  The ratio
     118% of executed SIMD instructions over total instructions indicates the
    104119% amount of parallel processing we were able to achieve.  We use Intel
    105 % pin, a dynamic binary instrumentation tool, to gather instruction mix.
    106 % Then we adds up all the vector instructions that have been executed.
    107 % Figure \ref{corei3_INS_p1} and Figure \ref{corei3_INS_p2} show the
    108 % percentage of SIMD instructions of Parabix1 and Parabix2 (Expat and
    109 % Xerce do not use any SIMD instructions).  For Parabix1, 18\% to 40\%
    110 % of the executed instructions consists of SIMD instructions.  By using
    111 % bistream addition for parallel scanning, Parabix2 uses 60\% to 80\%
    112 % SIMD instructions.  Although the ratio decrease as the markup density
    113 % increase for both Parabix1 and Parabix2, the decreasing rate of
    114 % Parabix2 is much lower and thus the performance degradation caused by
    115 % increasing markup density is smaller.
     120% pin, a dynamic binary instrumentation tool, to gather instruction
     121% mix.  Then we adds up all the vector instructions that have been
     122% executed.  Figure \ref{corei3_INS_p1} and Figure \ref{corei3_INS_p2}
     123% show the percentage of SIMD instructions of Parabix1 and Parabix2
     124% (Expat and Xerce do not use any SIMD instructions).  For Parabix1,
     125% 18\% to 40\% of the executed instructions consists of SIMD
    126% instructions.  By using bitstream addition for parallel scanning,
     127% Parabix2 uses 60\% to 80\% SIMD instructions.  Although the ratio
     128% decrease as the markup density increase for both Parabix1 and
     129% Parabix2, the decreasing rate of Parabix2 is much lower and thus the
     130% performance degradation caused by increasing markup density is
     131% smaller.
     132
     133\subsection{CPU Cycles}
     134
     135Figure \ref{corei3_TOT} shows overall parser performance evaluated in
     136terms of CPU cycles per kilobyte.  Parabix1 is 1.5 to 2.5 times faster
     137on document-oriented input and 2 to 3 times faster on data-oriented
     138input than the Expat and Xerces parsers respectively.  Parabix2 is 2.5
     139to 4 times faster on document-oriented input and 4.5 to 7 times faster
     140on data-oriented input.  Traditional parsers can be dramatically
     141slowed by dense markup, while Parabix2 is generally unaffected.  The
     142results presented are not entirely fair to the Xerces parser since it
     143first transcodes input from UTF-8 to UTF-16 before processing. In
     144Xerces, this transcoding requires several cycles per byte.  However,
     145transcoding using parallel bit streams is significantly faster and
    146requires less than a single cycle per byte~\cite{Cameron2008}.
    116147
    117148
    118149\begin{figure}
    119 \begin{center}
    120 \includegraphics[width=0.5\textwidth]{plots/corei3_INS_p1.pdf}
    121 \end{center}
    122 \caption{Parabix1 --- SIMD vs. Non-SIMD Instructions (y-axis: Percent SIMD Instructions}
    123 \label{corei3_INS_p1}
     150\subfigure[Performance : \# Cycles/kb]{
     151\includegraphics[width=0.5\textwidth]{plots/corei3_TOT.pdf}
     152\label{corei3_TOT}
     153}
     154\hfill
     155\subfigure[SIMD Instruction Breakdown. Y Axis :  \% SIMD Instruction/kb]{
     156\includegraphics[width=0.5\textwidth]{plots/corei3_INS_p2.pdf}
     157\label{corei3_INS_p2}
     158}
    124159\end{figure}
    125160
     161
     162\subsection{Power and Energy}
     163In response to the growing industry concerns on power consumption and
     164energy efficiency, chip producers work hard to not only improve
     165performance but also achieve high energy efficiency in processors
     166design. We study the power and energy consumption of Parabix in
     167comparison with Expat and Xerces on \CITHREE{}. The average power of
     168\CITHREE\ 530 is about 21 watts.  This Intel model has a good
     169reputation for power efficiency. Figure \ref{corei3_power} shows the
     170average power consumed by each parser.  Parabix2, dominated by SIMD
     171instructions, uses approximately 5\% additional power.
     172
     173
     174
     175
    126176\begin{figure}
    127 \begin{center}
    128 \includegraphics[width=0.5\textwidth]{plots/corei3_INS_p2.pdf}
    129 \end{center}
    130 \caption{Parabix2 --- SIMD vs. Non-SIMD Instructions (y-axis: Percent SIMD Instructions)}
    131 \label{corei3_INS_p2}
     177\subfigure[Avg. Power (Watts)]{
     178\includegraphics[width=0.4\textwidth]{plots/corei3_power.pdf}
     179\label{corei3_power}
     180}
     181\hfill
     182\subfigure[Energy Consumption ($\mu$J per kB)]{
     183\includegraphics[width=0.4\textwidth]{plots/corei3_energy.pdf}
     184\label{corei3_energy}
     185}
    132186\end{figure}
    133187
    134 \subsection{CPU Cycles}
     188As shown in Figure \ref{corei3_energy}, a comparison of energy
     189efficiency demonstrates a more interesting result. Although Parabix2
     190requires slightly more power (per instruction), the processing time of
     191Parabix2 is significantly lower, and therefore Parabix2 consumes
     192substantially less energy than the other parsers. Parabix2 consumes 50
     193to 75 nJ per byte while Expat and Xerces consume 80nJ to 320nJ and
     194140nJ to 370nJ per byte respectively.
    135195
    136 Figure \ref{corei3_TOT} shows overall parser performance
    137 evaluated in terms of CPU cycles per kilobyte.  Parabix1 is 1.5 to
    138 2.5 times faster on document-oriented input and 2 to 3 times faster on
    139 data-oriented input than the Expat and Xerces parsers respectively.  Parabix2 is 2.5
    140 to 4 times faster on document-oriented input and 4.5 to 7 times faster
    141 on data-oriented input.  Traditional parsers can be dramatically
    142 slowed by dense markup, while Parabix2 is generally unaffected.  The results presented are not entirely fair to the
    143 Xerces parser since it first transcodes input from UTF-8 to UTF-16 before processing. In Xerces, this transcoding requires
    144 several cycles per byte.  However, transcoding using parallel
    145 bit streams is significantly faster and requires less than a single cycle per byte.
    146 \cite{Cameron2008}.
    147 
    148 \begin{figure}
    149 \begin{center}
    150 \includegraphics[width=0.5\textwidth]{plots/corei3_TOT.pdf}
    151 \end{center}
    152 \caption{\CITHREE\ --- Performance (y-axis: CPU Cycles per kB)}
    153 \label{corei3_TOT}
    154 \end{figure}
    155 
    156 \subsection{Power and Energy}
    157 In response to the growing industry concerns on power consumption and energy efficiency,
    158 chip producers work hard to not only improve performance but
    159 also achieve high energy efficiency in processors design. We study the
    160 power and energy consumption of Parabix in comparison with Expat and
    161 Xerces on \CITHREE{}. The average power of \CITHREE\ 530 is about 21 watts.
    162 This Intel model has a good reputation for power efficiency. Figure \ref{corei3_power} shows the average power consumed by each parser.
    163 Parabix2, dominated by SIMD instructions, uses approximately 5\% additional power.     
    164 
    165 \begin{figure}
    166 \begin{center}
    167 \includegraphics[width=0.5\textwidth]{plots/corei3_power.pdf}
    168 \end{center}
    169 \caption{\CITHREE\ --- Average Power Consumption (watts)}
    170 \label{corei3_power}
    171 \end{figure}
    172 
    173 As shown in Figure \ref{corei3_energy}, a comparison of energy efficiency demonstrates a more interesting result. Although
    174 Parabix2 requires slightly more power (per instruction), the processing time of Parabix2 is significantly lower,
    175 and therefore Parabix2 consumes substantially less energy than the other parsers. Parabix2 consumes 50 to 75
    176 nJ per byte while Expat and Xerces consume 80nJ to 320nJ and 140nJ to 370nJ per byte respectively.
    177 
    178 \begin{figure}
    179 \begin{center}
    180 \includegraphics[width=0.5\textwidth]{plots/corei3_energy.pdf}
    181 \end{center}
    182 \caption{\CITHREE\ --- Energy Consumption ($\mu$J per kB)}
    183 \label{corei3_energy}
    184 \end{figure}
    185 
  • docs/HPCA2012/06-scalability.tex

    r1302 r1335  
    11\section{Scalability}
    22\subsection{Performance}
    3 Figure \ref{Scalability} (a) demonstrates the average XML well-formedness checking performance of Parabix2 for each of the workloads and as executed on each of the processor cores --- \CO\, \CITHREE\ and \SB{}.
    4 Processing time is shown in terms of bit stream based operations executed in `bit-space' and postprocessing operations executed in `byte-space'.
    5 In the Parabix2 parser, bit-space parallel bit stream parser operations consist primarily of SIMD instructions; byte-space operations
    6 consist of byte comparisons across arrays of values. Executing Parabix2 on \CITHREE{} over \CO\ results in an average performance improvement
    7 of 17\% in bit stream processing whereas migrating Parabix2 from \CITHREE{} to \SB{} results in a 22\% average performance gain. Bit space measurements are stable and consistent across each of the source inputs and cores. Postprocessing operations demonstrate data dependent variance. Performance gains from 18\% to 31\% performance are observered in migrating Parabix2 from \CO\ to \CITHREE{}; 0\% to 17\% performance
    8 from \CITHREE\ to \SB{}. For the purpose of comparison, Figure \ref{Scalability} (b) shows the performance of the Expat parser on each of the processor cores.
    9 A performance improvement of less than 5\% is observed when executing Expat on \CITHREE\ over \CO\
    10 and less than 10\% on \SB\ over \CITHREE{}.
     3Figure \ref{Scalability} (a) demonstrates the average XML
     4well-formedness checking performance of Parabix2 for each of the
     5workloads and as executed on each of the processor cores --- \CO\,
     6\CITHREE\ and \SB{}.  Processing time is shown in terms of bit stream
     7based operations executed in `bit-space' and postprocessing operations
     8executed in `byte-space'.  In the Parabix2 parser, bit-space parallel
     9bit stream parser operations consist primarily of SIMD instructions;
     10byte-space operations consist of byte comparisons across arrays of
     11values. Executing Parabix2 on \CITHREE{} over \CO\ results in an
     12average performance improvement of 17\% in bit stream processing
     13whereas migrating Parabix2 from \CITHREE{} to \SB{} results in a 22\%
     14average performance gain. Bit space measurements are stable and
     15consistent across each of the source inputs and cores. Postprocessing
     16operations demonstrate data dependent variance. Performance gains from
     1718\% to 31\% are observed in migrating Parabix2 from
     18\CO\ to \CITHREE{}; 0\% to 17\% performance from \CITHREE\ to
     19\SB{}. For the purpose of comparison, Figure \ref{Scalability} (b)
     20shows the performance of the Expat parser on each of the processor
     21cores.  A performance improvement of less than 5\% is observed when
     22executing Expat on \CITHREE\ over \CO\ and less than 10\% on \SB\ over
     23\CITHREE{}.
    1124
    12 Overall, Parabix2 scales better than Expat. Simply executing identical Parabix2 object code on \SB\ results in an overall performance improvement
    13 up to 26\%. Additional performance aspects of Parabix2 on \SB\ with AVX instructions are discussed in the following sections.
     25Overall, Parabix2 scales better than Expat. Simply executing identical
     26Parabix2 object code on \SB\ results in an overall performance
     27improvement up to 26\%. Additional performance aspects of Parabix2 on
     28\SB\ with AVX instructions are discussed in the following sections.
    1429
    1530\begin{figure}
     
    2843\subsection{Power and Energy}
    2944
    30 Figure \ref{power_Parabix2} shows the average power consumption of Parabix2 over each workload and as executed on each of the processor cores --- \CO{}, \CITHREE\ and \SB{}.
    31 Average power consumption on \CO{} is 32 watts. Execution on \CITHREE\ results in 30\% power saving over \CO{}.
    32 \SB\ saves 25\% of the power compared with \CITHREE\ and consumes only 15 watts.
     45Figure \ref{power_Parabix2} shows the average power consumption of
     46Parabix2 over each workload and as executed on each of the processor
     47cores --- \CO{}, \CITHREE\ and \SB{}.  Average power consumption on
     48\CO{} is 32 watts. Execution on \CITHREE\ results in 30\% power saving
     49over \CO{}.  \SB\ saves 25\% of the power compared with \CITHREE\ and
     50consumes only 15 watts.
    3351
    3452In XML parsing we observe energy consumption is dependent on processing time. That is, a reduction in processing time results in a directly proportional reduction in energy consumption.
    3553With newer processor cores comes improvements in application performance. As a result, Parabix2 executed on \SB\ consumes 72\% to 75\% less energy than Parabix2 on \CO{}.
    3654
    37 \begin{figure}
    38 \begin{center}
    39 \includegraphics[width=85mm]{plots/power_Parabix2.pdf}
    40 \end{center}
    41 \caption{Average Power of Parabix2 (watts)}
    42 \label{power_Parabix2}
    43 \end{figure}
     55
     56
    4457
    4558\begin{figure}
    46 \begin{center}
     59\centering
     60\subfigure[Avg. Power of Parabix on various hardware (Watts)]{
     61\includegraphics[width=85mm]{plots/power_Parabix2.pdf}
     62\label{power_Parabix2}
     63}
     64\hfill
     65\centering
     66\subfigure[Avg. Energy Consumption on various hardware (nJ per kB)]{
    4767\includegraphics[width=85mm]{plots/energy_Parabix2.pdf}
    48 \end{center}
    49 \caption{Energy consumption of Parabix2 (nJ/B)}
    5068\label{energy_Parabix2}
     69}
    5170\end{figure}
  • docs/HPCA2012/07-avx.tex

    r1302 r1335  
    4141\subsection{256-bit AVX Operations}
    4242
    43 With the introduction of 256-bit SIMD registers, and under ideal conditions, one would anticipate a corresponding
    44 50\% reduction in the SIMD instruction count of Parabix2 on AVX.  However, in the \SB\ AVX
    45 implementation, Intel has focused primarily on floating point operations
    46 as opposed to the integer based operations. 
    47 256-bit SIMD is available for loads, stores, bitwise logic and
    48 floating operations, whereas SIMD integer operations and shifts are
    49 only available in the 128-bit form.  Nevertheless, with loads, stores
    50 and bitwise logic comprising a major portion of the Parabix2
    51 SIMD instruction mix, a substantial reduction in instruction count
    52 and consequent performance improvement was anticipated but not achieved.
     43With the introduction of 256-bit SIMD registers, and under ideal
     44conditions, one would anticipate a corresponding 50\% reduction in the
     45SIMD instruction count of Parabix2 on AVX.  However, in the \SB\ AVX
     46implementation, Intel has focused primarily on floating point
     47operations as opposed to the integer based operations.  256-bit SIMD
     48is available for loads, stores, bitwise logic and floating point operations,
     49whereas SIMD integer operations and shifts are only available in the
     50128-bit form.  Nevertheless, with loads, stores and bitwise logic
     51comprising a major portion of the Parabix2 SIMD instruction mix, a
     52substantial reduction in instruction count and consequent performance
     53improvement was anticipated but not achieved.
    5354
    5455\subsection{Performance Results}
     
    7879256-bit AVX technology.
    7980
    80 Note that, in each workload, the number of non-SIMD instructions
    81 remains relatively constant with each workload.  As may be
    82 expected, however, the number of ``bitwise SIMD'' operations
    83 remains the same for both SSE and 128-bit while dropping
    84 dramatically when operating 256-bits at a time.   Ideally
    85 one one may expect up to a 50\% reduction in these instructions versus
    86 the 128-bit AVX.  The actual reduction measured was 32\%--39\%
    87 depending on workload.   Because some bitwise logic is needed
    88 in implementation of simulated 256-bit operations, the full 50\%
    89 reduction in bitwise logic was not achieved.
     81Note that, in each workload, the number of non-SIMD instructions
     82remains relatively constant with each workload.  As may be expected,
     83however, the number of ``bitwise SIMD'' operations remains the same
     84for both SSE and 128-bit while dropping dramatically when operating
     85256-bits at a time.  Ideally one may expect up to a 50\% reduction
     86in these instructions versus the 128-bit AVX.  The actual reduction
     87measured was 32\%--39\% depending on workload.  Because some bitwise
     88logic is needed in implementation of simulated 256-bit operations, the
     89full 50\% reduction in bitwise logic was not achieved.
    9090
    9191The ``other SIMD'' class shows a substantial 30\%-35\% reduction
     
    9898While the successive reductions in SIMD instruction counts are quite
    9999dramatic with the two AVX implementations of Parabix2, the performance
    100 benefits are another story.   As shown in Figure \ref{avx}, the
    101 benefits of the reduced SIMD instruction count are achieved only
    102 in the AVX 128-bit version.  In this case, the benefits of 3-operand
    103 form seem to fully translate to performance benefits. 
    104 Based on the reduction of overall Bitwise-SIMD instructions we expected a 11\% improvement in performance.
    105 Instead, perhaps bizzarely, the performance of Parabix2 in the 256-bit AVX implementation
    106 does not improve significantly and actually degrades for files with
    107 higher markup density (average 10\%). Dewiki.xml, on which bitwise-SIMD instructions reduced by 39\%,  saw a performance improvement of 8\%.
    108 We believe that this is primarily due to the intricacies of the first generation AVX implemention in \SB{},
    109 with significant latency in many of the 256-bit instructions in comparison to their
    110 128-bit counterparts. The 256-bit instructions also have different scheduling constraints that seem to reduce overall SIMD throughput.   If these latency issues can be addressed
    111 in future AVX implementations, further substantial performance and energy benefits could be realized in XML parsing with Parabix2.
     100benefits are another story.  As shown in Figure \ref{avx}, the
     101benefits of the reduced SIMD instruction count are achieved only in
     102the AVX 128-bit version.  In this case, the benefits of 3-operand form
     103seem to fully translate to performance benefits.  Based on the
     104reduction of overall Bitwise-SIMD instructions we expected an 11\%
     105improvement in performance.  Instead, perhaps bizarrely, the
     106performance of Parabix2 in the 256-bit AVX implementation does not
     107improve significantly and actually degrades for files with higher
     108markup density (average 10\%). Dewiki.xml, on which bitwise-SIMD
     109instructions were reduced by 39\%, saw a performance improvement of 8\%.
     110We believe that this is primarily due to the intricacies of the first
     111generation AVX implementation in \SB{}, with significant latency in many
     112of the 256-bit instructions in comparison to their 128-bit
     113counterparts. The 256-bit instructions also have different scheduling
     114constraints that seem to reduce overall SIMD throughput.  If these
     115latency issues can be addressed in future AVX implementations, further
     116substantial performance and energy benefits could be realized in XML
     117parsing with Parabix2.
  • docs/HPCA2012/08-arm.tex

    r1302 r1335  
    11\def\CORTEXA8{Cortex-A8}
    22
    3 \section {Parabix2 on GT-P1000M}
     3\section {Parabix on Mobile Platforms}
    44
    5 The Samsung Galaxy Tab GT-P1000M device houses a Samsung S5PC110 ARM \CORTEXA8{} single-core, dual-issue, superscalar microprocessor. In addition to the standard feature set found in such low-power 32-bit microprocessors, the S5PC110 includes the ARM NEON general-purpose SIMD engine. ARM NEON makes available a 128-bit SIMD instruction set similar in functionality to Intel SSE3 instruction set. In this section, we present our performance comparison of a NEON-based port of Parabix2 versus the Expat parser, and executed on the Samsung Galaxy Tab GT-P1000M hardware. Parabix1 and Xerces are excluded from this portion of our study due to the complexity of the cross-platform build process in porting native C/C++ applications to the Android platform.
     5The Samsung Galaxy Tab GT-P1000M device houses a Samsung S5PC110 ARM
     6\CORTEXA8{} 1GHz single-core, dual-issue, superscalar
     7microprocessor. It includes a 32kB L1 data cache and a 512kB L2 shared
     8cache. In addition to the standard feature set found in such low-power
     932-bit microprocessors, the S5PC110 includes the ARM NEON
     10general-purpose SIMD engine. ARM NEON makes available a 128-bit SIMD
     11instruction set similar in functionality to Intel SSE3 instruction
     12set. In this section, we present our performance comparison of a
     13NEON-based port of Parabix2 versus the Expat parser, and executed on
     14the Samsung Galaxy Tab GT-P1000M hardware.  Xerces is excluded from
     15this portion of our study due to the complexity of the cross-platform
     16build process in porting native C/C++ applications to the Android
     17platform.
    618
    7 \subsection{Platform Hardware}
    8 %\paragraph{GT-P1000M}
    9 Samsung Galaxy Tab GT-P1000M was produced by Samsung and incorporates the ARM
    10 \CORTEXA8{} microprocessor. Table \ref{arminfo} describes the Samsung Galaxy Tab GT-P1000M hardware.
    11 
    12 \begin{table}[h]
    13 \begin{center}
    14 \begin{tabular}{|l||l|}
    15 \hline
    16 Processor & ARM \CORTEXA8{} (1GHz) \\ \hline
    17 L1 Cache & 32kB I-Cache, 32kB D-Cache \\ \hline
    18 L2 Cache & 512kB \\ \hline
    19 Flash & 16GB \\ \hline
    20 \end{tabular}
    21 \end{center}
    22 \caption{GT-P1000M}
    23 \label{arminfo}
    24 \end{table}
    2519
    2620\subsection{Performance Results}
    2721
    2822\begin{figure}
    29 \begin{center}
     23\subfigure[ARM Neon Performance]{
    3024\includegraphics[width=0.5\textwidth]{plots/arm_TOT.pdf}
    31 \end{center}
    32 \caption{Parabix2 Performance on GT-P1000M (y-axis: CPU Cycles per kB)}
    3325\label{arm_processing_time}
     26}
     27\hfill
     28\subfigure[Performance ARM Neon vs Core i3 SSE.]{
     29\includegraphics[width=0.5\textwidth]{plots/RelativePerformanceARMvsCoreI3.pdf}
     30\label{relative_performance_arm_vs_i3}
     31}
    3432\end{figure}
    3533
    36 Migration of Parabix2 to the Android platform began with the retargetting of a subset of the Parabix2 IDISA SIMD library for ARM NEON.
    37 This library code was cross-compiled for Android using the Android NDK. The Android NDK is a companion tool to the Android SDK
    38 that allows developers to build performance-critical portions of applications in native code. The majority of the Parabix2 SIMD functionality ported directly. However, for a small subset of
    39 the SIMD functions of Parabix2 NEON equivalents did not exist. In such cases we simply simulated logical equivalencies using the available the instruction set.
    40 
    41 A comparison of Figure \ref{arm_processing_time} and Figure \ref{corei3_TOT} demonstrates that the performance of
    42 both Parabix2 and Expat degrades substantially on \CORTEXA8{}.  This result was expected given the combarably performance limited \CORTEXA8{} hardware architecture.  Surprisingly on \CORTEXA8{}  Expat outperforms Parabix2 on each of the lower markup density workloads, dew.xml and jaw.xm. On the remaining higher-density workloads, Parabix2 performs only moderately better than Expat.
    43 The higher latency of the NEON instructions on \CORTEXA8{} is the likely contributor to this loss in performance. A more interesting aspect of this result is demonstrated in a comparison of Figure
    44 \ref{relative_performance_arm_vs_i3} and Figure \ref{relative_performance_arm_vs_i3}. These figure demonstrate that the relative performance of each parser degrades in a relatively constant manner.
    45 That is, compared to the \CITHREE{}, on the GT-P1000M, Parabix2 and Expat operate at approximately 17.2\% and
    46 55.7\% efficiency respectively. Figure \ref{relative_performance_arm_vs_i3} shows that the baseline cost of Parabix2 operations implemented using the NEON instruction set---
    47 and thereby the baseline cost of Parabix2---is substantially higher on the \CORTEXA8{} processor.
    48 Given that Parabix2 was not designed with the limitations of the \CORTEXA8{} in mind, in the future a
    49 careful analysis of the cost of each instruction provided in the ARMv7 ISA may allow us to better utilize
    50 the hardware resources provided. In particular, future performance enhancement to ARM NEON could result in substantial overall improvement in Parabix2 execution time.
    51 
    52 \begin{figure}
    53 \begin{center}
    54 \includegraphics[width=0.5\textwidth]{plots/RelativePerformanceARMvsCoreI3.pdf}
    55 \end{center}
    56 \caption{Relative Slow Down of Parbix2 and Expat on GT-P1000M vs. \CITHREE{} }
    57 \label{relative_performance_arm_vs_i3}
    58 \end{figure}
     34Migration of Parabix2 to the Android platform began with the
     35retargeting of a subset of the Parabix2 IDISA SIMD library for ARM
     36NEON.  This library code was cross-compiled for Android using the
     37Android NDK. The Android NDK is a companion tool to the Android SDK
     38that allows developers to build performance-critical portions of
     39applications in native code. The majority of the Parabix2 SIMD
     40functionality ported directly. However, for a small subset of the SIMD
     41functions of Parabix2 NEON equivalents did not exist. In such cases we
     42simply simulated logical equivalencies using the available
     43instruction set.
    5944
    6045
     46
     47A comparison of Figure \ref{arm_processing_time} and Figure
     48\ref{corei3_TOT} demonstrates that the performance of both Parabix2
     49and Expat degrades substantially on \CORTEXA8{}.  This result was
     50expected given the comparably performance-limited \CORTEXA8{} hardware
     51architecture.  Surprisingly on \CORTEXA8{} Expat outperforms Parabix2
     52on each of the lower markup density workloads, dew.xml and jaw.xml. On
     53the remaining higher-density workloads, Parabix2 performs only
     54moderately better than Expat.  The higher latency of the NEON
     55instructions on \CORTEXA8{} is the likely contributor to this loss in
     56performance. A more interesting aspect of this result is demonstrated
     57in a comparison of Figure \ref{arm_processing_time} and
     58Figure \ref{relative_performance_arm_vs_i3}. These figures demonstrate
     59that the relative performance of each parser degrades in a relatively
     60constant manner.  That is, compared to the \CITHREE{}, on the
     61GT-P1000M, Parabix2 and Expat operate at approximately 17.2\% and
     6255.7\% efficiency respectively. Figure
     63\ref{relative_performance_arm_vs_i3} shows that the baseline cost of
     64Parabix2 operations implemented using the NEON instruction set---and
     65thereby the baseline cost of Parabix2---is substantially higher on the
     66\CORTEXA8{} processor.  Given that Parabix2 was not designed with the
     67limitations of the \CORTEXA8{} in mind, in the future a careful
     68analysis of the cost of each instruction provided in the ARMv7 ISA may
     69allow us to better utilize the hardware resources provided. In
     70particular, future performance enhancement to ARM NEON could result in
     71substantial overall improvement in Parabix2 execution time.
     72
     73
  • docs/HPCA2012/09-pipeline.tex

    r1331 r1335  
    3131
    3232\begin{table*}[t]
     33{
     34\centering
     35\footnotesize
    3336\begin{center}
    34 \begin{tabular}{|c|c|c|c|c|c|c|c|c|c|c|c|}
     37\begin{tabular}{|c|@{~}c@{~}|c|@{~}c@{~}|c@{~}|@{~}c@{~}|c|@{~}c@{~}|c|@{~}c@{~}|c|@{~}c@{~}|}
    3538\hline
    3639       & & \multicolumn{10}{|c|}{Data Structures}\\ \hline
     
    5053\end{center}
    5154\caption{Relationship between Each Pass and Data Structures}
    52 \label{pass_structure}
     55\label{pass_structure}
     56}
    5357\end{table*}
    5458
     
    5761The multi-threaded Parabix is more than two times faster and runs at 2.7 cycles per input byte on the \SB{} machine.
    5862
    59 \begin{figure}
    60 \begin{center}
    61 \includegraphics[width=0.5\textwidth]{plots/performance.pdf}
    62 \end{center}
    63 \caption{Processing Time (y axis: CPU cycles per byte)}
    64 \label{multithread_perf}
    65 \end{figure}
    6663
    6764Figure \ref{power} shows the average power consumed by the multi-threaded Parabix in comparison with the single-threaded version.
     
    7168
    7269\begin{figure}
    73 \begin{center}
    74 \includegraphics[width=0.5\textwidth]{plots/power.pdf}
    75 \end{center}
    76 \caption{Average Power (watts)}
     70\subfigure[Performance (Cycles / Byte)]{
     71\includegraphics[width=0.32\textwidth]{plots/performance.pdf}
     72\label{performance}
     73}
     74\subfigure[Avg. Power Consumption]{
     75\includegraphics[width=0.32\textwidth]{plots/power.pdf}
    7776\label{power}
    78 \end{figure}
    79 \begin{figure}
    80 \begin{center}
    81 \includegraphics[width=0.5\textwidth]{plots/energy.pdf}
    82 \end{center}
    83 \caption{Energy Consumption (nJ per byte)}
     77}
     78\subfigure[Avg. Energy Consumption (nJ / Byte)]{
     79  \includegraphics[width=0.32\textwidth]{plots/energy.pdf}
    8480\label{energy}
     81}
     82\caption{Multithreaded Parabix}
     83\label{multithread_perf}
    8584\end{figure}
    8685
  • docs/HPCA2012/latex/iccv.sty

    r1327 r1335  
    8989   \newpage
    9090   \null
    91    \vskip .375in
     91 %  \vskip .375in
    9292   \begin{center}
    9393      {\Large \bf \@title \par}
  • docs/HPCA2012/main.tex

    r1331 r1335  
    185185\input{10-conclusions.tex}
    186186% tighten spacing:
     187
     188
    187189\let\oldthebibliography\thebibliography
    188190\def\thebibliography#1{\oldthebibliography{#1}\parsep-5pt\itemsep0pt}
    189 % \vspace{-\baselineskip}
    190191{
    191192\setstretch{1}
    192193 \footnotesize
    193 % \scriptsize
    194194\bibliographystyle{abbrv}
    195195 \bibliography{reference}
  • docs/HPCA2012/preamble-submit.tex

    r1326 r1335  
    3333
    3434
     35
    3536% \iccvfinalcopy % *** Uncomment this line for the final submission
    3637
     
    5960\marginparsep 0in
    6061\marginparwidth 0in
    61 \topmargin -0.4in
     62\topmargin -0.2in
    6263%\headheight 0in
    6364%\headsep 0in
    6465%\footskip 0.3in
    65 \textheight 9.5in
     66\textheight 9.2in
    6667%\textfloatsep 0.1in
    6768%\floatsep 0.1in
Note: See TracChangeset for help on using the changeset viewer.