Changeset 255 for docs/ASPLOS09


Ignore:
Timestamp:
Jan 4, 2009, 3:54:20 PM (11 years ago)
Author:
cameron
Message:

References, cleanups

Location:
docs/ASPLOS09
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • docs/ASPLOS09/asplos094-cameron.bib

    r249 r255  
    11
    22@techreport{u8u16,
    3 author = "{Cameron, Robert D.}",
    4 title = "{{\tt u8u16} -- A High-Speed UTF-8 to UTF-16 Transcoder Using
    5 Parallel Bit Streams}",
     3author = {Cameron, Robert D.},
     4title = "{\tt u8u16} -- A High-Speed {UTF-8 to UTF-16} Transcoder Using
     5Parallel Bit Streams",
    66type={Technical Report},
    77number={TR 2007-18},
     
    1212
    1313@inproceedings{PPoPP08,
    14 author = "{Cameron, Robert D.}",
    15 title = "{A Case Study in SIMD Text Processing with Parallel Bit Streams}",
     14author = {Cameron, Robert D.},
     15title = "A Case Study in {SIMD} Text Processing with Parallel Bit Streams",
    1616booktitle = "{ACM Symposium on Principles and Practice of Parallel Programming (PPoPP)}",
    1717mon=feb, year=2008, address="{Salt Lake City, Utah}"
     
    1919
    2020@inproceedings{CASCON08,
    21 author = "{Cameron, Robert D., Herdy, Kenneth S. and Lin, Dan}",
    22 title =  "{High Performance XML Parsing Using Parallel Bit Stream Technology}",
     21author = {Cameron, Robert D. and Herdy, Kenneth S. and Lin, Dan},
     22title =  "High Performance {XML} Parsing Using Parallel Bit Stream Technology",
    2323booktitle = "{CASCON '08: Proceedings of the 2008 conference of the Centre for Advanced Studies on Collaborative research}",
    2424mon=oct, year = 2008, address="{Toronto, Ontario , Canada}"
     
    2626
    2727@inproceedings{Herdy,
    28 author = "{Herdy, Kenneth S., Burggraf, David S., and Cameron, Robert D.}",
    29 title =  "{High Performance GML to SVG Transformation for the Visual Presentation of Geographic Data in Web-Based Mapping Systems}",
     28author = {Herdy, Kenneth S. and Burggraf, David S. and Cameron, Robert D.},
     29title =  "High Performance {GML to SVG} Transformation for the Visual Presentation of Geographic Data in Web-Based Mapping Systems",
    3030booktitle = "{Proceedings of SVG Open 2008}",
    3131mon=aug, year = 2008, address="{Nuremburg, Germany}"
     
    3333
    3434@inproceedings{Green,
    35 author = "{Green, James R., Mahmoud, and Dumontier, Michel}",
    36 title =  "{Towards Real Time Protein Identification using the Cell BE}",
    37 booktitle = "{Workshop on Cell BE and Heterogeneous Multicore Systems: Architectures and Applications at
    38 CASCON '08}",
     35author = {Green, James R. and Mahmoud, Hanan and Dumontier, Michel},
     36title =  "Towards Real Time Protein Identification using the {Cell BE}",
     37booktitle = "Workshop on Cell BE and Heterogeneous Multicore Systems: Architectures and Applications at
     38CASCON '08",
    3939mon=oct, year = 2008, address="{Toronto, Ontario , Canada}"
    4040}
     
    4444
    4545@inproceedings{Nuzman,
    46 author = "{Nuzman, Dorit, Rosen, Ira and Zaks, Ayal}",
    47 title = "{Auto-Vectorization of Interleaved Data for SIMD}",
     46author = {Nuzman, Dorit and Rosen, Ira and Zaks, Ayal},
     47title = {Auto-Vectorization of Interleaved Data for {SIMD}},
    4848booktitle = "{PLDI '06: Proceedings of the 2006 ACM SIGPLAN conference on Programming language design and implementation}",
    4949address = "{Ottawa, Ontaria, Canada}", year = 2006, mon=jun
    5050}
     51
     52
     53@book{HackersDelight,
     54author = {Henry S. Warren},     
     55title = {Hacker's Delight},
     56publisher = "Addison-Wesley",
     57year = 2002
     58}
     59
     60
     61@article{Raman08,
     62author = {R. Raman and D.S. Wise},
     63title = "Converting to and from Dilated Integers",
     64journal = "{IEEE Transactions on Computers}", pages = "567--573",
     65mon=apr, year=2008, volume="57", number="4"
     66}
     67
     68@inproceedings{Stocco95,
     69author = {Leo Stocco and G\"{u}nther Schrack},
     70title = "Integer Dilation and Contraction for Quadtrees and Octrees",
     71booktitle = "{Proceedings of the IEEE Pacific Rim Conference on Communications, Computers, and Signal Processing}", pages = "426--428",
     72mon=May, year=1995, address="{Victoria, B.C.}"
     73}
     74
     75@inproceedings{DBLP:conf/cans/RebeiroSD06,
     76  author    = {Chester Rebeiro and
     77               A. David Selvakumar and
     78               A. S. L. Devi},
     79  title     = {Bitslice Implementation of AES},
     80  booktitle = {CANS},
     81  year      = {2006},
     82  pages     = {203-212},
     83  ee        = {http://dx.doi.org/10.1007/11935070_14},
     84  crossref  = {DBLP:conf/cans/2006},
     85  bibsource = {DBLP, http://dblp.uni-trier.de}
     86}
     87
     88@proceedings{DBLP:conf/cans/2006,
     89  editor    = {David Pointcheval and
     90               Yi Mu and
     91               Kefei Chen},
     92  title     = {Cryptology and Network Security, 5th International Conference,
     93               CANS 2006, Suzhou, China, December 8-10, 2006, Proceedings},
     94  booktitle = {CANS},
     95  publisher = {Springer},
     96  series    = {Lecture Notes in Computer Science},
     97  volume    = {4301},
     98  year      = {2006},
     99  isbn      = {3-540-49462-6},
     100  bibsource = {DBLP, http://dblp.uni-trier.de}
     101}
     102
     103@inproceedings{bitsliceAES,
     104author={Chester Rebeiro and A. David Selvakumar and A. S. L. Devi},
     105title= "Bitslice Implementation of {AES}",
     106
     107pages="203--212"}
     108
     109
     110
     111@misc{v4pf1a,
     112author={Donald E. Knuth},
     113title="The Art of Computer Programming Volume 4 Pre-Fascicle 1A: Bitwise Tricks and Techniques",
     114howpublished="Draft of 22 December 2008, Stanford University"
     115}
     116
     117
    51118
    52119@techreport{Landscape,
     
    60127}
    61128
    62 @inproceedings{ApparaoBhat04,
    63 author = {Apparao, P. and Bhat, M.},
    64 title = "{A Detailed Look at the Characteristics of XML Parsing}",
    65 booktitle = {Proceedings of the 1st Workshop on Building Block Engine Architectures
    66 for Computers and Networks (BEACON '04)},
    67 address = {Boston, MA},
    68 month=oct, year=2004}
    69 
    70 @inproceedings{XMLScreamer,
    71 author = "{Kostoulas, M. G., Matsa, M., Mendelsohn, N., Perkins, E., Heifets, A., and Mercaldi, M.}",
    72 title = "{XML Screamer: An Integrated Approach to High Performance XML Parsing, Validation and Deserialization}",
    73 booktitle = {Proceedings of the 15th International Conference on World Wide Web (WWW '06)},
    74 pages = {93--102}, year=2006
    75 }
    76 
    77 @inproceedings{NicolaJohn03,
    78 author = "{Nicola, Matthias, and John, Jasmi}",
    79 title = "{XML Parsing: A Threat to Database Performance}",
    80 booktitle = {Proceedings of the Twelfth International Conference on Information and
    81 Knowledge Management},
    82 address = {New Orleans, Louisiana}, year=2003}
    83 
    84 @inproceedings{Perkins05,
    85 author = "{Perkins, E., Kostoulas, M., Heifets, A., Matsa, M., and Mendelsohn, N.}",
    86 title = "{Performance Analysis of XML APIs}",
    87 booktitle = {XML 2005}, address = {Atlanta, Georgia}, month=nov, year=2005}
    88 
    89 @inproceedings{Psaila06,
    90 author = "{Psaila, Giuseppe}",
    91 title = "{On the Problem of Coupling Java Algorithms and XML Parsers}",
    92 booktitle = {17th International Conference on Database and Expert Systems Applications (DEXA'06)},
    93 mon=sep, year=2006, pages = {487--491}
    94 }
    95 
    96 
    97 @inproceedings{ZhaoBhuyan06,
    98 author = "{Zhao, Li \and Laxmi Bhuyan}",
    99 title = "{Performance Evaluation and Acceleration for XML Data Parsing}",
    100 booktitle = {9th Workshop on Computer Architecture Evaluation using Commercial Workloads (CAECW)},
    101 mon=feb, year=2006, address="{Austin, Texas}"
    102 }
    103 
    104 @inproceedings{DuCharme04,
    105 author = "{DuCharme, Bob}",
    106 title = "Documents vs. Data, Schemas vs. Schemas",
    107 booktitle = "{XML 2004}",
    108 mon=nov, year=2004, address="{Washington D.C.}"
    109 }
    110 
    111 @inbook{GML04,
    112   author = "{Ron Lake, David Burggraf, Milan Trninic, and Laurie Rae}",
    113   title = "{Geography Mark-Up Language: Foundation for the Geo-Web}",
    114   publisher = "John Wiley \& Sons, Inc.",
    115   year = 2004,
    116   pages = "3--4"
    117 }
    118 
    119 @misc{expat,
    120   author = "{James Clark}",
    121   title  = "{The Expat XML Parser}",
    122   howpublished = "{http://expat.sourceforge.net/}"
    123 }
    124 
    125 @misc{xerces,
    126   title = "{Xerces C++ Parser}",
    127   howpublished = "{http://xerces.apache.org/xerces-c/}"
    128 }
    129 
    130 @misc{papi,
    131   title = "{Performance Application Programming Interface}",
    132   howpublished = "{http://icl.cs.utk.edu/papi/}"
    133 }
    134 
    135 @misc{perfctr,
    136   author = "{Michael Pettersson}",     
    137   title = "{Linux x86 Performance-Monitoring Counters Driver}",
    138   howpublished = "{http://user.it.uu.se/~mikpe/linux/perfctr}"
    139 }
    140 
    141 @manual{IntelArchOptRefMan,
    142   title = "{IA-32 Intel Architecture Optimization Reference Manual}",
    143   organization = "{Intel Corporation}",
    144   year = 2005,
    145 }
    146 
    147 @book{HackersDelight,
    148 author = "{Henry S. Warren}",   
    149 title = "{Hacker's Delight}",
    150 publisher = "Addison-Wesley",
    151 year = 2002
    152 }
    153 
    154 
    155 @article{Raman08,
    156 author = "{R Raman and DS Wise}",
    157 title = "Converting to and from Dilated Integers",
    158 journal = "{IEEE Transactions on Computers}", pages = "567--573",
    159 mon=apr, year=2008, volume="57", number="4"
    160 }
    161 
    162 @inproceedings{Stocco95,
    163 author = "{Leo Stocco, and Gunther Schrack}",
    164 title = "Integer Dilation and Contraction for Quadtrees and Octrees",
    165 booktitle = "{Proceedings of the IEEE Pacific Rim Conference on Communications, Computers, and Signal Processing}", pages = "426--428",
    166 mon=May, year=1995, address="{Victoria, B.C.}"
    167 }
  • docs/ASPLOS09/asplos094-cameron.tex

    r254 r255  
    3232\authorinfo{Robert D. Cameron \and Dan Lin}
    3333           {School of Computing Science, Simon Fraser University}
    34            {\{cameron, lindanl\}@cs.sfu.ca}
     34           {\tt \{cameron, lindanl\}@cs.sfu.ca}
    3535
    3636
     
    159159design.  First we introduce a simple model and notation for
    160160SWAR operations in general and then present the four key
    161 features of an idealized architecture in support of parallel
    162 bit stream programming.
     161features of IDISA.
    163162
    164163
     
    190189the simultaneous calculation of individual field values in
    191190accord with the following equation.
    192 \[ r_i = F_n(a_i, b_i) \]
     191\begin{eqnarray}
     192 r_i &=& F_n(a_i, b_i)
     193\end{eqnarray}
    193194
    194195For example, addition(\verb:add:), subtraction (\verb:sub:) and shift left
     
    200201\mbox{sub}_n(a,b) & = & (a-b+2^n) \bmod 2^n \\
    201202\mbox{sll}_n(a,b) & = & a \times 2^{b \bmod n} \bmod 2^n
    202 \end{eqnarray}Throughout this paper, we focus on SWAR implementations using a {\em
    203 three-register model} involving two input registers
    204 and one output register, each of size $N=2^K$ bits.
    205 %\doublespace
    206 
    207 The SSE and Altivec instruction sets support
    208 each of the addition and subtraction operations in SWAR form
    209 for 8, 16 and 32-bit fields, while the SSE set also includes
    210 the 64-bit forms.  For example, \verb#simd<8>::add# in our
    211 nomenclature is provided by the operation \verb:paddb:
    212 on SSE and the operation \verb:vaddubm: on Altivec.
    213 The equivalents of \verb#simd<8>::sll#, \verb#simd<16>::sll#, and
    214 \verb#simd<32>::sll# are avilable on Altivec; the SSE facilities are
    215 more constrained, requiring that all field shifts by the same amount.
     203\end{eqnarray}
     204
     205The Altivec instruction set includes each of these operations
     206for 8, 16 and 32-bit fields directly following the three-register
     207model.   The SSE set uses a two-register model with the result
     208being copied back to one of the input operands.   However, the
     209C language intrinsics commonly used to access these instructions
     210reflect the three-register model.  The SSE set extends these
     211operations to include operations on 64-bit fields, but
     212constrains the shift instructions, requiring that all field shifts by the same amount.
    216213
    217214Given these definitions and notation, we now present
     
    230227
    231228For the purpose of direct and efficient support for inductive
    232 doubling algorithms on bit streams, the provision of
     229doubling algorithms, the provision of
    233230a core set of operations at field widths of 2 and 4 as
    234231well as the more traditional field widths of 8, 16 and 32
    235 is critical for elegant and efficient expression of the
    236 algorithms.  In essence, inductive doubling algorithms
     232is key.  In essence, inductive doubling algorithms
    237233work by establishing some base property at either single
    238234or 2-bit fields.  Each iteration of the algorithm then
     
    305301operations.  Thus packing with conversion by masking off all
    306302but the low $n/2$ bits of each field may be
    307 be performed using the operation \verb#simd<n>::pack<l,l>#
    308 
     303be performed using the operation \verb#simd<n>::pack<l,l>#.
    309304
    310305The final facility of the inductive doubling architecture is
    311 a set of merging operations \verb#simd<n>::mergeh# and
    312 \verb#simd<n>::mergel# that produce $2n$-bit fields
     306a set of merging operations that produce $2n$-bit fields
    313307by concatenating corresponding $n$-bit fields from the
    314 operand registers.   The respective
     308operand registers.   The
    315309operations \verb#r=simd<n>::mergeh(a,b)# and
    316310\verb#s=simd<n>::mergel(a,b)#
     
    364358a constant load operation \verb#simd::constant<n>(c)#
    365359loads the constant value $c$ into each $n$ bit field.
     360The pack and merge facilities of SSE will also be assumed.
    366361
    367362Reference architecture B (RefB) consists of a register-rich
     
    372367logical instruction \verb#simd<n>::rotl(a,b)#  rotates each field
    373368of $a$ by the rotation count in the corresponding field of $b$.
    374 A three-register \verb#simd<1>::if(a,b,c)# bitwise logical
    375 operator implements the logic $a \wedge b \vee \neg a \wedge c$, patterned
     369A three-input \verb#simd<1>::if(a,b,c)# bitwise logical
     370operator implements the logic $r=a \wedge b \vee \neg a \wedge c$, patterned
    376371after the Altivec \verb:vec_sel: operation.  Finally,
    377372a \verb#simd<8>::permute(a,b,c)# selects an arbitrary
     
    396391access.  This assumption makes for
    397392straightforward performance evaluation based on instruction
    398 count for straight-line computational kernels.  Furthermore,
    399 the assumption also eliminates artifacts due to possibly different
    400 latencies in reference and IDISA architectures.  Because
    401 the same assumption is made for reference and IDISA
     393count for straight-line computational kernels. 
     394Furthermore, the assumption also eliminates artifacts due to
     395possibly different latencies in reference and IDISA architectures. 
     396Because the same assumption is made for reference and IDISA
    402397architectures, determination of the additional circuit
    403398complexity due to IDISA features is unaffected by the
    404399assumption.
    405 
    406 
    407400
    408401In the remainder of this paper, then, IDISA-A and IDISA-B
     
    426419\section{Population Count}
    427420
    428 \begin{figure}[h]
     421\begin{figure}
    429422\begin{center}\small
    430423\begin{verbatim}
     
    512505An inductive doubling algorithm of $n$ steps typically applies
    513506mask or shift operations at each step for each of the
    514 two operands being combined in the step.  If there is
    515 one such operation for each operand, and the combination
    516 can be implemented in a single SWAR operation, then a total
     507two operands being combined in the step.  In general,
     508the mask constants shown in Figure \ref{HD-pop} recur; these
     509are termed ``magic masks'' by Knuth \cite{v4pf1a}.
     510If the algorithm employs a single operation at each step, then a total
    517511of $3n$ operations are the required in a RefB implementation,
    518512and possibly $4n$ for a RefA implementation including the
     
    544538that this is optimal for any three-register instruction set model.
    545539
    546 \begin{figure}[tbh]
     540\begin{figure}
    547541\begin{center}
    548 \includegraphics[width=90mm, trim= 50 50 0 50]{S2P_IO.pdf}
    549 \caption{Input/Output Model for Serial to Parallel Transposition}
     542\includegraphics[width=87mm, trim= 40 50 0 50]{S2P_IO.pdf}
     543\caption{Serial to Parallel Transposition}
    550544\label{s2p-spec}
    551545\end{center}
     
    633627%
    634628
    635 \begin{figure}[tbh]
     629\begin{figure}
    636630\begin{center}\small
    637631\begin{verbatim}
    638 even = {0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30};
    639 odd = {1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31};
     632even={0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30};
     633odd ={1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31};
    640634mask = simd<8>::constant(0xAA);
    641635t0 = simd<8>::permute(s0, s1, even);
     
    687681dividing the bitpair streams into bit streams in the third stage.
    688682
    689 \begin{figure}[tbh]
    690 \small
    691 \begin{verbatim}
    692 p0 = simd<8>::pack<h,h>(s0, s1);
    693 p1 = simd<8>::pack<l,l>(s0, s1);
    694 \end{verbatim}
    695 \caption{Stage 1 Transposition Step in the Inductive Halving Algorithm}
    696 \label{halvingstep}
    697 \end{figure}
    698 
    699683Figure \ref{halvingstep} shows one step in stage 1 of the inductive
    700684halving algorithm, comprising just two IDISA-A operations.
     
    709693of each nybble and one \verb#simd<4>::pack<l,l># operation to extract the
    710694low pair of each nybble.  Four applications of this step complete stage 2.
     695
     696\begin{figure}
     697\small
     698\begin{verbatim}
     699p0 = simd<8>::pack<h,h>(s0, s1);
     700p1 = simd<8>::pack<l,l>(s0, s1);
     701\end{verbatim}
     702\caption{Step in Inductive Halving Algorithm Stage 1}
     703\label{halvingstep}
     704\end{figure}
     705
    711706
    712707Stage 3 similarly uses four applications of a step that uses a
     
    864859model.
    865860
    866 \begin{figure*}[t]
     861\begin{figure*}
    867862\begin{center}
    868863\begin{tabular}{|c||c|c|c|c|c|c|c|c|}
     
    955950left instruction.  Right justification by shifting an $n$ bit field
    956951$i$ positions to the right is equivalent to a left rotate of $n-i$
    957 positions.  These rotation amounts are computed by the operation
    958 \verb#rj=simd<8>::sub<x,l>(simd<8>::constant(8), cts_4)# as shown in row 5,
     952positions.  Given a register value \verb:c8: preloaded with
     953the value 8 in each 8-bit field, the right rotation
     954amounts are computed by the operation
     955\verb#rj=simd<8>::sub<x,l>(c8, cts_4)# producing values shown in row 5,
    959956except that don't care fields (which won't be subsequently used)
    960957are marked \verb:XX:. 
    961958
    962959The left shift amounts are calculated by \verb#lj=simd<8>::srli<4>(cts_4)#
    963 as shown in row 6, and are combined with the right shift amounts
    964 by the selection operation  \verb#rot_8=simd_if(simd<16>::constant(0xFF00), rj, lj)#
     960producing the values shown in row 6, and are then combined with the right shift amounts
     961by the selection operation  \verb#rot_8=simd_if(mask0xFF00, rj, lj)#
    965962as shown in row 7.  Using these computed values, the inductive step
    966963is completed by application of the operation  \verb#rslt_16=simd<8>::rotl(rslt_8, rot_8)#
     
    10081005% \end{figure}
    10091006
    1010 \begin{figure}[h]
     1007\begin{figure}[tb]
    10111008\begin{center}\small
    10121009\begin{verbatim}
     
    10151012y = simd<8>::xor<h,l>(y, y);
    10161013y = simd<16>::xor<h,l>(y, y);
     1014y = simd<32>::xor<h,l>(y, y);
    10171015\end{verbatim}
    10181016\end{center}
     
    10241022codes such as the various Hamming codes for detecting
    10251023and correcting numbers of bit errors dependent on the
    1026 number of parity bits added.  Whereas
    1027 
     1024number of parity bits added. 
    10281025Figure \ref{ID-parity} shows an IDISA-A parity implementation
    10291026with only 5 operations required for 32-bit fields,
    10301027slightly more than a 2X improvement over the 11 operations
    1031 required in Warren's implementation using shifts  \cite{HackersDelight}.
    1032 
    1033 This represents slightly
    1034 more than a 2X improvement.  The improvement is less than
     1028required in a RefB implementation following Warren
     1029 \cite{HackersDelight}. The improvement is less than
    103510303X seen in other cases because one of the operands need
    10361031not be modified before applying the exclusive-or operation.
     
    10641059
    10651060\subsection{String/Decimal/Integer Conversion}
    1066 \begin{figure}[h]
    1067 \begin{center}\small
    1068 \begin{verbatim}
    1069 b=(d & 0x0F0F0F0F) + 10 * ((d >> 4) & 0x0F0F0F0F);
    1070 b=(d & 0x00FF00FF) + 100 * ((d >> 8) & 0x00FF00FF);
    1071 b=(d & 0x0000FFFF) + 10000 * (d >> 16);
    1072 \end{verbatim}
    1073 \end{center}
    1074 \caption{BCD to Integer Reference Algorithm}
    1075 \label{BCD2int}
    1076 \end{figure}
    1077 
    1078 \begin{figure}[h]
    1079 \begin{center}\small
    1080 \begin{verbatim}
    1081 t1=simd<8>:const(10);
    1082 t2=simd<16>:const(100);
    1083 t3=simd<32>:const(10000);
    1084 b=simd<8>::add<x,l>(simd<8>::mult<h,x>(d,t1), d);
    1085 b=simd<16>::add<x,l>(simd<16>::mult<h,x>(b,t2), b);
    1086 b=simd<32>::add<x,l>(simd<32>::mult<h,x>(b,t3), b);
    1087 \end{verbatim}
    1088 \end{center}
    1089 \caption{IDISA BCD to Integer}
    1090 \label{ID-BCD2int}
    1091 \end{figure}
    10921061
    10931062Just as DNA sequences represent an important use case for
     
    10981067conversion and as a direct representation for
    10991068packed binary coded decimal.
     1069
     1070\begin{figure}
     1071\begin{center}\small
     1072\begin{verbatim}
     1073b=(d & 0x0F0F0F0F) + 10 * ((d >> 4) & 0x0F0F0F0F)
     1074b=(d & 0x00FF00FF) + 100 * ((d >> 8) & 0x00FF00FF)
     1075b=(d & 0x0000FFFF) + 10000 * (d >> 16)
     1076\end{verbatim}
     1077\end{center}
     1078\caption{BCD to Integer Reference Algorithm}
     1079\label{BCD2int}
     1080\end{figure}
     1081
     1082\begin{figure}
     1083\begin{center}\small
     1084\begin{verbatim}
     1085t1=simd<8>:const(10)
     1086t2=simd<16>:const(100)
     1087t3=simd<32>:const(10000)
     1088b=simd<8>::add<x,l>(simd<8>::mult<h,x>(d,t1), d)
     1089b=simd<16>::add<x,l>(simd<16>::mult<h,x>(b,t2), b)
     1090b=simd<32>::add<x,l>(simd<32>::mult<h,x>(b,t3), b)
     1091\end{verbatim}
     1092\end{center}
     1093\caption{IDISA BCD to Integer}
     1094\label{ID-BCD2int}
     1095\end{figure}
    11001096
    11011097Figure \ref{BCD2int} shows a three-step inductive
     
    11321128\subsection{Further Applications}
    11331129
    1134 Further applications of inductive doubling can be
    1135 found in integer contraction and dilation for quadtrees and
     1130
     1131Further applications of IDISA can often be found
     1132by searching for algorithms employing the magic masks
     1133s \verb:0x55555555:,  \verb:0x33333333:, and so on.
     1134Examples include the bitslice implementation of AES \cite{DBLP:conf/cans/RebeiroSD06}
     1135and integer contraction and dilation for quadtrees and
    11361136octtrees\cite{Stocco95} and Morton-ordered arrays \cite{Raman08}.
    11371137Pixel packing from 32 bit fields into a 5:5:5 representation
     
    12121212offsetting the circuit complexity of half-operand
    12131213modifiers with potential elimination of dedicated
    1214 logic for some {/ad hoc} horizontal SWAR operations.
     1214logic for some {\em ad hoc} horizontal SWAR operations.
    12151215Even if legacy support for these operations is required,
    12161216it may be possible to provide that support through
     
    12691269\begin{figure}[tbh]
    12701270\begin{center}
    1271 \includegraphics[width=90mm, trim= 50 350 0 50]{IDISA.pdf}
     1271\includegraphics[width=85mm, trim= 40 350 0 50]{IDISA.pdf}
    12721272\caption{IDISA Block Diagram}
    12731273\label{pipeline-model}
Note: See TracChangeset for help on using the changeset viewer.