source: docs/Balisage09/balislides.tex @ 3937

Last change on this file since 3937 was 281, checked in by cameron, 10 years ago

Balisage Intl. Symp on Processing XML Efficiently - Cameron/Herdy/Amiri? paper

File size: 16.8 KB
Line 
1\documentclass{beamer}
2\usepackage[latin1]{inputenc}
3\usepackage{colortbl}
4\usetheme{Warsaw}
5
6
7\title[International Symposium on Processing XML Efficiently]{Parallel Bit Stream Technology as a Foundation for XML Parsing Performance}
8\author{Rob Cameron, Ken Herdy and Ehsan Amiri}
9\institute{School of Computing Science\\
10Simon Fraser University
11\and
12International Characters, Inc.}
13\date{August 10, 2009}
14\begin{document}
15
16
17
18
19
20
21\begin{frame}
22\titlepage
23\end{frame}
24
25
26\begin{frame}
27\frametitle{Outline}
28\tableofcontents%[pausesections]
29\end{frame}
30
31\section{Introduction}
32\begin{frame}[fragile]
33\frametitle{XML on Commodity Processors}
34
35\begin{itemize}[<+->]
36\item XML appliances and chips have important enterprise applications.
37\item But what about the bulk of the world's XML processing?
38\item Our expectation: commodity Intel/AMD/Power PC chips will
39continue to dominate.
40\item Our work: systematic exploitation of SIMD capabilities
41of these chips for XML processing.
42\item Our innovation: parallel bit stream technology.
43\begin{itemize}
44\item A method to process 128 bytes at a time using 128-bit SIMD registers.
45\end{itemize}
46\item Results: exceptionally promising.
47\end{itemize}
48\end{frame}
49
50
51\begin{frame}[fragile]
52\frametitle{A Transform Representation of Text}
53
54\begin{itemize}[<+->]
55\item Given a byte-oriented character stream $T$.
56\item Transpose to 8 parallel bit streams $b_0$, $b_1$, ..., $b_7$.
57\item Each stream $b_k$ comprises bit $k$ of each byte of $T$.
58\end{itemize}
59\onslide<4->{
60\begin{tabular}{|c|c|c|c|c|c|} \hline
61$T$ & A & b & 1 & 7 & ; \pause \\ \hline
62ASCII & \alert<6>{0}\alert<7>{1}\alert<8>{0}00001 & \alert<6>{0}\alert<7>{1}\alert<8>{1}00010 & \alert<6>{0}\alert<7>{0}\alert<8>{1}10001 & \alert<6>{0}\alert<7>{0}\alert<8>{1}10111 & \alert<6>{0}\alert<7>{0}\alert<8>{1}11011 \pause \\ \hline
63$b_0$ & \alert<6>{0} & \alert<6>{0} & \alert<6>{0} & \alert<6>{0} & \alert<6>{0} \pause \\ \hline
64$b_1$ & \alert<7>{1} & \alert<7>{1} & \alert<7>{0} & \alert<7>{0} & \alert<7>{0} \pause \\ \hline
65$b_2$ & \alert<8>{0} & \alert<8>{1} & \alert<8>{1} & \alert<8>{1} & \alert<8>{1} \pause \\ \hline
66$b_3$ & 0 & 0 & 1 & 1 & 1 \\ \hline
67$b_4$ & 0 & 0 & 0 & 0 & 1 \\ \hline
68$b_5$ & 0 & 0 & 0 & 1 & 0 \\ \hline
69$b_6$ & 0 & 1 & 0 & 1 & 1 \\ \hline
70$b_7$ & 1 & 0 & 1 & 1 & 1 \\ \hline
71\end{tabular}
72}
73\end{frame}
74
75\begin{frame}[fragile]
76\frametitle{High Performance Text Processing}
77
78Why form parallel bit streams?
79
80\begin{itemize}[<+->]
81\item Byte-at-a-time text processing is too slow.
82\begin{itemize}[<+->]
83\item Example: XML scan for ``{\tt <}''.
84\item Byte-at-time loop computes only 1 bit per iteration!
85\end{itemize}
86\item So let's compute those bits in parallel!
87\begin{itemize}[<+->]
88\item Bitwise logic on basis streams $b_i \rightarrow${\tt [<]} stream.
89\item Process 128 positions at a time using SSE registers.
90\end{itemize}
91\item Find next ``{\tt <}'' with bit scan instruction (BSF).
92\begin{itemize}
93\item Advance up to 63 positions at once.
94\end{itemize}
95%\item Or use parallel parsing with bitstream addition [new]
96\end{itemize}
97\end{frame}
98
99\begin{frame}[fragile]
100\frametitle{Reported Results}
101
102\begin{block}<+->{PPoPP '08: UTF-8 to UTF-16 transcoding}
103\begin{itemize}[<+->]
104\item iconv: 17.6-23.2 CPU cycles/byte.
105\item u8u16: 0.9 - 6.8 CPU cycles/byte.
106\item Average case speedup: about 10X.
107\end{itemize}
108\end{block}
109
110\begin{block}<+->{CASCON '08: XML statistics application}
111\begin{itemize}[<+->]
112\item Xerces C: 32-143 CPU cycles/byte.
113\item Expat: 14-58 CPU cycles/byte.
114\item Parabix: 5-14 CPU cycles/byte.
115\item 10X fewer L2 data cache misses.
116\item 10X fewer branch mispredictions.
117\end{itemize}
118\end{block}
119
120\end{frame}
121
122\begin{frame}[fragile]
123\frametitle{Reported Results}
124\begin{block}<+->{SVG Open '08: GML to SVG Conversion}
125\begin{itemize}%[<+->]
126\item JAXP/SAX with Xerces-J: 220 CPU cycles/byte.
127\item JAXP/SAX with Crimson: 230 CPU cycles/byte.
128\item JAXP/SAX with Intel XSS: 210 CPU cycles/byte.
129\item JAXP/XSLT with Saxon: 155 CPU cycles/byte.
130\item JAXP/XSLT with Intel XSS: 65 CPU cycles/byte.
131\item C++/SAX with Intel XSS: 25 CPU cycles/byte.
132\item C++/ILAX with Parabix: 15 CPU cycles/byte.
133\end{itemize}
134\end{block}
135
136\end{frame}
137
138\begin{frame}[fragile]
139\frametitle{Performance Prospects}
140\begin{itemize}[<+->]
141\item First generation Parabix has known bottlenecks.
142\begin{itemize}[<+->]
143\item Sequential parsing: 25\% of XML statistics application.
144\item Symbol lookup: 50\% of XML statistics application.
145\end{itemize}
146\item Improved techniques identified in both areas.
147\begin{itemize}[<+->]
148\item Parallel parsing with bitstream addition.
149\item Length-sorted lookup to eliminate loops.
150\end{itemize}
151\item Bit stream parallelism may be leveraged for
152intrachip parallelism on multicore processors.
153\item Processor advances amplify the advantage of
154parallel bit stream methods.
155\end{itemize}
156
157\end{frame}
158
159
160
161\begin{frame}[fragile]
162\frametitle{Parabix 2}
163Current focus of work: Parabix 2 Project.
164\begin{itemize}[<+->]
165\item Full catalog of parallel bitstream techniques.
166\item Parallel parsing with bitstream addition.
167\item Targetting for new/prospective SIMD architectures.
168\begin{itemize}%[<+->]
169\item Intel SSE 4.1, 4.2, AMD SSE 5
170\item Intel AVX (256-bit)
171\item GPGPUs: e.g., Intel Larrabee
172\item Future: pex/pdep, inductive doubling ISA [ASPLOS '09]
173\end{itemize}
174\item Length-sorted fast symbol lookup.
175\item Java performance: Array Set Model.
176\item Compiler technology.
177\begin{itemize}%[<+->]
178\item Program using high-level unbounded bitstream logic.
179\item Compile to low-level SIMD code.
180\item Design for multiple back-ends.
181\end{itemize}
182\end{itemize}
183
184\end{frame}
185
186
187\section{Catalog of XML Bit Streams}
188
189\begin{frame}[fragile]
190\frametitle{Bit Stream Types}
191
192Our catalog of XML bit streams shows how bit streams
193can be used in many ways.
194
195\begin{itemize}%[<+->]
196
197\item Basis bit streams.
198
199\item Character class bit streams.
200
201\item Scope streams.
202
203\item Multiliteral bit streams.
204
205\item Lexical item streams.
206
207\item Error streams.
208
209\item Deletion mask streams.
210
211\item Cursor Streams.
212
213\item Call-out Streams.
214\end{itemize}
215
216
217\end{frame}
218
219
220\section{Parallel Parsing with Bitstream Addition}
221
222
223
224
225\begin{frame}[fragile]
226\frametitle{Parallel Scanning Basics}
227\begin{itemize}[<+->]
228\item<2-> Given a text with numeric references, $T$ \only<3->{(\alert<3>{little-endian}) }
229\item<4-> Compute the digits character class stream, $D$.
230\item<5-> Initialize cursors $C_0$ using the {\tt '\&\#'} stream \only<6->{ \alert<6>{shifted}}.
231\item<7-> Parallel scan of numeric references with bitstream addition.
232\item<8-> Mask off the garbage.
233\item<9-> Call out the extracted numeric references.
234\item<10-> Find unterminated reference errors.
235\end{itemize}
236\begin{tabular}{rl}
237\onslide<2->$T$   &  {\alt<2>{\tt 12 \&\#0013;\&\#10:  deed 3443 \&\#10345;  (\&\#8;) }{\tt );8\#\&(  ;54301\#\& 3443 deed :01\#\&;3100\#\& 21}}\\
238\onslide<4->$D$     &  {\tt     ..1.....11111...1111.......11...1111...11} \\
239\onslide<5->$C_0$     &  {\alt<5>{\tt  ...1.........1...............1......1....} {\tt  ..1.........1...............1......1.....} }\\
240\onslide<7->$C_1 = C_0 +D$ & {\tt .10....100000...1111......100..10000...11} \\
241\onslide<8->$C_2 = C_1
242  \wedge \neg D$ & {\tt .10....100000.............100..10000.....} \\
243\onslide<9->$R = C_2 - C_0$ & {\tt ..1.....11111..............11...1111.....} \\
244\onslide<10->$E = C_2 \wedge \neg {\tt [;]}$ & {\tt ..........................1..............} \\
245
246\end{tabular}
247
248\end{frame}
249
250\begin{frame}[fragile]
251\frametitle{Parallel Parsing of XML Tags}
252\begin{block}<+->{XML Start Tag Syntax}
253\begin{semiverbatim}
254'<' Name (S Name S? '=' S?
255    (('"' [^<"]* '"') | ("'" [^<']* "'")))* S? '>'
256\end{semiverbatim}
257\end{block}
258
259\begin{itemize}[<+->]
260\item Initialize cursors with with [{\tt <}] bitstream.
261\item Apply bitstream addition techniques.
262\item A loop sequentially handles attribute/value pairs
263within tags.
264\item But all tags are processed in parallel!
265\item Max iteration count is max \# of attributes in any one tag.
266\item Block-by-block processing: iterate to max atts for block.
267\item Bit stream logic checks for all tag syntax errors in parallel.
268\item Logic is fully implemented within Parabix 2 prototype.
269\end{itemize}
270
271\end{frame}
272
273
274\begin{frame}[fragile]
275\frametitle{Parsing Results: Call-Out Streams}
276\small
277\begin{tabular}{rl}
278Document & {\tt      <top x='1'><a><leaf>4</leaf>;<void/></a></top>} \pause \\
279NamePosns & {\tt       .1..........1..1..............1...............}  \\
280NameFollows & {\tt         ....1........1.....1..............1...........}  \\
281Names & {\tt         .111........1..1111...........1111............} \pause \\
282AttNames    & {\tt           .....1........................................} \\
283AttVals     & {\tt            .......111....................................} \pause \\
284Tags       & {\tt             .111111111..1..1111...........11111...........} \pause \\
285EmptyTagMarks:    & {\tt       ...................................1..........} \pause \\
286EndTags        & {\tt         ......................11111..........11..1111.} \pause \\
287%STagEnds & {\tt                ..........1..1.....1..........................} \pause \\
288%EmptyEnds & {\tt            ...................................1..........} \pause \\
289%AttStarts & {\tt           .....1........................................} \pause \\
290%AttFollows & {\tt          ......1.......................................} \pause \\
291%AttVals & {\tt            .......1......................................} \pause \\
292%AttValEnds & {\tt              .........1....................................} \pause \\
293%EndTags & {\tt           ......................1..............1...1....} \pause \\
294%EndTagEnds & {\tt              ...........................1...........1.....1} \pause \\
295ParseError & {\tt              ...............................................} \pause \\
296\end{tabular}
297\end{frame}
298
299\section{Efficient XML in Java with Array Set Models}
300
301\begin{frame}[fragile]
302\frametitle{The Java Performance Challenge}
303\begin{itemize}[<+->]
304\item Java has no facilities for direct use of SIMD.
305\item Java's built-in UTF-8 to UTF-16 transcoding is slow.
306\item The Java Native Interface (JNI) provides access to C, but:
307\begin{itemize}[<+->]
308\item Java and C/C++ data objects may be incompatible.
309\item Even string representations are incompatible.
310\item JNI calls are expensive, hundreds of CPU cycles.
311\end{itemize}
312\item But Java is an important and popular technology for XML processing.
313\item High performance solutions must be found.
314\end{itemize}
315\end{frame}
316
317
318\begin{frame}[fragile]
319\frametitle{Array Set Models}
320\begin{itemize}[<+->]
321\item JNI allows bulk import of arrays of simple types (bytes, integers).
322\item Therefore model XML data using sets of such arrays.
323\item Transport array data across JNI boundary in bulk.
324\item Array set models may also have other benefits.
325\begin{itemize}%[<+->]
326\item Hardware/software prefetching.
327\item DMA (direct memory access) hardware.
328\item SIMD: Use SoA (structure of array) representations.
329\item Multicore processors: array models support data partitioning.
330\item Streaming buffers for large XML documents.
331\end{itemize}
332\end{itemize}
333\end{frame}
334
335\begin{frame}[fragile]
336\frametitle{Initial Study: Saxon-B Data Structures}
337\begin{itemize}[<+->]
338\item An initial study of the ASM concept was to reimplement Saxon-B
339data structures.
340\item Two basic structures: TinyTree and NamePool.
341\item TinyTree uses several arrays of integers.
342\begin{itemize}%[<+->]
343\item Node kind, name code, depth and next sibling arrays.
344\item Other arrays dependent on type.
345\end{itemize}
346\item NamePools are collections of namespace prefix, URI, local name triples.
347\item Reimplementation using Parabix/JNI.
348\begin{itemize}[<+->]
349\item Copying re-implementation: 2X faster build time.
350\item Using direct memory byte buffers: 2.5X improvement.
351\end{itemize}
352
353\end{itemize}
354\end{frame}
355
356
357
358
359
360
361\section{Compiler Technology}
362
363\begin{frame}[fragile]
364\frametitle{Automated Generation of Bit Stream Code}
365\begin{itemize}[<+->]
366\item Eliminate tedious, error prone manual coding.
367\item Apply code generation algorithms.
368\begin{itemize}%[<+->]
369\item register allocation
370\item rematerialization
371\item instruction scheduling
372\end{itemize}
373\item Retarget for various instruction set architectures.
374\item Retarget for different memory models.
375\item Adapt for multicore.
376\end{itemize}
377\end{frame}
378
379\begin{frame}[fragile]
380\frametitle{Character Class Compiler}
381\begin{itemize}[<+->]
382\item Use character class definitions
383\begin{itemize}[<+->]
384\item individual characters
385\begin{itemize}
386\item compile({\tt [CharDef(LAngle, "<")]})
387\end{itemize}
388\item character ranges
389\begin{itemize}
390\item compile({\tt [CharSet('Control', ['$\backslash$x00-$\backslash$x1F']),
391           CharSet('Digit', ['0-9']})]])
392\end{itemize}
393
394\end{itemize}
395\item Generates bit stream code automatically.
396\item Applies various optimizations.
397\item Used in first-generation Parabix.
398\end{itemize}
399\end{frame}
400
401
402
403\begin{frame}[fragile]
404\frametitle{Character Class Formation}
405\begin{itemize}[<+->]
406\item Combining 8 bits of a code unit gives a character class stream.
407\item 7 bitwise logical operations required.
408\item compile({\tt [CharDef(LAngle, "<")]})
409\item 
410\begin{semiverbatim}
411temp1 = simd_or(bit[0], bit[1]);
412temp2 = simd_and(bit[2], bit[3]);
413temp3 = simd_andc(temp2, temp1);
414temp4 = simd_and(bit[4], bit[5]);
415temp5 = simd_or(bit[6], bit[7]);
416temp6 = simd_andc(temp4, temp5);
417LAngle = simd_and(temp3, temp6);
418\end{semiverbatim}
419\end{itemize}
420
421\end{frame}
422
423
424\begin{frame}[fragile]
425\frametitle{Character Class Common Subexpressions}
426\begin{itemize}[<+->]
427\item Different characters have common subexpressions.
428\item compile({\tt [CharDef(LAngle, "<"), \\ CharDef(RAngle, "<")]})
429\item 
430\begin{semiverbatim}
431temp1 = simd_or(bit[0], bit[1]);
432temp2 = simd_and(bit[2], bit[3]);
433temp3 = simd_andc(temp2, temp1);
434temp4 = simd_and(bit[4], bit[5]);
435temp5 = simd_or(bit[6], bit[7]);
436temp6 = simd_andc(temp4, temp5);
437LAngle = simd_and(temp3, temp6);
438\onslide<4->temp7 = simd_andc(bit[6], bit[7]);
439\onslide<4->temp8 = simd_and(temp4, temp7);
440\onslide<4->RAngle = simd_and(temp3, temp8);
441\end{semiverbatim}
442\end{itemize}
443\end{frame}
444
445\begin{frame}[fragile]
446\frametitle{Character Class Ranges}
447\begin{itemize}[<+->]
448\item Ranges of characters are often very simple to compute.
449\item compile({\tt [CharSet('Control', ['$\backslash$x00-$\backslash$x1F']),
450           \onslide<4->{CharSet('Digit', ['0-9']})]]})
451\item 
452\begin{semiverbatim}
453temp1 = simd_or(bit[0], bit[1]);
454temp2 = simd_or(temp1, bit[2]);
455Control = simd_andc(simd_const_1(1), temp2)
456\onslide<5->temp3 = simd_and(bit[2], bit[3]);
457\onslide<5->temp4 = simd_andc(temp3, temp1);
458\onslide<5->temp5 = simd_or(bit[5], bit[6]);
459\onslide<5->temp6 = simd_and(bit[4], temp5);
460\onslide<5->Digit = simd_andc(temp4, temp6);
461\end{semiverbatim}
462\end{itemize}
463\end{frame}
464
465
466\begin{frame}[fragile]
467\frametitle{Regular Expression Compilation}
468\begin{itemize}[<+->]
469\item Extend character class compiler for regexp operations.
470\item Use bitstream addition for character class repetitions.
471\item Ex. [0-9]*
472\item Masking for upper or lower bounds: [0-9]\{2,5\}
473\item Restrict to a deterministic RE subset.
474\item Under development.
475\end{itemize}
476\end{frame}
477
478\begin{frame}[fragile]
479\frametitle{Unbounded Bit Stream Compilation}
480\begin{itemize}[<+->]
481\item Recent prototypes use unbounded bitstreams in Python.
482\item Ex: catalog of XML bit streams.
483\item Manual coding then used to reimplement in C++.
484\item But, can this be automated?
485\item Yes!
486\begin{itemize}[<+->]
487\item Define restricted form of Python bitsream operations.
488\item Translate to SIMD operations using C/C++ intrinsics.
489\item Compiler handles segmentation into blocks or segments.
490\item Work in progress.
491\end{itemize}
492\end{itemize}
493\end{frame}
494
495
496\section{Conclusion}
497
498\begin{frame}[fragile]
499\frametitle{Concluding Remarks}
500\begin{itemize}[<+->]
501\item Parallel bit stream methods accelerate
502\begin{itemize}%[<+->]
503\item UTF-8 and XML character validation.
504\item Transcoding.
505\item Whitespace and line break handling.
506\item Sequential parsing using bit scans over lexical item streams.
507\item Parallel tag parsing using bitstream addition.
508\end{itemize}
509\item Methods may be used to support any XML API.
510\item Results can be delivered to Java through Array Set Models.
511\item Compiler technology promises to ease the
512programming burden and support retargetting for various
513architectures.
514\end{itemize}
515\end{frame}
516
517\begin{frame}[fragile]
518\begin{block}{Knuth's Final Exercise on Bitwise Techniques}
519{\bf 217.}  {\it [40]} Explore the processing of long strings of text by packing them in a ``transposed'' or ``sliced'' manner: Represent 64 consecutive characters as a sequence of eight octabytes $w_0 \ldots w_7$ where $w_k$ contains all 64 of their $k$th bits.
520
521\begin{itemize}
522\item Donald E. Knuth, The Art of Computer Programming, Volume 4, Fascicle 1, Addison-Wesley, Boston MA, 2009.
523\end{itemize}
524\end{block}
525
526\begin{block}{Knuth's Answer}
527{\bf 217.}  See R. D. Cameron, {\it U.S. Patent 7400271} (15 July 2008);
528{\it Proc. ACM Symp. Principles and Practice of Parallel Programming} {\bf 12} (2008), 91--98.
529\end{block}
530
531\end{frame}
532
533\end{document}
Note: See TracBrowser for help on using the repository browser.