source: proto/parabix2/parabix2_compilable.py @ 547

Last change on this file since 547 was 547, checked in by cameron, 9 years ago

Inline refs and use if optimization

File size: 34.0 KB
Line 
1# -*- coding: utf-8 -*-
2#
3# parabix2.py
4#
5# Parallel XML Parsing with Bitstream Addition
6# - Complete prototype for all bitstream computations in Parabix2
7#
8# Robert D. Cameron
9# August 20, 2009
10#
11#----------------------------------------------------------------------------
12#
13# We use python's unlimited precision integers for unbounded bit streams.
14# This permits simple logical operations on the entire stream.
15# Assumption: bitstreams are little-endian (e.g., as on x86).
16#
17#----------------------------------------------------------------------------
18#
19
20
21#import bitutil
22
23import byteclass
24
25import u8u16
26
27#import sys
28
def validate_xmlchar(u8, control, lex, EOF_mask):
        r"""Build the error stream marking characters that are illegal in XML.

        Two kinds of position are flagged:
        (1) Control characters in the range 0x00-0x1F except HT, LF, CR
            (excluded here through lex.WS), restricted by EOF_mask.
        (2) 0xFFFF and 0xFFFE, having UTF-8 encodings 0xEF 0xBF 0xBF and
            0xEF 0xBF 0xBE; the mark falls on the final byte.

        >>> demo_validate_xmlchar('plaintext (good: \x09) (bad: \x03) (bad \xEF\xBF\xBF) (good \xEF\xBF\xBC)')
        input high nybbles: 7666676772266663202226663202226662ebb22266662ebb2
        input low nybbles : 0c19e4584087ff4a09908214a039082140fff9087ff40ffc9
        illegal XML chars : __________________________1_________1_____________
"""
        # A 0xBF byte in the scope of 0xEF leaves one more byte to inspect.
        third_byte_pending = bitutil.Advance(u8.xEF_scope & u8.xBF)
        noncharacter_marks = third_byte_pending & (u8.xBE | u8.xBF)
        # Low control codes that are not whitespace and lie under EOF_mask.
        control_marks = control.x00_x1F & ~lex.WS & EOF_mask
        return noncharacter_marks | control_marks
42
43
def demo_validate_xmlchar(u8data):
        """Print the nybbles of u8data aligned with its illegal-XML-char stream.

        The error stream is rendered one position longer than the input.
        """
        error_width = len(u8data) + 1
        bit, EOF_mask = bitutil.transpose_streams(u8data)
        u8, control, lex = byteclass.classify_bytes(bit)
        high_nybbles = bitutil.high_nybble_stream(u8data)
        low_nybbles = bitutil.low_nybble_stream(u8data)
        illegal = validate_xmlchar(u8, control, lex, EOF_mask)
        rows = [('input high nybbles', high_nybbles),
                ('input low nybbles', low_nybbles),
                ('illegal XML chars', bitutil.bitstream2string(illegal, error_width))]
        bitutil.print_aligned_streams(rows)
51
def normalize_line_breaks(control, bit):
        r"""Turn every CR into an LF and record the LFs that complete a CRLF pair.

        control.CRLF is set to the LF positions lying in the scope of a CR
        (candidates for deletion); the basis streams bit[5], bit[6] and
        bit[7] are updated in place.  Both arguments are returned as a tuple.

        >>> demo_line_breaks('ab \r\n  cd \r  ef \r ')
        input high nybbles: 662002266202266202
        input low nybbles : 120da00340d00560d0
        CR                : ___1______1_____1_
        LF                : ____1_____________
        CRLF              : ____1_____________
"""
        control.CRLF = control.CR_scope & control.LF
        # CR becomes LF by flipping bits 5, 6 and 7 at every CR position.
        for bit_index in (5, 6, 7):
                bit[bit_index] ^= control.CR
        return (control, bit)
68
def demo_line_breaks(u8data):
        """Print the CR, LF and CRLF streams of u8data after line-break normalization."""
        width = len(u8data)
        bit, EOF_mask = bitutil.transpose_streams(u8data)
        u8, control, lex = byteclass.classify_bytes(bit)
        control, bit = normalize_line_breaks(control, bit)
        rows = [('input high nybbles', bitutil.high_nybble_stream(u8data)),
                ('input low nybbles', bitutil.low_nybble_stream(u8data))]
        # The remaining rows are labelled with the control attribute they show.
        for stream_name in ('CR', 'LF', 'CRLF'):
                stream = getattr(control, stream_name)
                rows.append((stream_name, bitutil.bitstream2string(stream, width)))
        bitutil.print_aligned_u8_byte_streams(rows)
79
80
81
82
83
def add_multiliterals(lex):
        """Add streams for the important multibyte literals to lex and return it.

        Sets PI_start, CtCD_start, EndTag_start, CD_end, DoubleHyphen and
        PI_end on the lex object passed in.

        >>> demo_multiliterals("  <?php?>  <!--  -->  <![CDATA[  ]]> ")
        input data  :   <?php?>  <!--  -->  <![CDATA[  ]]>
        PI_start    : ___1_________________________________
        CtCD_start  : ____________1__________1_____________
        EndTag_start: _____________________________________
        CD_end      : ___________________________________1_
        DoubleHyphen: ______________1___1__________________
        PI_end      : ________1____________________________
        """

        # Two-character literals that begin with "<": "<?", "<!" and "</".
        after_langle = lex.LAngle_scope
        lex.PI_start = after_langle & lex.QMark
        lex.CtCD_start = after_langle & lex.Exclam
        lex.EndTag_start = after_langle & lex.Slash
        # "]]>": a second "]" in the scope of a "]", then ">" one step on.
        second_rbracket = lex.RBracket_scope & lex.RBracket
        lex.CD_end = bitutil.Advance(second_rbracket) & lex.RAngle
        # "--" and "?>".
        lex.DoubleHyphen = lex.Hyphen_scope & lex.Hyphen
        lex.PI_end = lex.QMark_scope & lex.RAngle
        return lex
105
def demo_multiliterals(u8data):
        """Print each multiliteral stream of u8data aligned under the input."""
        width = len(u8data)
        bit, EOF_mask = bitutil.transpose_streams(u8data)
        u8, control, lex = byteclass.classify_bytes(bit)
        lex = add_multiliterals(lex)
        rows = [('input data', u8data)]
        # Each row label is the name of the lex attribute it displays.
        for stream_name in ('PI_start', 'CtCD_start', 'EndTag_start',
                            'CD_end', 'DoubleHyphen', 'PI_end'):
                stream = getattr(lex, stream_name)
                rows.append((stream_name, bitutil.bitstream2string(stream, width)))
        bitutil.print_aligned_u8_byte_streams(rows)
118
class CtCDPI_callouts:
        """Result record for comment, CDATA section and PI parsing.

        Every field is a bitstream held as a Python integer; 0 is the empty
        stream, so the in-place "|=" updates used by the parsing code work
        on a freshly created instance.
        """
        CD_span = 0
        Ct_span = 0
        # PI_span is the field the parsing and demo code actually assign and
        # read; it was previously missing from this declaration.
        PI_span = 0
        # PI_name is accumulated with "|=" by the inlined parsing code, which
        # requires a default value.
        PI_name = 0
        # Retained only for backward compatibility; nothing visible uses it.
        PI_mask = 0
        CtCDPI_mask = 0
        error = 0
125       
def parse_CtCDPI(lex, EOF_mask):
        """Parse all comments, CDATA sections and processing instructions.

        Return bitstreams marking the extent of these markup items,
        excluding initial and final bracketting.

        Parameters:
          lex      -- lexical item streams; this function reads CtCD_start,
                      PI_start, DoubleHyphen, CD_end, PI_end, LBracket,
                      Hyphen and RAngle.
          EOF_mask -- mask ANDed into every scan class and into the cursor
                      after each scan.

        Returns a CtCDPI_callouts instance with CD_span, Ct_span, PI_span,
        CtCDPI_mask and error assigned.

        >>> demo_CtCDPI(' <?php?>  <!-- example -->  <![CDATA[  shift: a<<1 ]]> ')
        input data :  <?php?>  <!-- example -->  <![CDATA[  shift: a<<1 ]]>
        CD_span    : ______________________________11111111111111111111111__
        Ct_span    : _____________111111111111______________________________
        PI_span    : __11111________________________________________________
        CtCDPI_mask: __111111___111111111111111___1111111111111111111111111_
        error      : ________________________________________________________

        Comments are terminated by double-hyphen; immediately require closing ">".

        >>> demo_CtCDPI(' <!--  <?php?>  --   <!-- -->')
        input data :  <!--  <?php?>  --   <!-- -->
        CD_span    : _____________________________
        Ct_span    : ____11111111111111______1111_
        PI_span    : _____________________________
        CtCDPI_mask: __11111111111111111___1111111
        error      : __________________1___________



"""
        # NOTE(review): bitutil is not visible in this file.  Advance is
        # presumed to move marker bits one position forward and ScanThru to
        # move each marker past a run of the given character class -- confirm.
        callouts = CtCDPI_callouts()
        # Accumulators for the start and end markers found on each iteration.
        PI_starts = 0
        PI_ends = 0
        Ct_starts = 0
        Ct_ends = 0
        CD_starts = 0
        CD_ends = 0
        CtCDPI_starts = 0
        # Scanning streams
        # Each is the complement of a terminator stream, limited by EOF_mask.
        CtCDPI_scan = ~(lex.CtCD_start | lex.PI_start) & EOF_mask
        Ct_end_scan = ~lex.DoubleHyphen & EOF_mask
        CD_end_scan = ~lex.CD_end & EOF_mask
        PI_end_scan = ~lex.PI_end & EOF_mask
        #
        # Initiate the scan
        # A single cursor bit at position 0, moved to the first "<!" or "<?".
        CtCDPI_Cursor = 1
        CtCDPI_Cursor = bitutil.ScanThru(CtCDPI_Cursor, CtCDPI_scan)
        CtCDPI_Cursor &= EOF_mask
        # One iteration per markup item; the loop ends when the cursor bit
        # is cleared by EOF_mask.
        while CtCDPI_Cursor:
                CtCDPI_starts |= CtCDPI_Cursor
                # Classify the item at the cursor: PI if at a PI_start ...
                PI_Cursor = CtCDPI_Cursor & lex.PI_start
                # ... otherwise step past the CtCD_start position and check
                # for "[" (CDATA) or "-" (comment).
                CD_Ct_Cursor = bitutil.Advance(CtCDPI_Cursor & ~PI_Cursor)
                CD_Cursor = CD_Ct_Cursor & lex.LBracket
                Ct_Cursor = bitutil.Advance(CD_Ct_Cursor & lex.Hyphen) 
                PI_starts |= PI_Cursor
                CD_starts |= CD_Cursor
                Ct_starts |= Ct_Cursor
                # Step past the comment start position and add the new
                # position to the scan class so that it is always scanned
                # through, even if it is itself a DoubleHyphen position.
                Ct_Cursor = bitutil.Advance(Ct_Cursor)
                Ct_end_scan |= Ct_Cursor
                # Scan each kind of cursor to its own terminator.
                PI_Cursor = bitutil.ScanThru(PI_Cursor, PI_end_scan)
                CD_Cursor = bitutil.ScanThru(CD_Cursor, CD_end_scan)
                # For comments, take one further step beyond the "--".
                Ct_Cursor = bitutil.Advance(bitutil.ScanThru(Ct_Cursor, Ct_end_scan))
                PI_ends |= PI_Cursor
                CD_ends |= CD_Cursor
                Ct_ends |= Ct_Cursor
                # At most one of the three cursors is nonzero; resume the
                # search for the next item from there.
                CtCDPI_Cursor = PI_Cursor | CD_Cursor | Ct_Cursor
                CtCDPI_Cursor = bitutil.ScanThru(CtCDPI_Cursor, CtCDPI_scan)
                CtCDPI_Cursor &= EOF_mask
        # End of loop: no remaining CtCDPI_Cursor
        # Subtracting the start markers from the end markers yields runs of
        # 1 bits from each start up to, but not including, its end.
        callouts.CD_span = CD_ends - CD_starts
        callouts.Ct_span = Ct_ends - Ct_starts
        callouts.PI_span = PI_ends - PI_starts
       
        # The mask runs from each item start through its end position.
        callouts.CtCDPI_mask |= bitutil.Advance(CD_ends | Ct_ends | PI_ends) - CtCDPI_starts
        # Errors: a comment end position that is not ">", or a recorded
        # comment start position that is not "-".
        callouts.error = Ct_ends & ~lex.RAngle | Ct_starts & ~ lex.Hyphen
        # If any of the Comment, CDATA or PI markups are unterminated, it is an error.
        callouts.error |= callouts.CtCDPI_mask &~ EOF_mask
        return callouts
201
def demo_CtCDPI(u8data):
        """Print the comment, CDATA and PI callout streams of u8data.

        The error stream is rendered one position longer than the others.
        """
        width = len(u8data)
        bit, EOF_mask = bitutil.transpose_streams(u8data)
        u8, control, lex = byteclass.classify_bytes(bit)
        lex = add_multiliterals(lex)
        markup = parse_CtCDPI(lex, EOF_mask)
        rows = [('input data', u8data)]
        # These rows are labelled with the callout attribute they display.
        for field in ('CD_span', 'Ct_span', 'PI_span', 'CtCDPI_mask'):
                rows.append((field, bitutil.bitstream2string(getattr(markup, field), width)))
        rows.append(('error', bitutil.bitstream2string(markup.error, width + 1)))
        bitutil.print_aligned_u8_byte_streams(rows)
214
215
class ref_callouts:
        """Result record for reference parsing; every stream defaults to 0."""
        GenRefs = DecRefs = HexRefs = delmask = error = 0
222
def parse_refs(lex, CtCDPI_mask):
        """Call out general, decimal and hexadecimal references and their errors.

        References starting inside CtCDPI_mask are ignored.  Everything but
        the closing semicolon of a reference is marked for deletion.
        Returns a ref_callouts instance.

        >>> demo_refs(" &gt;  &#13;  &#x0a;  ")
        input data       :  &gt;  &#13;  &#x0a; 
        entity refs      : __11__________________
        decimal char refs: _________11___________
        hex char refs    : _________________11___
        ref delmask      : _111___1111___11111___
        errors           : _______________________

        Empty numeric references are reported as errors.
        >>> demo_refs(" &#;       &#x; ")
        input data       :  &#;       &#x;
        entity refs      : ________________
        decimal char refs: ________________
        hex char refs    : ________________
        ref delmask      : _11________111__
        errors           : ___1__________1__

        Improperly terminated or unterminated references (lacking ";") are also errors.
        >>> demo_refs("  &gt:  &#456a;  &#xab:  &unterminated")
        input data       :   &gt:  &#456a;  &#xab:  &unterminated
        entity refs      : ___111____________________111111111111
        decimal char refs: __________111_________________________
        hex char refs    : ____________________11________________
        ref delmask      : __1111__11111____11111___1111111111111
        errors           : ______1______1________1_______________1
"""
        result = ref_callouts()
        # Second position of each reference: "#" selects a numeric reference.
        second = lex.RefStart_scope & ~CtCDPI_mask
        numeric_second = second & lex.Hash
        general_second = second & ~lex.Hash
        # Third position of a numeric reference: "x" selects hexadecimal.
        numeric_third = bitutil.Advance(numeric_second)
        hex_third = numeric_third & lex.x
        decimal_third = numeric_third & ~lex.x
        hex_fourth = bitutil.Advance(hex_third)
        # Scan each kind of reference through its body.
        general_ends = bitutil.ScanThru(general_second, lex.NameScan)
        decimal_ends = bitutil.ScanThru(decimal_third, lex.Digit)
        hex_ends = bitutil.ScanThru(hex_fourth, lex.Hex)
        every_end = general_ends | decimal_ends | hex_ends
        result.GenRefs = general_ends - general_second
        result.DecRefs = decimal_ends - decimal_third
        result.HexRefs = hex_ends - hex_fourth
        # The trailing semicolon is left in place as the insertion point for
        # the "expansion" text (most often a single character).
        result.delmask = every_end - lex.RefStart
        # Errors: no digit where a decimal body should begin, no hex digit
        # where a hex body should begin, or an end position that is not ";"
        # (which also covers a reference left unterminated at EOF).
        missing_decimal_digit = decimal_third & ~lex.Digit
        missing_hex_digit = hex_fourth & ~lex.Hex
        missing_semicolon = every_end & ~lex.Semicolon
        result.error = missing_decimal_digit | missing_hex_digit | missing_semicolon
        return result
279
def demo_refs(u8data):
        """Print the reference callout streams of u8data.

        References are parsed with an empty CtCDPI mask, and the error
        stream is rendered one position longer than the others.
        """
        width = len(u8data)
        bit, EOF_mask = bitutil.transpose_streams(u8data)
        u8, control, lex = byteclass.classify_bytes(bit)
        callouts = parse_refs(lex, 0)
        # (row label, callout attribute, rendered width)
        layout = (('entity refs', 'GenRefs', width),
                  ('decimal char refs', 'DecRefs', width),
                  ('hex char refs', 'HexRefs', width),
                  ('ref delmask', 'delmask', width),
                  ('errors', 'error', width + 1))
        rows = [('input data', u8data)]
        for label, field, field_width in layout:
                rows.append((label, bitutil.bitstream2string(getattr(callouts, field), field_width)))
        bitutil.print_aligned_u8_byte_streams(rows)
291
292
class tag_callouts:
        """Result record for start, empty-element and end tag parsing.

        Every field is a bitstream held as a Python integer; 0 is the empty
        stream.
        """
        ElemNames = 0
        AttNames = 0
        AttVals = 0
        Tags = 0
        # EmptyTagMarks is the field the tag parsing and demo code actually
        # assign and read; it was previously missing from this declaration.
        EmptyTagMarks = 0
        # Retained for backward compatibility with the earlier declaration.
        EmptyTagEnds = 0
        EndTags = 0
        error = 0
       
        # POTENTIAL ADDITIONAL FIELDS
        # StartTagEnds = 0
        # EmptyTagEnds = 0     
        # EndTagEnds = 0
306
def parse_tags(lex, CtCDPI_mask, EOF_mask):
        """Parse start, empty and end tags, calling out element names, attribute
        names and values, empty tag positions, and tag extents.

        Parameters:
          lex         -- lexical item streams; this function reads DQuote,
                         SQuote, LAngle, LAngle_scope, Slash, RAngle,
                         NameScan, WS and Equals.
          CtCDPI_mask -- positions to ignore when looking for tag openers.
          EOF_mask    -- mask ANDed into the quote scan classes.

        Returns a tag_callouts instance with ElemNames, AttNames, AttVals,
        EmptyTagMarks, Tags, EndTags and error assigned.

        >>> demo_tags("<root><t1>text</t1><t2 a1='foo' a2 = 'fie'>more</t2><tag3 att3='b'/></root>")
        input data      : <root><t1>text</t1><t2 a1='foo' a2 = 'fie'>more</t2><tag3 att3='b'/></root>
        element names   : _1111__11___________11_______________________________1111__________________
        attribute names : _______________________11_______11________________________1111_____________
        attribute values: __________________________11111______11111_____________________111_________
        empty tag marks : ___________________________________________________________________1_______
        end tags        : _______________111______________________________111__________________11111_
        start/empty tags: _1111__11___________1111111111111111111111___________11111111111111________
        errors          : ____________________________________________________________________________

        Attributes can use double quotes.

        >>> demo_tags('<dquote_atts a1="1234" attribute2="4321"/>')
        input data      : <dquote_atts a1="1234" attribute2="4321"/>
        element names   : _11111111111______________________________
        attribute names : _____________11________1111111111_________
        attribute values: ________________111111____________111111__
        empty tag marks : _________________________________________1
        end tags        : __________________________________________
        start/empty tags: _1111111111111111111111111111111111111111_
        errors          : ___________________________________________

        Syntax errors of various types are identified with the error stream.

        1. Element name missing errors.

        >>> demo_tags("< noname='flawed'/> ")
        input data      : < noname='flawed'/>
        element names   : ____________________
        attribute names : __111111____________
        attribute values: _________11111111___
        empty tag marks : __________________1_
        end tags        : ____________________
        start/empty tags: _11111111111111111__
        errors          : _1___________________

        2. Missing attribute names.

        >>> demo_tags("<noatt ='flawed'/>  <one_att a1='good' = 'bad'> oops </one_att>")
        input data      : <noatt ='flawed'/>  <one_att a1='good' = 'bad'> oops </one_att>
        element names   : _11111_______________1111111___________________________________
        attribute names : _____________________________11________________________________
        attribute values: ________11111111________________111111___11111_________________
        empty tag marks : _________________1_____________________________________________
        end tags        : ______________________________________________________11111111_
        start/empty tags: _1111111111111111____1111111111111111111111111_________________
        errors          : _______1_______________________________1________________________

        3. Missing or incorrect = sign.

        >>> demo_tags('<errata plusforeq+"5678" noequals"90" />')
        input data      : <errata plusforeq+"5678" noequals"90" />
        element names   : _111111_________________________________
        attribute names : ________111111111________11111111_______
        attribute values: __________________111111__________111111
        empty tag marks : ________________________________________
        end tags        : ________________________________________
        start/empty tags: _111111111111111111111111111111111111111
        errors          : _________________1_______________11______

        4.  Missing whitespace

        >>> demo_tags("<jammed att='value'att2='v2' />")
        input data      : <jammed att='value'att2='v2' />
        element names   : _111111________________________
        attribute names : ________111________1111________
        attribute values: ____________1111111_____1111___
        empty tag marks : ______________________________1
        end tags        : _______________________________
        start/empty tags: _11111111111111111111111111111_
        errors          : ___________________1____________

        5.  Extra whitespace in an empty tag.

        >>> demo_tags("<extrawhite / >")
        input data      : <extrawhite / >
        element names   : _1111111111____
        attribute names : _______________
        attribute values: _______________
        empty tag marks : _____________1_
        end tags        : _______________
        start/empty tags: _111111111111__
        errors          : _____________1__

        6.  Unterminated or incorrectly terminated attribute values

        >>> demo_tags("<badattvalues a='blud<   b='455>   ")
        input data      : <badattvalues a='blud<   b='455>   
        element names   : _111111111111______________________
        attribute names : ______________1__________1_________
        attribute values: ________________111111_____11111111
        empty tag marks : ___________________________________
        end tags        : ___________________________________
        start/empty tags: _111111111111111111111_111111111111
        errors          : _____________________11____________1

        7.  Unterminated tags

        >>> demo_tags("<unterminated a='245'  ")
        input data      : <unterminated a='245' 
        element names   : _111111111111__________
        attribute names : ______________1________
        attribute values: ________________11111__
        empty tag marks : _______________________
        end tags        : _______________________
        start/empty tags: _1111111111111111111111
        errors          : _______________________1

"""
        # NOTE(review): bitutil is not visible in this file.  Advance is
        # presumed to move marker bits one position forward and ScanThru to
        # move each marker past a run of the given character class -- confirm.
        callouts = tag_callouts()
       
        # Delimiters for scans.
        # A quoted value is scanned up to the matching quote or a "<".
        DQuoteScan = ~(lex.DQuote | lex.LAngle) & EOF_mask
        SQuoteScan = ~(lex.SQuote | lex.LAngle) & EOF_mask
        AttListDelim = lex.Slash | lex.RAngle
       
        # Start the parallel parsing by inspecting the character
        # after the opening "<" of a tag.
        LAngleFollow = lex.LAngle_scope &~ CtCDPI_mask
        ElemNamePositions = LAngleFollow & ~lex.Slash
        EndTagSeconds = LAngleFollow & lex.Slash
       
        # Start Tag/Empty Element Tag Parsing

        # Advance all cursors by scanning through the tag name.
        ElemNameFollows = bitutil.ScanThru(ElemNamePositions, lex.NameScan)
        # Must have at least one name character for a legal start tag.
        # Mark any occurrences of null names as errors.
        ParseError = ElemNamePositions & ElemNameFollows
        callouts.ElemNames = ElemNameFollows - ElemNamePositions
       
        # Initialize the accumulators for attribute name and value positions.
        AttNameStarts = 0 
        AttNameFollows = 0
        EqToCheck = 0
        AttValStarts = 0
        AttValEnds = 0
        AttValFollows = 0

        # After the element name, there may or may not be an attlist.
        AfterWS = bitutil.ScanThru(ElemNameFollows, lex.WS)
        AttListEnd = AfterWS & AttListDelim
        AttNameStart = AfterWS & ~AttListDelim
        # At least one WS character is required between ElemNames and AttNames.
        ParseError |= ElemNameFollows & AttNameStart

        #
        # The following loop iterates through attributes within a start tag.
        # Because all start tags are processed in parallel, the number of
        # iterations is the maximum number of attributes found in any one
        # start tag, plus one.
        while AttNameStart:
                AttNameStarts |= AttNameStart
                AttNameFollow = bitutil.ScanThru(AttNameStart, lex.NameScan)
                AttNameFollows |= AttNameFollow
                # Scan through WS to the expected '=' delimiter.
                EqExpected = bitutil.ScanThru(AttNameFollow, lex.WS)
                EqToCheck |= EqExpected
                # Step over the expected '=' and any WS to the opening quote.
                AttValPos = bitutil.ScanThru(bitutil.Advance(EqExpected), lex.WS)
                AttValStarts |= AttValPos
                # Scan each value with the scan class matching its quote kind;
                # cursors not on a quote are dropped here and are reported
                # through the AttValStarts check below.
                DQuoteAttVal = AttValPos & lex.DQuote
                SQuoteAttVal = AttValPos & lex.SQuote
                DQuoteAttEnd = bitutil.ScanThru(bitutil.Advance(DQuoteAttVal), DQuoteScan)
                SQuoteAttEnd = bitutil.ScanThru(bitutil.Advance(SQuoteAttVal), SQuoteScan)
                AttValEnd = DQuoteAttEnd | SQuoteAttEnd
                AttValEnds |= AttValEnd
                AttValFollow = bitutil.Advance(AttValEnd)
                AttValFollows |= AttValFollow
                # After the value, either the attlist ends or another
                # attribute name starts; the latter drives the next iteration.
                AfterWS = bitutil.ScanThru(AttValFollow, lex.WS)
                AttListEnd |= AfterWS & AttListDelim
                AttNameStart = AfterWS & ~AttListDelim

        # No more attribute values to process when AttNameStart == 0.

        # Subtracting start markers from follow markers yields runs of 1 bits
        # from each start up to, but not including, its follow position.
        callouts.AttNames = AttNameFollows - AttNameStarts
        callouts.AttVals = AttValFollows - AttValStarts
        STagEnds = AttListEnd & lex.RAngle
        # Mark any "/" characters found as the ends of empty element tags.
        # (EmptyTagMarks is assigned on the instance here; the marker is
        # placed one step beyond the "/".)
        callouts.EmptyTagMarks = bitutil.Advance(AttListEnd & lex.Slash)
        callouts.Tags = (STagEnds | callouts.EmptyTagMarks) - ElemNamePositions

        # Check for errors.
        ParseError |= AttValFollows & AttNameStarts # No intervening WS.
        ParseError |= AttNameStarts & AttNameFollows # Null AttName
        ParseError |= EqToCheck & ~lex.Equals # = not found where expected.
        # Attribute values must open and close on a quote character.
        ParseError |= AttValStarts & ~ (lex.DQuote | lex.SQuote)
        ParseError |= AttValEnds & ~ (lex.DQuote | lex.SQuote)
        # The position after the "/" of an empty element tag must be ">".
        ParseError |= callouts.EmptyTagMarks & ~lex.RAngle

        # End Tag Parsing
        # Step past the "/", scan the name and any trailing WS; the resulting
        # position must be ">".
        EndTagEnds = bitutil.ScanThru(bitutil.ScanThru(bitutil.Advance(EndTagSeconds), lex.NameScan), lex.WS)
        ParseError |= EndTagEnds & ~lex.RAngle
        callouts.EndTags = EndTagEnds - EndTagSeconds
        callouts.error = ParseError

        # POTENTIAL ADDITIONAL FIELDS
        # callouts.StartTagEnds = STagEnds
        # callouts.EmptyTagEnds = bitutil.Advance(callouts.EmptyTagMarks)
        # callouts.EndTagEnds = EndTagEnds
       
        return callouts
512
def demo_tags(u8data):
        """Print the tag callout streams of u8data aligned under the input.

        Comments, CDATA sections and PIs are parsed first so that their
        mask can be passed to the tag parser.  The error stream is rendered
        one position longer than the others.
        """
        width = len(u8data)
        bit, EOF_mask = bitutil.transpose_streams(u8data)
        u8, control, lex = byteclass.classify_bytes(bit)
        lex = add_multiliterals(lex)
        markup1 = parse_CtCDPI(lex, EOF_mask)
        callouts = parse_tags(lex, markup1.CtCDPI_mask, EOF_mask)
        # (row label, callout attribute, rendered width)
        layout = (('element names', 'ElemNames', width),
                  ('attribute names', 'AttNames', width),
                  ('attribute values', 'AttVals', width),
                  ('empty tag marks', 'EmptyTagMarks', width),
                  ('end tags', 'EndTags', width),
                  ('start/empty tags', 'Tags', width),
                  ('errors', 'error', width + 1))
        rows = [('input data', u8data)]
        for label, field, field_width in layout:
                rows.append((label, bitutil.bitstream2string(getattr(callouts, field), field_width)))
        bitutil.print_aligned_u8_byte_streams(rows)
528
529
530
def validate_no_CD_end(lex, markup1, tags):
        """Return the ]]> occurrences lying outside all markup (illegal in text).

        A CD_end position is reported unless it is covered by
        markup1.CtCDPI_mask or by tags.Tags.

        >>> demo_validate_no_CD_end(' <!-- OK: ]]>  --> <![CDATA OK  ]]>  ]]> <tag att=" ]]> "/> ]]>  <?php ]]> ?> ')
        input data :  <!-- OK: ]]>  --> <![CDATA OK  ]]>  ]]> <tag att=" ]]> "/> ]]>  <?php ]]> ?>
        CtCDPI_mask: __1111111111111111__111111111111111_______________________________11111111111_
        tags       : __________________________________________1111111111111111____________________
        illegal ]]>: _______________________________________1______________________1_______________
"""
        outside_ctcdpi = ~markup1.CtCDPI_mask
        outside_tags = ~tags.Tags
        return lex.CD_end & outside_ctcdpi & outside_tags
542
def demo_validate_no_CD_end(u8data):
        """Print the CtCDPI mask, the tag extents and the illegal ]]> stream of u8data."""
        width = len(u8data)
        bit, EOF_mask = bitutil.transpose_streams(u8data)
        u8, control, lex = byteclass.classify_bytes(bit)
        lex = add_multiliterals(lex)
        markup1 = parse_CtCDPI(lex, EOF_mask)
        tags = parse_tags(lex, markup1.CtCDPI_mask, EOF_mask)
        error = validate_no_CD_end(lex, markup1, tags)
        rows = [('input data', u8data)]
        rows.append(('CtCDPI_mask', bitutil.bitstream2string(markup1.CtCDPI_mask, width)))
        rows.append(('tags', bitutil.bitstream2string(tags.Tags, width)))
        rows.append(('illegal ]]>', bitutil.bitstream2string(error, width)))
        bitutil.print_aligned_u8_byte_streams(rows)
555
556
557
558def main(u8data):
559        # Transpose to parallel bit streams and prepare an EOF mask.
560        (bit, EOF_mask) = bitutil.transpose_streams(u8data)
561
562        # Classify bytes for UTF-8 processing, whitespace and control
563        # processing and XML lexical analysis.
564        (u8, control, lex) = byteclass.classify_bytes(bit)
565        #(u8, control, lex) = byteclass.classify_bytes_with_shift1opt(bit)
566
567        # Validate UTF-8 multibyte sequences and determine the UTF-8 scope streams.
568        u8 = u8u16.validate_utf8(u8)
569
570        # Rule out the illegal characters for XML.
571        xmlchar_error = validate_xmlchar(u8, control, lex, EOF_mask)
572
573        # Find and normalize bare CR or CRLF combinations.
574        # Not needed for xmlwf
575        #(control, bit) = normalize_line_breaks(control, bit)
576
577        # Compute XML multilterals such as <?, </, --, ]]>.
578        lex = add_multiliterals(lex)
579       
580       
581        # THE FOLLOWING FUNCTIONAL CALL IS MANUALLY INLINED
582        # Parse all comments, CDATA sections and processing instructions.
583        #markup1 = parse_CtCDPI(lex, EOF_mask)
584        #CT_callouts = CtCDPI_callouts()
585        PI_starts = 0
586        PI_ends = 0
587        Ct_starts = 0
588        Ct_ends = 0
589        CD_starts = 0
590        CD_ends = 0
591        CtCDPI_starts = 0
592        PI_name_ends = 0
593        # Scanning streams
594        CtCDPI_scan = ~(lex.CtCD_start | lex.PI_start) & EOF_mask
595        Ct_end_scan = ~lex.DoubleHyphen & EOF_mask
596        CD_end_scan = ~lex.CD_end & EOF_mask
597        PI_end_scan = ~lex.PI_end & EOF_mask
598        #
599        # Initiate the scan
600        CtCDPI_Cursor = 1
601        CtCDPI_Cursor = bitutil.ScanThru(CtCDPI_Cursor, CtCDPI_scan)
602        CtCDPI_Cursor &= EOF_mask
603        while CtCDPI_Cursor:
604                CtCDPI_starts |= CtCDPI_Cursor
605                PI_Cursor = CtCDPI_Cursor & lex.PI_start
606                CD_Ct_Cursor = bitutil.Advance(CtCDPI_Cursor & ~PI_Cursor)
607                CD_Cursor = CD_Ct_Cursor & lex.LBracket
608                Ct_Cursor = bitutil.Advance(CD_Ct_Cursor & lex.Hyphen) 
609                PI_starts |= PI_Cursor
610                CD_starts |= CD_Cursor
611                Ct_starts |= Ct_Cursor
612                Ct_Cursor = bitutil.Advance(Ct_Cursor)
613                Ct_end_scan |= Ct_Cursor
614                #PI_Cursor = bitutil.ScanThru(PI_Cursor, PI_end_scan)
615                PI_name_end = bitutil.ScanThru( bitutil.Advance(PI_Cursor), lex.NameScan)
616                PI_name_ends |= PI_name_end
617                CT_callouts.PI_name |= PI_name_end - PI_Cursor
618                PI_Cursor = bitutil.ScanThru(PI_name_end, PI_end_scan)
619                CD_Cursor = bitutil.ScanThru(CD_Cursor, CD_end_scan)
620                Ct_Cursor = bitutil.Advance(bitutil.ScanThru(Ct_Cursor, Ct_end_scan))
621                PI_ends |= PI_Cursor
622                CD_ends |= CD_Cursor
623                Ct_ends |= Ct_Cursor
624                CtCDPI_Cursor = PI_Cursor | CD_Cursor | Ct_Cursor
625                CtCDPI_Cursor = bitutil.ScanThru(CtCDPI_Cursor, CtCDPI_scan)
626                CtCDPI_Cursor &= EOF_mask
627        # End of loop: no remaining CtCDPI_Cursor
628        #Not needed for xmlwf
629        #CT_callouts.CD_span = CD_ends - CD_starts
630        #CT_callouts.Ct_span = Ct_ends - Ct_starts
631        #CT_callouts.PI_span = PI_ends - PI_starts
632       
633        CT_callouts.CtCDPI_mask = bitutil.Advance(CD_ends | Ct_ends | PI_ends) - CtCDPI_starts
634        CT_callouts.error = Ct_ends & ~lex.RAngle | Ct_starts & ~ lex.Hyphen
635        CT_callouts.error |= bitutil.Advance(PI_name_ends & ~ lex.WS) & ~ lex.PI_end
636        # If any of the Comment, CDATA or PI markups are unterminated, it is an error.
637        CT_callouts.error |= CT_callouts.CtCDPI_mask &~ EOF_mask
638        ########## END OF MANUAL INLINING
639       
640        # THE FOLLOWING FUNCTIONAL CALL IS MANUALLY INLINED
641        # All remaining "<" must be tag start characters; parse tags.
642        #tags = parse_tags(lex, CT_callouts.CtCDPI_mask, EOF_mask)
643
644        #callouts = tag_callouts()
645       
646        # Delimiters for scans.
647        DQuoteScan = ~(lex.DQuote | lex.LAngle) & EOF_mask
648        SQuoteScan = ~(lex.SQuote | lex.LAngle) & EOF_mask
649        AttListDelim = lex.Slash | lex.RAngle
650       
651        # Start the parallel parsing by inspecting the character
652        # after the opening "<" of a tag.
653        LAngleFollow = bitutil.Advance(lex.LAngle) &~ CT_callouts.CtCDPI_mask
654        ElemNamePositions = LAngleFollow & ~lex.Slash
655        EndTagSeconds = LAngleFollow & lex.Slash
656       
657        # Start Tag/Empty Element Tag Parsing
658
659        # Advance all cursors by scanning through the tag name.
660        ElemNameFollows = bitutil.ScanThru(ElemNamePositions, lex.NameScan)
661        # Must have at least one name character for a legal start tag.
662        # Mark any occurrences of null names as errors.
663        ParseError = ElemNamePositions & ElemNameFollows
664        callouts.ElemNames = ElemNameFollows - ElemNamePositions
665       
666        # Initialize the accumulators for attribute name and value positions.
667        AttNameStarts = 0 
668        AttNameFollows = 0
669        EqToCheck = 0
670        AttValStarts = 0
671        AttValEnds = 0
672        AttValFollows = 0
673
674        # After the element name, there may or may not be an attlist.
675        AfterWS = bitutil.ScanThru(ElemNameFollows, lex.WS)
676        AttListEnd = AfterWS & AttListDelim
677        AttNameStart = AfterWS & ~AttListDelim
678        # At least one WS character is required between ElemNames and AttNames.
679        ParseError |= ElemNameFollows & AttNameStart
680
681        #
682        # The following loop iterates through attributes within a start tag.
683        # Because all start tags are processed in parallel, the number of
684        # iterations is the maximum number of attributes found in any one
685        # start tag, plus one.
686        while AttNameStart:
687                AttNameStarts |= AttNameStart
688                AttNameFollow = bitutil.ScanThru(AttNameStart, lex.NameScan)
689                AttNameFollows |= AttNameFollow
690                # Scan through WS to the expected '=' delimiter.
691                EqExpected = bitutil.ScanThru(AttNameFollow, lex.WS)
692                EqToCheck |= EqExpected
693                AttValPos = bitutil.ScanThru(bitutil.Advance(EqExpected), lex.WS)
694                AttValStarts |= AttValPos
695                DQuoteAttVal = AttValPos & lex.DQuote
696                SQuoteAttVal = AttValPos & lex.SQuote
697                DQuoteAttEnd = bitutil.ScanThru(bitutil.Advance(DQuoteAttVal), DQuoteScan)
698                SQuoteAttEnd = bitutil.ScanThru(bitutil.Advance(SQuoteAttVal), SQuoteScan)
699                AttValEnd = DQuoteAttEnd | SQuoteAttEnd
700                AttValEnds |= AttValEnd
701                AttValFollow = bitutil.Advance(AttValEnd)
702                AttValFollows |= AttValFollow
703                AfterWS = bitutil.ScanThru(AttValFollow, lex.WS)
704                AttListEnd |= AfterWS & AttListDelim
705                AttNameStart = AfterWS & ~AttListDelim
706
707        # No more attribute values to process when AttNameStart == 0.
708        # Not needed for xmlwf
709        callouts.AttNames = AttNameFollows - AttNameStarts
710        #callouts.AttVals = AttValFollows - AttValStarts
711        STagEnds = AttListEnd & lex.RAngle
712        # Mark any "/" characters found as the ends of empty element tags.
713        callouts.EmptyTagMarks = bitutil.Advance(AttListEnd & lex.Slash)
714        # Not needed for xmlwf
715        #callouts.Tags = (STagEnds | callouts.EmptyTagMarks) - ElemNamePositions
716       
717        # Check for errors.
718        ParseError |= AttValFollows & AttNameStarts # No intervening WS.
719        ParseError |= AttNameStarts & AttNameFollows # Null AttName
720        ParseError |= EqToCheck & ~lex.Equals # = not found where expected.
721        ParseError |= AttValStarts & ~ (lex.DQuote | lex.SQuote)
722        ParseError |= AttValEnds & ~ (lex.DQuote | lex.SQuote)
723        ParseError |= callouts.EmptyTagMarks & ~lex.RAngle
724
725        # End Tag Parsing
726        EndTagEnds = bitutil.ScanThru(bitutil.ScanThru(bitutil.Advance(EndTagSeconds), lex.NameScan), lex.WS)
727        ParseError |= EndTagEnds & ~lex.RAngle
728        # Not needed for xmlwf
729        #callouts.EndTags = EndTagEnds - EndTagSeconds
730        callouts.error = ParseError
731        ########## END OF MANUAL INLINING
732
733
734
735
736        # All remaining "&" must be reference start characters; parse them.
737        # INLINED: refs = parse_refs(lex, CT_callouts.CtCDPI_mask)
738        refs = ref_callouts()
739        Ref2 = lex.RefStart_scope &~ CT_callouts.CtCDPI_mask
740        if Ref2:
741                NumRef2 = Ref2 & lex.Hash
742                GenRef2 = Ref2 &~ lex.Hash
743                NumRef3 = bitutil.Advance(NumRef2)
744                HexRef3 = NumRef3 & lex.x
745                DecRef3 = NumRef3 &~ lex.x
746                HexRef4 = bitutil.Advance(HexRef3) 
747                GenRefEnds = bitutil.ScanThru(GenRef2, lex.NameScan)
748                DecRefEnds = bitutil.ScanThru(DecRef3, lex.Digit)
749                HexRefEnds = bitutil.ScanThru(HexRef4, lex.Hex)
750                # Error checks
751                # At least one digit required for DecRef, one hex digit for HexRef.
752                error1 = DecRef3 &~ lex.Digit
753                error2 = HexRef4 &~ lex.Hex
754                # Semicolon terminator required (also covers unterminated at EOF).
755                error3 = (GenRefEnds | DecRefEnds | HexRefEnds) &~ lex.Semicolon
756                refs.GenRefs = GenRefEnds - GenRef2
757                refs.DecRefs = DecRefEnds - DecRef3
758                refs.HexRefs = HexRefEnds - HexRef4
759                # Mark references for deletion, but leave the trailing semicolon as
760                # the point for insertion of the "expansion" text (most often a
761                # single character).
762                #refs.delmask = (GenRefEnds | DecRefEnds | HexRefEnds) - lex.RefStart
763                refs.error = error1 | error2 | error3
764
765
766        # Ensure that no occurrence of ]]> occurs outside of markup.
767        CD_end_error = validate_no_CD_end(lex, CT_callouts, callouts)
768
769        # Convert to UTF-16 bit streams.
770        #(u16hi, u16lo, u16delmask) = u8u16.u8u16(u8, bit)
771
772        # Consolidate and check for errors
773        error_mask = u8.error | xmlchar_error | CT_callouts.error | callouts.error | CD_end_error | refs.error
774
775        # Consolidate the deletion_masks
776        #delmask = control.CRLF | refs.delmask | u16delmask # | CT_callouts.CDATA_delimiters
777        #Not needed for xmlwf
778        delmask = control.CRLF | refs.delmask  # | CT_callouts.CDATA_delimiters
779       
780        qname_stream =  callouts.ElemNames | callouts.AttNames
781        ncname_stream = CT_callouts.PI_name | refs.GenRefs
782        name_stream = qname_stream | ncname_stream
783        name_start = name_stream &~ bitutil.Advance(name_stream)
784        name_start_check = name_start & ~lex.ASCII_name_start
785        name_check = (name_stream &~ name_start | nmtoken_stream) & ~lex.ASCII_name_char & ~u8.suffix
786
787        #return (CT_callouts, callouts, refs, u16hi, u16lo, delmask, error, lex, u16delmask, EOF_mask)
788        return (CT_callouts, callouts, refs, delmask, error, lex, EOF_mask, name_check, name_start_check, control)
789
def demo_parabix(u8data):
        """Run parabix_parse on u8data and print the resulting streams.

        Prints the input bytes and their high/low nybbles, followed by one
        row per bitstream taken from the parse results (markup spans and
        mask, references, tag callouts, deletion mask, errors).  Each
        bitstream is rendered with bitutil.bitstream2string and the rows
        are laid out by bitutil.print_aligned_u8_byte_streams.

        u8data -- the input data; len() is applied to it to size the rows.

        Returns None.
        """
        lgth = len(u8data)

        # Unpack in the order of the live 10-tuple return:
        #   (CT callouts, tag callouts, refs, delmask, error, lex, EOF_mask,
        #    name_check, name_start_check, control)
        # The previous unpacking followed the retired UTF-16 return
        # (..., u16hi, u16lo, delmask, error, lex, u16delmask, EOF_mask), so
        # delmask, error and u16delmask were bound to the wrong elements.
        # NOTE(review): assumes the return statement just above this function
        # is parabix_parse's -- confirm.
        (markup1, tags, refs, delmask, error, _lex, _EOF_mask,
         _name_check, _name_start_check, _control) = parabix_parse(u8data)

        def as_text(stream, width=lgth):
                # Render one bitstream as a printable row of the given width.
                return bitutil.bitstream2string(stream, width)

        def optional(holder, field):
                # The assignments of these callouts are commented out in the
                # parser ("Not needed for xmlwf"), so the attribute may be
                # missing; fall back to an empty stream instead of raising.
                return as_text(getattr(holder, field, 0))

        rows = [
                ('input data', u8data),
                ('input high nybbles', bitutil.high_nybble_stream(u8data)),
                ('input low nybbles', bitutil.low_nybble_stream(u8data)),
                ('CD_span', optional(markup1, 'CD_span')),
                ('Ct_span', optional(markup1, 'Ct_span')),
                ('PI_span', optional(markup1, 'PI_span')),
                ('CtCDPI_mask', as_text(markup1.CtCDPI_mask)),
                ('entity refs', as_text(refs.GenRefs)),
                ('decimal char refs', as_text(refs.DecRefs)),
                ('hex char refs', as_text(refs.HexRefs)),
                ('element names', as_text(tags.ElemNames)),
                ('attribute names', as_text(tags.AttNames)),
                ('attribute values', optional(tags, 'AttVals')),
                ('empty tag marks', as_text(tags.EmptyTagMarks)),
                ('end tags', optional(tags, 'EndTags')),
                ('start/empty tags', optional(tags, 'Tags')),
                ('delmask', as_text(delmask)),
                # One extra position so an error at end of input is visible.
                ('errors', as_text(error, lgth + 1)),
        ]
        bitutil.print_aligned_u8_byte_streams(rows)
814
def demo_u16delmask(u8data):
        """Print the UTF-16 deletion mask and the UTF-8 error stream for u8data.

        Transposes u8data to parallel bit streams, classifies the bytes,
        validates the UTF-8 sequences, converts to UTF-16 bit streams,
        rebuilds and decodes a UTF-16 byte buffer, and finally prints the
        input aligned with the 'u16delmask' and 'errors' rows.

        Returns None.
        """
        byte_count = len(u8data)

        # Parallel bit streams for the input; the EOF mask is not used here.
        basis_bits, _eof_mask = bitutil.transpose_streams(u8data)

        # Only the UTF-8 classification is used below; the control and
        # lexical classifications are discarded.
        u8_classes, _control, _lex = byteclass.classify_bytes(basis_bits)

        # Validate UTF-8 multibyte sequences and determine the scope streams.
        u8_checked = u8u16.validate_utf8(u8_classes)

        # UTF-16 high/low bit streams plus the mask passed to filter_bytes.
        u16_hi_bits, u16_lo_bits, u16_delmask = u8u16.u8u16(u8_checked, basis_bits)

        # Inverse transpose each half back to bytes, filtering with the mask.
        hi_bytes = bitutil.filter_bytes(
                bitutil.inverse_transpose(u16_hi_bits, byte_count), u16_delmask)
        lo_bytes = bitutil.filter_bytes(
                bitutil.inverse_transpose(u16_lo_bits, byte_count), u16_delmask)

        # Build the UTF-16 buffer and decode it.  The decoded text is not
        # used afterwards; the call is retained so behaviour is unchanged.
        u16_buffer = bitutil.merge_bytes(lo_bytes, hi_bytes)
        u16_buffer.decode('utf16')

        rows = [
                ('input data', u8data),
                ('u16delmask', bitutil.bitstream2string(u16_delmask, byte_count)),
                ('errors', bitutil.bitstream2string(u8_checked.error, byte_count + 1)),
        ]
        bitutil.print_aligned_u8_byte_streams(rows)
845
if __name__ == "__main__":
        # Script entry point: run the module doctests, then, when a file name
        # is given on the command line, read that file and run the demo on it;
        # otherwise print a usage line.
        import doctest
        # The file-level "import sys" is commented out, so sys.argv below
        # would raise NameError without this local import.
        import sys

        doctest.testmod()

        # NOTE(review): bitutil is used here (and throughout the file) but its
        # file-level import is commented out as well -- confirm how it is
        # meant to be brought into scope when running this file as a script.
        if len(sys.argv) > 1:
                u8data = bitutil.readfile(sys.argv[1])
                # Alternative demos that can be enabled instead:
#               demo_validate_xmlchar(u8data)
#               demo_line_breaks(u8data)
#               demo_multiliterals(u8data)
#               demo_CtCDPI(u8data)
#               demo_refs(u8data)
#               demo_tags(u8data)
#               demo_validate_no_CD_end(u8data)
#               demo_u16delmask(u8data)
                demo_parabix(u8data)
        else:
                print("Usage: python parabix2.py <file>")
864               
865 
866       
867       
Note: See TracBrowser for help on using the repository browser.