source: proto/parabix2/parabix2_compilable.py @ 452

Last change on this file since 452 was 448, checked in by cameron, 9 years ago

Revert shift1opt

File size: 32.3 KB
RevLine 
[409]1# -*- coding: utf-8 -*-
2#
3# parabix2.py
4#
5# Parallel XML Parsing with Bitstream Addition
6# - Complete prototype for all bitstream computations in Parabix2
7#
8# Robert D. Cameron
9# August 20, 2009
10#
11#----------------------------------------------------------------------------
12#
13# We use python's unlimited precision integers for unbounded bit streams.
14# This permits simple logical operations on the entire stream.
15# Assumption: bitstreams are little-endian (e.g., as on x86).
16#
17#----------------------------------------------------------------------------
18#
19
20
21#import bitutil
22
23import byteclass
24
25import u8u16
26
27#import sys
28
def validate_xmlchar(u8, control, lex, EOF_mask):
        r"""Return an error bitstream marking characters that are illegal in XML.

        Two kinds of illegal character are flagged:
        (1) control characters in the range 0x00-0x1F, except HT, LF and CR
            (excluded here through the lex.WS stream);
        (2) U+FFFF and U+FFFE, whose UTF-8 encodings are 0xEF 0xBF 0xBF and
            0xEF 0xBF 0xBE.  As the example shows, the mark is placed on the
            third byte of the sequence.

        The control-character term is restricted to EOF_mask; the
        0xEF 0xBF term is not.

        >>> demo_validate_xmlchar('plaintext (good: \x09) (bad: \x03) (bad \xEF\xBF\xBF) (good \xEF\xBF\xBC)')
        input high nybbles: 7666676772266663202226663202226662ebb22266662ebb2
        input low nybbles : 0c19e4584087ff4a09908214a039082140fff9087ff40ffc9
        illegal XML chars : __________________________1_________1_____________
        """
        # Cursor on the byte that follows an 0xEF 0xBF prefix.
        after_EF_BF = bitutil.Advance(u8.xEF_scope & u8.xBF)
        # A final 0xBE or 0xBF completes the encoding of U+FFFE / U+FFFF.
        bad_noncharacter = after_EF_BF & (u8.xBE | u8.xBF)
        # Non-whitespace control bytes that lie within the input.
        bad_control = control.x00_x1F & ~lex.WS & EOF_mask
        return bad_noncharacter | bad_control
42
43
def demo_validate_xmlchar(u8data):
        """Doctest driver for validate_xmlchar.

        Transposes and classifies u8data, then prints its high and low
        nybbles aligned with the illegal-character stream.  The error
        stream is rendered one position wider than the input.
        """
        width = len(u8data) + 1
        (bit, EOF_mask) = bitutil.transpose_streams(u8data)
        (u8, control, lex) = byteclass.classify_bytes(bit)
        high_row = ('input high nybbles', bitutil.high_nybble_stream(u8data))
        low_row = ('input low nybbles', bitutil.low_nybble_stream(u8data))
        illegal = validate_xmlchar(u8, control, lex, EOF_mask)
        error_row = ('illegal XML chars', bitutil.bitstream2string(illegal, width))
        bitutil.print_aligned_streams([high_row, low_row, error_row])
51
def normalize_line_breaks(control, bit):
        r"""Rewrite every CR as LF and record LFs that directly follow a CR.

        control.CRLF is set to the LF positions that fall in control.CR_scope;
        those are the positions to be deleted later.  The streams bit[5],
        bit[6] and bit[7] are modified in place.  Returns the same
        (control, bit) objects that were passed in.

        >>> demo_line_breaks('ab \r\n  cd \r  ef \r ')
        input high nybbles: 662002266202266202
        input low nybbles : 120da00340d00560d0
        CR                : ___1______1_____1_
        LF                : ____1_____________
        CRLF              : ____1_____________
        """
        control.CRLF = control.CR_scope & control.LF
        # CR is 0x0D and LF is 0x0A: they differ in exactly the three bits
        # held in bit[5], bit[6] and bit[7], so xor-ing the CR stream into
        # those three streams turns each CR into an LF.
        for stream_index in (5, 6, 7):
                bit[stream_index] ^= control.CR
        return (control, bit)
68
def demo_line_breaks(u8data):
        """Doctest driver for normalize_line_breaks.

        Prints the nybbles of u8data aligned with the CR, LF and CRLF
        streams obtained after line-break normalization.
        """
        lgth = len(u8data)
        (bit, EOF_mask) = bitutil.transpose_streams(u8data)
        (_u8, control, _lex) = byteclass.classify_bytes(bit)
        (control, bit) = normalize_line_breaks(control, bit)
        rows = [('input high nybbles', bitutil.high_nybble_stream(u8data)),
                ('input low nybbles', bitutil.low_nybble_stream(u8data))]
        for label in ('CR', 'LF', 'CRLF'):
                # Each label is also the name of the stream on control.
                rows.append((label, bitutil.bitstream2string(getattr(control, label), lgth)))
        bitutil.print_aligned_u8_byte_streams(rows)
79
80
81
82
83
def add_multiliterals(lex):
        """Add streams for important multibyte literals to lex and return it.

        Each new stream marks the last character of its literal (see the
        example): PI_start for "<?", CtCD_start for "<!", EndTag_start for
        "</", CD_end for "]]>", DoubleHyphen for "--" and PI_end for "?>".

        >>> demo_multiliterals("  <?php?>  <!--  -->  <![CDATA[  ]]> ")
        input data  :   <?php?>  <!--  -->  <![CDATA[  ]]>
        PI_start    : ___1_________________________________
        CtCD_start  : ____________1__________1_____________
        EndTag_start: _____________________________________
        CD_end      : ___________________________________1_
        DoubleHyphen: ______________1___1__________________
        PI_end      : ________1____________________________
        """
        # Two-character literals that open with "<".
        after_langle = lex.LAngle_scope
        lex.PI_start = after_langle & lex.QMark
        lex.CtCD_start = after_langle & lex.Exclam
        lex.EndTag_start = after_langle & lex.Slash
        # "]]>": find the second "]" of a pair, step one position, require ">".
        second_rbracket = lex.RBracket_scope & lex.RBracket
        lex.CD_end = bitutil.Advance(second_rbracket) & lex.RAngle
        lex.DoubleHyphen = lex.Hyphen_scope & lex.Hyphen
        lex.PI_end = lex.QMark_scope & lex.RAngle
        return lex
105
def demo_multiliterals(u8data):
        """Doctest driver for add_multiliterals.

        Prints u8data aligned with each of the multiliteral streams.
        """
        lgth = len(u8data)
        (bit, EOF_mask) = bitutil.transpose_streams(u8data)
        (u8, control, lex) = byteclass.classify_bytes(bit)
        lex = add_multiliterals(lex)
        rows = [('input data', u8data)]
        # Each printed label is also the attribute name of the stream on lex.
        for label in ('PI_start', 'CtCD_start', 'EndTag_start',
                      'CD_end', 'DoubleHyphen', 'PI_end'):
                rows.append((label, bitutil.bitstream2string(getattr(lex, label), lgth)))
        bitutil.print_aligned_u8_byte_streams(rows)
118
class CtCDPI_callouts:
        """Bitstreams called out by parse_CtCDPI.

        Every field defaults to 0, the empty bitstream.
        """
        CD_span = 0      # extent of CDATA sections
        Ct_span = 0      # extent of comments
        # parse_CtCDPI assigns PI_span and demo_CtCDPI reads it, so it is
        # declared here with a default like the other spans.
        PI_span = 0      # extent of processing instructions
        # Kept for backward compatibility; parse_CtCDPI does not assign it.
        PI_mask = 0
        CtCDPI_mask = 0  # all comment/CDATA/PI markup
        error = 0        # error positions
125       
def parse_CtCDPI(lex, EOF_mask):
        """Parse all comments, CDATA sections and processing instructions.

        Returns a CtCDPI_callouts object whose span streams mark the extent
        of these markup items, excluding initial and final bracketting,
        whose CtCDPI_mask covers the markup as a whole, and whose error
        stream marks malformed or unterminated items.

        The items are processed one after another: a single cursor is moved
        from the start of one item to its end and then scanned forward to
        the start of the next.

        >>> demo_CtCDPI(' <?php?>  <!-- example -->  <![CDATA[  shift: a<<1 ]]> ')
        input data :  <?php?>  <!-- example -->  <![CDATA[  shift: a<<1 ]]>
        CD_span    : ______________________________11111111111111111111111__
        Ct_span    : _____________111111111111______________________________
        PI_span    : __11111________________________________________________
        CtCDPI_mask: __111111___111111111111111___1111111111111111111111111_
        error      : ________________________________________________________

        Comments are terminated by double-hyphen; immediately require closing ">".

        >>> demo_CtCDPI(' <!--  <?php?>  --   <!-- -->')
        input data :  <!--  <?php?>  --   <!-- -->
        CD_span    : _____________________________
        Ct_span    : ____11111111111111______1111_
        PI_span    : _____________________________
        CtCDPI_mask: __11111111111111111___1111111
        error      : __________________1___________
        """
        result = CtCDPI_callouts()

        # Accumulated start and end positions for each kind of item.
        pi_opens = pi_closes = 0
        ct_opens = ct_closes = 0
        cd_opens = cd_closes = 0
        all_opens = 0

        # Positions each scan may pass through (everything except the
        # delimiter being looked for, limited to the input).
        to_next_item = ~(lex.CtCD_start | lex.PI_start) & EOF_mask
        inside_comment = ~lex.DoubleHyphen & EOF_mask
        inside_cdata = ~lex.CD_end & EOF_mask
        inside_pi = ~lex.PI_end & EOF_mask

        # Start at the first position and find the first "<!" or "<?".
        cursor = bitutil.ScanThru(1, to_next_item) & EOF_mask
        while cursor:
                all_opens |= cursor
                # Classify the item: PI, or else CDATA ("<![") / comment ("<!-").
                pi_cursor = cursor & lex.PI_start
                after_bang = bitutil.Advance(cursor & ~pi_cursor)
                cd_cursor = after_bang & lex.LBracket
                ct_cursor = bitutil.Advance(after_bang & lex.Hyphen)
                pi_opens |= pi_cursor
                cd_opens |= cd_cursor
                ct_opens |= ct_cursor
                # Step past the second "-" of "<!--".  That next position is
                # added to the scan-through set, so a DoubleHyphen mark that
                # sits there is passed over rather than stopping the scan.
                ct_cursor = bitutil.Advance(ct_cursor)
                inside_comment |= ct_cursor
                # Move each cursor to the end of its item.
                pi_cursor = bitutil.ScanThru(pi_cursor, inside_pi)
                cd_cursor = bitutil.ScanThru(cd_cursor, inside_cdata)
                ct_cursor = bitutil.Advance(bitutil.ScanThru(ct_cursor, inside_comment))
                pi_closes |= pi_cursor
                cd_closes |= cd_cursor
                ct_closes |= ct_cursor
                # Continue from the end of this item to the start of the next.
                cursor = bitutil.ScanThru(pi_cursor | cd_cursor | ct_cursor, to_next_item) & EOF_mask
        # End of loop: no remaining cursor.

        result.CD_span = cd_closes - cd_opens
        result.Ct_span = ct_closes - ct_opens
        result.PI_span = pi_closes - pi_opens
        result.CtCDPI_mask |= bitutil.Advance(cd_closes | ct_closes | pi_closes) - all_opens

        # A comment's closing "--" must be followed by ">", and the opening
        # "<!-" must be followed by a second "-".
        result.error = ct_closes & ~lex.RAngle | ct_opens & ~lex.Hyphen
        # If any of the Comment, CDATA or PI markups are unterminated, it is an error.
        result.error |= result.CtCDPI_mask & ~EOF_mask
        return result
201
def demo_CtCDPI(u8data):
        """Doctest driver for parse_CtCDPI.

        Prints u8data aligned with the span, mask and error streams.  The
        error stream is rendered one position wider than the input.
        """
        lgth = len(u8data)
        (bit, EOF_mask) = bitutil.transpose_streams(u8data)
        (u8, control, lex) = byteclass.classify_bytes(bit)
        lex = add_multiliterals(lex)
        markup = parse_CtCDPI(lex, EOF_mask)
        rows = [('input data', u8data)]
        # Each of these labels is also the attribute name on the callouts.
        for label in ('CD_span', 'Ct_span', 'PI_span', 'CtCDPI_mask'):
                rows.append((label, bitutil.bitstream2string(getattr(markup, label), lgth)))
        rows.append(('error', bitutil.bitstream2string(markup.error, lgth + 1)))
        bitutil.print_aligned_u8_byte_streams(rows)
214
215
class ref_callouts:
        """Bitstreams called out by parse_refs.

        Every field defaults to 0, the empty bitstream.
        """
        # Extents of general, decimal character and hex character references.
        GenRefs = DecRefs = HexRefs = 0
        # Positions marked for deletion, and error positions.
        delmask = error = 0
222
def parse_refs(lex, CtCDPI_mask):
        """Parse and call out all general and character references.

        References whose second character lies inside CtCDPI_mask are
        ignored.  Returns a ref_callouts object; delmask marks all of a
        reference but the closing semicolon for deletion.

        >>> demo_refs(" &gt;  &#13;  &#x0a;  ")
        input data       :  &gt;  &#13;  &#x0a; 
        entity refs      : __11__________________
        decimal char refs: _________11___________
        hex char refs    : _________________11___
        ref delmask      : _111___1111___11111___
        errors           : _______________________

        Empty numeric references are reported as errors.
        >>> demo_refs(" &#;       &#x; ")
        input data       :  &#;       &#x;
        entity refs      : ________________
        decimal char refs: ________________
        hex char refs    : ________________
        ref delmask      : _11________111__
        errors           : ___1__________1__

        Improperly terminated or unterminated references (lacking ";") are also errors.
        >>> demo_refs("  &gt:  &#456a;  &#xab:  &unterminated")
        input data       :   &gt:  &#456a;  &#xab:  &unterminated
        entity refs      : ___111____________________111111111111
        decimal char refs: __________111_________________________
        hex char refs    : ____________________11________________
        ref delmask      : __1111__11111____11111___1111111111111
        errors           : ______1______1________1_______________1
        """
        result = ref_callouts()

        # Second character of each reference, outside comment/CDATA/PI markup.
        second = lex.RefStart_scope & ~CtCDPI_mask
        # "&#" starts a numeric reference; anything else a general one.
        num_second = second & lex.Hash
        gen_first = second & ~lex.Hash
        # After "&#", an "x" selects hex; otherwise the digits start here.
        num_third = bitutil.Advance(num_second)
        hex_x = num_third & lex.x
        dec_first = num_third & ~lex.x
        hex_first = bitutil.Advance(hex_x)

        # Scan through the body of each reference.
        gen_end = bitutil.ScanThru(gen_first, lex.NameScan)
        dec_end = bitutil.ScanThru(dec_first, lex.Digit)
        hex_end = bitutil.ScanThru(hex_first, lex.Hex)
        all_ends = gen_end | dec_end | hex_end

        # Error checks.
        # At least one digit required for DecRef, one hex digit for HexRef.
        problems = dec_first & ~lex.Digit
        problems |= hex_first & ~lex.Hex
        # Semicolon terminator required (also covers unterminated at EOF).
        problems |= all_ends & ~lex.Semicolon

        result.GenRefs = gen_end - gen_first
        result.DecRefs = dec_end - dec_first
        result.HexRefs = hex_end - hex_first
        # Mark references for deletion, but leave the trailing semicolon as
        # the point for insertion of the "expansion" text (most often a
        # single character).
        # NOTE(review): lex.RefStart is not filtered by CtCDPI_mask, whereas
        # the end positions are derived from the filtered stream above.  With
        # a non-zero mask a "&" inside markup has no matching end in this
        # subtraction -- confirm against callers whether that is intended.
        result.delmask = all_ends - lex.RefStart
        result.error = problems
        return result
279
280def demo_refs(u8data):
281        lgth = len(u8data)
282        (bit, EOF_mask) = bitutil.transpose_streams(u8data)
283        (u8, control, lex) = byteclass.classify_bytes(bit)
284        callouts = parse_refs(lex, 0)
285        bitutil.print_aligned_u8_byte_streams([('input data', u8data), 
286                              ('entity refs', bitutil.bitstream2string(callouts.GenRefs, lgth)),
287                              ('decimal char refs', bitutil.bitstream2string(callouts.DecRefs, lgth)),
288                              ('hex char refs', bitutil.bitstream2string(callouts.HexRefs, lgth)),
289                              ('ref delmask', bitutil.bitstream2string(callouts.delmask, lgth)),
290                              ('errors', bitutil.bitstream2string(callouts.error, lgth+1))])
291
292
class tag_callouts:
        """Bitstreams called out by parse_tags.

        Every field defaults to 0, the empty bitstream.
        """
        ElemNames = 0     # extent of element names
        AttNames = 0      # extent of attribute names
        AttVals = 0       # extent of attribute values
        Tags = 0          # extent of start/empty tags
        # parse_tags assigns EmptyTagMarks and demo_tags reads it, so it is
        # declared here with a default like the other fields.
        EmptyTagMarks = 0
        # Kept for backward compatibility; parse_tags does not assign it.
        EmptyTagEnds = 0
        EndTags = 0       # extent of end tags
        error = 0         # error positions

        # POTENTIAL ADDITIONAL FIELDS
        # StartTagEnds = 0
        # EmptyTagEnds = 0
        # EndTagEnds = 0
306
def parse_tags(lex, CtCDPI_mask, EOF_mask):
        """Parse start, empty and end tags, calling out element names, attribute
        names and values, empty tag positions, and tag extents.

        Tags whose second character lies inside CtCDPI_mask are ignored.
        Returns a tag_callouts object.  All tags are processed in parallel.

        >>> demo_tags("<root><t1>text</t1><t2 a1='foo' a2 = 'fie'>more</t2><tag3 att3='b'/></root>")
        input data      : <root><t1>text</t1><t2 a1='foo' a2 = 'fie'>more</t2><tag3 att3='b'/></root>
        element names   : _1111__11___________11_______________________________1111__________________
        attribute names : _______________________11_______11________________________1111_____________
        attribute values: __________________________11111______11111_____________________111_________
        empty tag marks : ___________________________________________________________________1_______
        end tags        : _______________111______________________________111__________________11111_
        start/empty tags: _1111__11___________1111111111111111111111___________11111111111111________
        errors          : ____________________________________________________________________________

        Attributes can use double quotes.

        >>> demo_tags('<dquote_atts a1="1234" attribute2="4321"/>')
        input data      : <dquote_atts a1="1234" attribute2="4321"/>
        element names   : _11111111111______________________________
        attribute names : _____________11________1111111111_________
        attribute values: ________________111111____________111111__
        empty tag marks : _________________________________________1
        end tags        : __________________________________________
        start/empty tags: _1111111111111111111111111111111111111111_
        errors          : ___________________________________________

        Syntax errors of various types are identified with the error stream.

        1. Element name missing errors.

        >>> demo_tags("< noname='flawed'/> ")
        input data      : < noname='flawed'/>
        element names   : ____________________
        attribute names : __111111____________
        attribute values: _________11111111___
        empty tag marks : __________________1_
        end tags        : ____________________
        start/empty tags: _11111111111111111__
        errors          : _1___________________

        2. Missing attribute names.

        >>> demo_tags("<noatt ='flawed'/>  <one_att a1='good' = 'bad'> oops </one_att>")
        input data      : <noatt ='flawed'/>  <one_att a1='good' = 'bad'> oops </one_att>
        element names   : _11111_______________1111111___________________________________
        attribute names : _____________________________11________________________________
        attribute values: ________11111111________________111111___11111_________________
        empty tag marks : _________________1_____________________________________________
        end tags        : ______________________________________________________11111111_
        start/empty tags: _1111111111111111____1111111111111111111111111_________________
        errors          : _______1_______________________________1________________________

        3. Missing or incorrect = sign.

        >>> demo_tags('<errata plusforeq+"5678" noequals"90" />')
        input data      : <errata plusforeq+"5678" noequals"90" />
        element names   : _111111_________________________________
        attribute names : ________111111111________11111111_______
        attribute values: __________________111111__________111111
        empty tag marks : ________________________________________
        end tags        : ________________________________________
        start/empty tags: _111111111111111111111111111111111111111
        errors          : _________________1_______________11______

        4.  Missing whitespace

        >>> demo_tags("<jammed att='value'att2='v2' />")
        input data      : <jammed att='value'att2='v2' />
        element names   : _111111________________________
        attribute names : ________111________1111________
        attribute values: ____________1111111_____1111___
        empty tag marks : ______________________________1
        end tags        : _______________________________
        start/empty tags: _11111111111111111111111111111_
        errors          : ___________________1____________

        5.  Extra whitespace in an empty tag.

        >>> demo_tags("<extrawhite / >")
        input data      : <extrawhite / >
        element names   : _1111111111____
        attribute names : _______________
        attribute values: _______________
        empty tag marks : _____________1_
        end tags        : _______________
        start/empty tags: _111111111111__
        errors          : _____________1__

        6.  Unterminated or incorrectly terminated attribute values

        >>> demo_tags("<badattvalues a='blud<   b='455>   ")
        input data      : <badattvalues a='blud<   b='455>   
        element names   : _111111111111______________________
        attribute names : ______________1__________1_________
        attribute values: ________________111111_____11111111
        empty tag marks : ___________________________________
        end tags        : ___________________________________
        start/empty tags: _111111111111111111111_111111111111
        errors          : _____________________11____________1

        7.  Unterminated tags

        >>> demo_tags("<unterminated a='245'  ")
        input data      : <unterminated a='245' 
        element names   : _111111111111__________
        attribute names : ______________1________
        attribute values: ________________11111__
        empty tag marks : _______________________
        end tags        : _______________________
        start/empty tags: _1111111111111111111111
        errors          : _______________________1

        """
        result = tag_callouts()

        # Positions a scan through a quoted value may pass: anything but the
        # closing quote or a "<", limited to the input.
        dquoted_body = ~(lex.DQuote | lex.LAngle) & EOF_mask
        squoted_body = ~(lex.SQuote | lex.LAngle) & EOF_mask
        # Characters that end an attribute list.
        attlist_closers = lex.Slash | lex.RAngle

        # Start the parallel parsing by inspecting the character
        # after the opening "<" of a tag.
        after_langle = lex.LAngle_scope & ~CtCDPI_mask
        elem_name_first = after_langle & ~lex.Slash
        end_tag_slash = after_langle & lex.Slash

        # --- Start tags and empty element tags ---

        # Advance all cursors by scanning through the tag name.
        elem_name_after = bitutil.ScanThru(elem_name_first, lex.NameScan)
        # A cursor that did not move means the name is empty: an error.
        errors = elem_name_first & elem_name_after
        result.ElemNames = elem_name_after - elem_name_first

        # Accumulators for attribute name and value positions.
        att_name_firsts = att_name_afters = 0
        eq_positions = 0
        att_val_firsts = att_val_lasts = att_val_afters = 0

        # After the element name, there may or may not be an attlist.
        past_ws = bitutil.ScanThru(elem_name_after, lex.WS)
        attlist_end = past_ws & attlist_closers
        pending = past_ws & ~attlist_closers
        # At least one WS character is required between ElemNames and AttNames.
        errors |= elem_name_after & pending

        # Each pass handles the next attribute of every start tag that still
        # has one.  Because all start tags are processed in parallel, the
        # number of iterations is the maximum number of attributes found in
        # any one start tag, plus one.
        while pending:
                att_name_firsts |= pending
                att_name_after = bitutil.ScanThru(pending, lex.NameScan)
                att_name_afters |= att_name_after
                # Scan through WS to the expected '=' delimiter.
                eq_here = bitutil.ScanThru(att_name_after, lex.WS)
                eq_positions |= eq_here
                # Step over the '=' and any WS to the opening quote.
                att_val_first = bitutil.ScanThru(bitutil.Advance(eq_here), lex.WS)
                att_val_firsts |= att_val_first
                dquote_open = att_val_first & lex.DQuote
                squote_open = att_val_first & lex.SQuote
                dquote_close = bitutil.ScanThru(bitutil.Advance(dquote_open), dquoted_body)
                squote_close = bitutil.ScanThru(bitutil.Advance(squote_open), squoted_body)
                att_val_last = dquote_close | squote_close
                att_val_lasts |= att_val_last
                att_val_after = bitutil.Advance(att_val_last)
                att_val_afters |= att_val_after
                past_ws = bitutil.ScanThru(att_val_after, lex.WS)
                attlist_end |= past_ws & attlist_closers
                pending = past_ws & ~attlist_closers

        # No more attribute values to process once pending is 0.

        result.AttNames = att_name_afters - att_name_firsts
        result.AttVals = att_val_afters - att_val_firsts
        start_tag_ends = attlist_end & lex.RAngle
        # Mark any "/" characters found as the ends of empty element tags.
        result.EmptyTagMarks = bitutil.Advance(attlist_end & lex.Slash)
        result.Tags = (start_tag_ends | result.EmptyTagMarks) - elem_name_first

        # Check for errors.
        any_quote = lex.DQuote | lex.SQuote
        errors |= att_val_afters & att_name_firsts       # No intervening WS.
        errors |= att_name_firsts & att_name_afters      # Null AttName
        errors |= eq_positions & ~lex.Equals             # = not found where expected.
        errors |= att_val_firsts & ~any_quote            # Value not opened by a quote.
        errors |= att_val_lasts & ~any_quote             # Value not closed by a quote.
        errors |= result.EmptyTagMarks & ~lex.RAngle     # "/" not followed by ">".

        # --- End tags ---
        end_name_first = bitutil.Advance(end_tag_slash)
        end_name_after = bitutil.ScanThru(end_name_first, lex.NameScan)
        end_tag_ends = bitutil.ScanThru(end_name_after, lex.WS)
        errors |= end_tag_ends & ~lex.RAngle
        result.EndTags = end_tag_ends - end_tag_slash
        result.error = errors

        # POTENTIAL ADDITIONAL FIELDS
        # result.StartTagEnds = start_tag_ends
        # result.EmptyTagEnds = bitutil.Advance(result.EmptyTagMarks)
        # result.EndTagEnds = end_tag_ends

        return result
512
def demo_tags(u8data):
        """Doctest driver for parse_tags.

        Runs comment/CDATA/PI parsing first so that its mask can be passed to
        parse_tags, then prints u8data aligned with the tag callout streams.
        The error stream is rendered one position wider than the input.
        """
        lgth = len(u8data)
        (bit, EOF_mask) = bitutil.transpose_streams(u8data)
        (u8, control, lex) = byteclass.classify_bytes(bit)
        lex = add_multiliterals(lex)
        markup1 = parse_CtCDPI(lex, EOF_mask)
        callouts = parse_tags(lex, markup1.CtCDPI_mask, EOF_mask)
        # (printed label, callout attribute) for the streams shown at input width.
        labelled = (('element names', 'ElemNames'),
                    ('attribute names', 'AttNames'),
                    ('attribute values', 'AttVals'),
                    ('empty tag marks', 'EmptyTagMarks'),
                    ('end tags', 'EndTags'),
                    ('start/empty tags', 'Tags'))
        rows = [('input data', u8data)]
        for (label, field) in labelled:
                rows.append((label, bitutil.bitstream2string(getattr(callouts, field), lgth)))
        rows.append(('errors', bitutil.bitstream2string(callouts.error, lgth + 1)))
        bitutil.print_aligned_u8_byte_streams(rows)
528
529
530
def validate_no_CD_end(lex, markup1, tags):
        """Return the positions of "]]>" that occur in text, outside of markup.

        An occurrence of lex.CD_end is illegal unless it is covered by
        markup1.CtCDPI_mask or by tags.Tags.

        >>> demo_validate_no_CD_end(' <!-- OK: ]]>  --> <![CDATA OK  ]]>  ]]> <tag att=" ]]> "/> ]]>  <?php ]]> ?> ')
        input data :  <!-- OK: ]]>  --> <![CDATA OK  ]]>  ]]> <tag att=" ]]> "/> ]]>  <?php ]]> ?>
        CtCDPI_mask: __1111111111111111__111111111111111_______________________________11111111111_
        tags       : __________________________________________1111111111111111____________________
        illegal ]]>: _______________________________________1______________________1_______________
        """
        # Drop occurrences inside comments/CDATA/PIs, then those inside tags.
        return lex.CD_end & ~markup1.CtCDPI_mask & ~tags.Tags
542
def demo_validate_no_CD_end(u8data):
        """Doctest driver for validate_no_CD_end.

        Prints u8data aligned with the CtCDPI mask, the tag extents and the
        stream of illegal "]]>" occurrences.
        """
        lgth = len(u8data)
        (bit, EOF_mask) = bitutil.transpose_streams(u8data)
        (u8, control, lex) = byteclass.classify_bytes(bit)
        lex = add_multiliterals(lex)
        markup1 = parse_CtCDPI(lex, EOF_mask)
        tags = parse_tags(lex, markup1.CtCDPI_mask, EOF_mask)
        illegal = validate_no_CD_end(lex, markup1, tags)
        streams = (('CtCDPI_mask', markup1.CtCDPI_mask),
                   ('tags', tags.Tags),
                   ('illegal ]]>', illegal))
        rows = [('input data', u8data)]
        for (label, stream) in streams:
                rows.append((label, bitutil.bitstream2string(stream, lgth)))
        bitutil.print_aligned_u8_byte_streams(rows)
555
556
557
558def main(u8data):
559        # Transpose to parallel bit streams and prepare an EOF mask.
560        (bit, EOF_mask) = bitutil.transpose_streams(u8data)
561
562        # Classify bytes for UTF-8 processing, whitespace and control
563        # processing and XML lexical analysis.
[448]564        (u8, control, lex) = byteclass.classify_bytes(bit)
565        #(u8, control, lex) = byteclass.classify_bytes_with_shift1opt(bit)
[409]566
567        # Validate UTF-8 multibyte sequences and determine the UTF-8 scope streams.
568        u8 = u8u16.validate_utf8(u8)
569
570        # Rule out the illegal characters for XML.
571        xmlchar_error = validate_xmlchar(u8, control, lex, EOF_mask)
572
573        # Find and normalize bare CR or CRLF combinations.
574        (control, bit) = normalize_line_breaks(control, bit)
575
576        # Compute XML multiliterals such as <?, </, --, ]]>.
577        lex = add_multiliterals(lex)
578       
579       
580        # THE FOLLOWING FUNCTIONAL CALL IS MANUALLY INLINED
581        # Parse all comments, CDATA sections and processing instructions.
582        #markup1 = parse_CtCDPI(lex, EOF_mask)
583        #CT_callouts = CtCDPI_callouts()
584        PI_starts = 0
585        PI_ends = 0
586        Ct_starts = 0
587        Ct_ends = 0
588        CD_starts = 0
589        CD_ends = 0
590        CtCDPI_starts = 0
591        # Scanning streams
592        CtCDPI_scan = ~(lex.CtCD_start | lex.PI_start) & EOF_mask
593        Ct_end_scan = ~lex.DoubleHyphen & EOF_mask
594        CD_end_scan = ~lex.CD_end & EOF_mask
595        PI_end_scan = ~lex.PI_end & EOF_mask
596        #
597        # Initiate the scan
598        CtCDPI_Cursor = 1
599        CtCDPI_Cursor = bitutil.ScanThru(CtCDPI_Cursor, CtCDPI_scan)
600        CtCDPI_Cursor &= EOF_mask
601        while CtCDPI_Cursor:
602                CtCDPI_starts |= CtCDPI_Cursor
603                PI_Cursor = CtCDPI_Cursor & lex.PI_start
604                CD_Ct_Cursor = bitutil.Advance(CtCDPI_Cursor & ~PI_Cursor)
605                CD_Cursor = CD_Ct_Cursor & lex.LBracket
606                Ct_Cursor = bitutil.Advance(CD_Ct_Cursor & lex.Hyphen) 
607                PI_starts |= PI_Cursor
608                CD_starts |= CD_Cursor
609                Ct_starts |= Ct_Cursor
610                Ct_Cursor = bitutil.Advance(Ct_Cursor)
611                Ct_end_scan |= Ct_Cursor
612                PI_Cursor = bitutil.ScanThru(PI_Cursor, PI_end_scan)
613                CD_Cursor = bitutil.ScanThru(CD_Cursor, CD_end_scan)
614                Ct_Cursor = bitutil.Advance(bitutil.ScanThru(Ct_Cursor, Ct_end_scan))
615                PI_ends |= PI_Cursor
616                CD_ends |= CD_Cursor
617                Ct_ends |= Ct_Cursor
618                CtCDPI_Cursor = PI_Cursor | CD_Cursor | Ct_Cursor
619                CtCDPI_Cursor = bitutil.ScanThru(CtCDPI_Cursor, CtCDPI_scan)
620                CtCDPI_Cursor &= EOF_mask
621        # End of loop: no remaining CtCDPI_Cursor
622        CT_callouts.CD_span = CD_ends - CD_starts
623        CT_callouts.Ct_span = Ct_ends - Ct_starts
624        CT_callouts.PI_span = PI_ends - PI_starts
625       
[412]626        CT_callouts.CtCDPI_mask = bitutil.Advance(CD_ends | Ct_ends | PI_ends) - CtCDPI_starts
[409]627        CT_callouts.error = Ct_ends & ~lex.RAngle | Ct_starts & ~ lex.Hyphen
628        # If any of the Comment, CDATA or PI markups are unterminated, it is an error.
629        CT_callouts.error |= CT_callouts.CtCDPI_mask &~ EOF_mask
630        ########## END OF MANUAL INLINING
631       
632        # THE FOLLOWING FUNCTIONAL CALL IS MANUALLY INLINED
633        # All remaining "<" must be tag start characters; parse tags.
634        #tags = parse_tags(lex, CT_callouts.CtCDPI_mask, EOF_mask)
635
636        #callouts = tag_callouts()
637       
638        # Delimiters for scans.
639        DQuoteScan = ~(lex.DQuote | lex.LAngle) & EOF_mask
640        SQuoteScan = ~(lex.SQuote | lex.LAngle) & EOF_mask
641        AttListDelim = lex.Slash | lex.RAngle
642       
643        # Start the parallel parsing by inspecting the character
644        # after the opening "<" of a tag.
645        LAngleFollow = bitutil.Advance(lex.LAngle) &~ CT_callouts.CtCDPI_mask
646        ElemNamePositions = LAngleFollow & ~lex.Slash
647        EndTagSeconds = LAngleFollow & lex.Slash
648       
649        # Start Tag/Empty Element Tag Parsing
650
651        # Advance all cursors by scanning through the tag name.
652        ElemNameFollows = bitutil.ScanThru(ElemNamePositions, lex.NameScan)
653        # Must have at least one name character for a legal start tag.
654        # Mark any occurrences of null names as errors.
655        ParseError = ElemNamePositions & ElemNameFollows
656        callouts.ElemNames = ElemNameFollows - ElemNamePositions
657       
658        # Initialize the accumulators for attribute name and value positions.
659        AttNameStarts = 0 
660        AttNameFollows = 0
661        EqToCheck = 0
662        AttValStarts = 0
663        AttValEnds = 0
664        AttValFollows = 0
665
666        # After the element name, there may or may not be an attlist.
667        AfterWS = bitutil.ScanThru(ElemNameFollows, lex.WS)
668        AttListEnd = AfterWS & AttListDelim
669        AttNameStart = AfterWS & ~AttListDelim
670        # At least one WS character is required between ElemNames and AttNames.
671        ParseError |= ElemNameFollows & AttNameStart
672
673        #
674        # The following loop iterates through attributes within a start tag.
675        # Because all start tags are processed in parallel, the number of
676        # iterations is the maximum number of attributes found in any one
677        # start tag, plus one.
678        while AttNameStart:
679                AttNameStarts |= AttNameStart
680                AttNameFollow = bitutil.ScanThru(AttNameStart, lex.NameScan)
681                AttNameFollows |= AttNameFollow
682                # Scan through WS to the expected '=' delimiter.
683                EqExpected = bitutil.ScanThru(AttNameFollow, lex.WS)
684                EqToCheck |= EqExpected
685                AttValPos = bitutil.ScanThru(bitutil.Advance(EqExpected), lex.WS)
686                AttValStarts |= AttValPos
687                DQuoteAttVal = AttValPos & lex.DQuote
688                SQuoteAttVal = AttValPos & lex.SQuote
689                DQuoteAttEnd = bitutil.ScanThru(bitutil.Advance(DQuoteAttVal), DQuoteScan)
690                SQuoteAttEnd = bitutil.ScanThru(bitutil.Advance(SQuoteAttVal), SQuoteScan)
691                AttValEnd = DQuoteAttEnd | SQuoteAttEnd
692                AttValEnds |= AttValEnd
693                AttValFollow = bitutil.Advance(AttValEnd)
694                AttValFollows |= AttValFollow
695                AfterWS = bitutil.ScanThru(AttValFollow, lex.WS)
696                AttListEnd |= AfterWS & AttListDelim
697                AttNameStart = AfterWS & ~AttListDelim
698
699        # No more attribute values to process when AttNameStart == 0.
700
701        callouts.AttNames = AttNameFollows - AttNameStarts
702        callouts.AttVals = AttValFollows - AttValStarts
703        STagEnds = AttListEnd & lex.RAngle
704        # Mark any "/" characters found as the ends of empty element tags.
705        callouts.EmptyTagMarks = bitutil.Advance(AttListEnd & lex.Slash)
706        callouts.Tags = (STagEnds | callouts.EmptyTagMarks) - ElemNamePositions
[423]707       
708       
709        name_stream = callouts.ElemNames | callouts.AttNames
710        name_start = name_stream &~ bitutil.Advance(name_stream)
711        name_start_check = name_start & ~lex.ASCII_name_start
712        name_check = (name_stream &~ name_start | nmtoken_stream) & ~lex.ASCII_name_char & ~u8.suffix
[409]713
714        # Check for errors.
715        ParseError |= AttValFollows & AttNameStarts # No intervening WS.
716        ParseError |= AttNameStarts & AttNameFollows # Null AttName
717        ParseError |= EqToCheck & ~lex.Equals # = not found where expected.
718        ParseError |= AttValStarts & ~ (lex.DQuote | lex.SQuote)
719        ParseError |= AttValEnds & ~ (lex.DQuote | lex.SQuote)
720        ParseError |= callouts.EmptyTagMarks & ~lex.RAngle
721
722        # End Tag Parsing
723        EndTagEnds = bitutil.ScanThru(bitutil.ScanThru(bitutil.Advance(EndTagSeconds), lex.NameScan), lex.WS)
724        ParseError |= EndTagEnds & ~lex.RAngle
725        callouts.EndTags = EndTagEnds - EndTagSeconds
726        callouts.error = ParseError
727        ########## END OF MANUAL INLINING
728
729
730
731
732        # All remaining "&" must be reference start characters; parse them.
733        refs = parse_refs(lex, CT_callouts.CtCDPI_mask)
734
735        # Ensure that no occurrence of ]]> occurs outside of markup.
736        CD_end_error = validate_no_CD_end(lex, CT_callouts, callouts)
737
738        # Convert to UTF-16 bit streams.
[423]739        #(u16hi, u16lo, u16delmask) = u8u16.u8u16(u8, bit)
[409]740
741        # Consolidate and check for errors
[412]742        error_mask = u8.error | xmlchar_error | CT_callouts.error | callouts.error | CD_end_error | refs.error
[409]743
744        # Consolidate the deletion_masks
[423]745        #delmask = control.CRLF | refs.delmask | u16delmask # | CT_callouts.CDATA_delimiters
746        delmask = control.CRLF | refs.delmask  # | CT_callouts.CDATA_delimiters
[409]747
[423]748        #return (CT_callouts, callouts, refs, u16hi, u16lo, delmask, error, lex, u16delmask, EOF_mask)
749        return (CT_callouts, callouts, refs, delmask, error, lex, EOF_mask)
[409]750
def demo_parabix(u8data):
        """Parse u8data with parabix_parse and print the resulting streams.

        Each computed bit stream is rendered with bitutil.bitstream2string
        and printed, aligned under the input data and its nybble streams,
        by bitutil.print_aligned_u8_byte_streams.

        parabix_parse currently returns a 7-tuple
        (CT_callouts, callouts, refs, delmask, error, lex, EOF_mask);
        the UTF-16 conversion inside it is commented out, so there are no
        u16hi/u16lo/u16delmask results to unpack or display here.
        """
        lgth = len(u8data)

        # lex and EOF_mask are part of the returned tuple but are not displayed.
        (markup1, tags, refs, delmask, error, _lex, _EOF_mask) = parabix_parse(u8data)

        bitutil.print_aligned_u8_byte_streams([
                ('input data', u8data),
                ('input high nybbles', bitutil.high_nybble_stream(u8data)),
                ('input low nybbles', bitutil.low_nybble_stream(u8data)),
                ('CD_span', bitutil.bitstream2string(markup1.CD_span, lgth)),
                ('Ct_span', bitutil.bitstream2string(markup1.Ct_span, lgth)),
                ('PI_span', bitutil.bitstream2string(markup1.PI_span, lgth)),
                ('CtCDPI_mask', bitutil.bitstream2string(markup1.CtCDPI_mask, lgth)),
                ('entity refs', bitutil.bitstream2string(refs.GenRefs, lgth)),
                ('decimal char refs', bitutil.bitstream2string(refs.DecRefs, lgth)),
                ('hex char refs', bitutil.bitstream2string(refs.HexRefs, lgth)),
                ('element names', bitutil.bitstream2string(tags.ElemNames, lgth)),
                ('attribute names', bitutil.bitstream2string(tags.AttNames, lgth)),
                ('attribute values', bitutil.bitstream2string(tags.AttVals, lgth)),
                ('empty tag marks', bitutil.bitstream2string(tags.EmptyTagMarks, lgth)),
                ('end tags', bitutil.bitstream2string(tags.EndTags, lgth)),
                ('start/empty tags', bitutil.bitstream2string(tags.Tags, lgth)),
                ('delmask', bitutil.bitstream2string(delmask, lgth)),
                # One extra position is rendered for the error stream;
                # presumably so an error flagged just past the last input
                # byte stays visible -- TODO confirm.
                ('errors', bitutil.bitstream2string(error, lgth+1)),
        ])
775
def demo_u16delmask(u8data):
        """Run the UTF-8 to UTF-16 pipeline on u8data and print its deletion mask.

        The input is transposed to parallel bit streams, classified,
        validated as UTF-8 and converted to UTF-16 bit streams.  The UTF-16
        byte buffer is rebuilt and decoded, and then the input data, the
        deletion mask and the UTF-8 error stream are printed in aligned form.
        """
        n_bytes = len(u8data)

        # Parallel bit streams for the input, plus the EOF mask (unused here).
        bit, _EOF_mask = bitutil.transpose_streams(u8data)

        # Byte classification; only the UTF-8 classes are needed below.
        u8, _control, _lex = byteclass.classify_bytes(bit)

        # Validate UTF-8 multibyte sequences and determine the scope streams.
        u8 = u8u16.validate_utf8(u8)

        # UTF-16 high/low bit streams and the mask of positions to delete.
        u16hi, u16lo, delmask = u8u16.u8u16(u8, bit)

        # Back to byte form, filtering each half with the deletion mask.
        high_half = bitutil.filter_bytes(bitutil.inverse_transpose(u16hi, n_bytes), delmask)
        low_half = bitutil.filter_bytes(bitutil.inverse_transpose(u16lo, n_bytes), delmask)

        # Merge the two halves into one UTF-16 buffer and decode it.  The
        # decoded text is not used, but the decode is still performed.
        u16_buffer = bitutil.merge_bytes(low_half, high_half)
        u16_buffer.decode('utf16')

        rows = [
                ('input data', u8data),
                ('u16delmask', bitutil.bitstream2string(delmask, n_bytes)),
                ('errors', bitutil.bitstream2string(u8.error, n_bytes + 1)),
        ]
        bitutil.print_aligned_u8_byte_streams(rows)
806
if __name__ == "__main__":
        # Script entry point: run the module doctests, then, if a file name
        # was given on the command line, read it and run the parabix demo.
        import doctest
        # sys.argv is used below; import sys here so that script mode does
        # not depend on a file-level import being present.
        import sys

        doctest.testmod()

        if len(sys.argv) > 1:
                u8data = bitutil.readfile(sys.argv[1])
                # Alternative demos; enable one in place of demo_parabix:
                # demo_validate_xmlchar(u8data)
                # demo_line_breaks(u8data)
                # demo_multiliterals(u8data)
                # demo_CtCDPI(u8data)
                # demo_refs(u8data)
                # demo_tags(u8data)
                # demo_validate_no_CD_end(u8data)
                # demo_u16delmask(u8data)
                demo_parabix(u8data)
        else:
                print("Usage: python parabix2.py <file>")
825               
826 
827       
828       
Note: See TracBrowser for help on using the repository browser.