source: proto/parabix2/src/carryq/parabix2_compilable_cq.py @ 1496

Last change on this file since 1496 was 535, checked in by ksherdy, 9 years ago

Add carry queue hand optimized implementation.

File size: 32.9 KB
Line 
1# -*- coding: utf-8 -*-
2#
3# parabix2.py
4#
5# Parallel XML Parsing with Bitstream Addition
6# - Complete prototype for all bitstream computations in Parabix2
7#
8# Robert D. Cameron
9# August 20, 2009
10#
11#----------------------------------------------------------------------------
12#
13# We use python's unlimited precision integers for unbounded bit streams.
14# This permits simple logical operations on the entire stream.
15# Assumption: bitstreams are little-endian (e.g., as on x86).
16#
17#----------------------------------------------------------------------------
18#
19
20
21#import bitutil
22
23import byteclass
24
25import u8u16
26
27#import sys
28
def validate_xmlchar(u8, control, lex, EOF_mask):
    r"""Compute an error stream marking characters illegal in XML:
    (1) Control characters in the range 0x00-0x1F except HT, LF, CR
    (2) 0xFFFF and 0xFFFE, having UTF-8 encodings 0xEF 0xBF 0xBF and 0xEF 0xBF 0xBE.

    Parameters:
        u8       -- object supplying the xEF_scope, xBF and xBE bitstreams
        control  -- object supplying the x00_x1F bitstream
        lex      -- object supplying the WS (whitespace) bitstream
        EOF_mask -- bitstream limiting the control-character check to the input

    Returns the error bitstream (a Python integer).

    >>> demo_validate_xmlchar('plaintext (good: \x09) (bad: \x03) (bad \xEF\xBF\xBF) (good \xEF\xBF\xBC)')
    input high nybbles: 7666676772266663202226663202226662ebb22266662ebb2
    input low nybbles : 0c19e4584087ff4a09908214a039082140fff9087ff40ffc9
    illegal XML chars : __________________________1_________1_____________
    """
    # Cursor derived from every 0xBF that lies in the scope of an 0xEF byte.
    EF_BF_pending = bitutil.Advance(u8.xEF_scope & u8.xBF)
    # A pending EF BF followed by BE or BF encodes U+FFFE / U+FFFF.
    noncharacter = EF_BF_pending & (u8.xBE | u8.xBF)
    # Control characters that are not whitespace, restricted to the input.
    illegal_control = control.x00_x1F & ~lex.WS & EOF_mask
    return noncharacter | illegal_control
42
43
def demo_validate_xmlchar(u8data):
    """Print the nybbles of u8data aligned with its illegal-XML-character stream.

    Doctest driver for validate_xmlchar: transposes and classifies u8data,
    then prints the result via bitutil.print_aligned_streams.
    """
    lgth = len(u8data)
    (bit, EOF_mask) = bitutil.transpose_streams(u8data)
    (u8, control, lex) = byteclass.classify_bytes(bit)
    # The error stream is rendered with one extra position (lgth+1).
    bitutil.print_aligned_streams([
        ('input high nybbles', bitutil.high_nybble_stream(u8data)),
        ('input low nybbles', bitutil.low_nybble_stream(u8data)),
        ('illegal XML chars',
         bitutil.bitstream2string(validate_xmlchar(u8, control, lex, EOF_mask), lgth+1))])
51
def normalize_line_breaks(control, bit):
    r"""Convert CRs to LFs and mark CRLF occurrences for deletion.

    Both arguments are modified in place: control gains a CRLF attribute and
    entries 5, 6 and 7 of bit are rewritten. The same two objects are
    returned as the tuple (control, bit).

    Parameters:
        control -- object supplying the CR, LF and CR_scope bitstreams
        bit     -- mutable sequence of basis bitstreams, indexed 0..7

    >>> demo_line_breaks('ab \r\n  cd \r  ef \r ')
    input high nybbles: 662002266202266202
    input low nybbles : 120da00340d00560d0
    CR                : ___1______1_____1_
    LF                : ____1_____________
    CRLF              : ____1_____________
    """
    # An LF that lies in the scope of a CR is the second half of a CRLF pair.
    control.CRLF = control.CR_scope & control.LF
    # Convert CRs to LFs (flip bits 5, 6 and 7 with xor).
    for index in (5, 6, 7):
        bit[index] ^= control.CR
    return (control, bit)
68
def demo_line_breaks(u8data):
    """Print the nybbles of u8data aligned with its CR, LF and CRLF streams.

    Doctest driver for normalize_line_breaks.
    """
    lgth = len(u8data)
    (bit, EOF_mask) = bitutil.transpose_streams(u8data)
    (u8, control, lex) = byteclass.classify_bytes(bit)
    (control, bit) = normalize_line_breaks(control, bit)
    bitutil.print_aligned_u8_byte_streams([
        ('input high nybbles', bitutil.high_nybble_stream(u8data)),
        ('input low nybbles', bitutil.low_nybble_stream(u8data)),
        ('CR', bitutil.bitstream2string(control.CR, lgth)),
        ('LF', bitutil.bitstream2string(control.LF, lgth)),
        ('CRLF', bitutil.bitstream2string(control.CRLF, lgth))])
79
80
81
82
83
def add_multiliterals(lex):
    """Extend the byte-based lexical item streams for some important
    multibyte literals.

    Sets PI_start, CtCD_start, EndTag_start, CD_end, DoubleHyphen and PI_end
    on lex (in place) from its single-character and *_scope streams, and
    returns the same lex object.

    >>> demo_multiliterals("  <?php?>  <!--  -->  <![CDATA[  ]]> ")
    input data  :   <?php?>  <!--  -->  <![CDATA[  ]]>
    PI_start    : ___1_________________________________
    CtCD_start  : ____________1__________1_____________
    EndTag_start: _____________________________________
    CD_end      : ___________________________________1_
    DoubleHyphen: ______________1___1__________________
    PI_end      : ________1____________________________
    """
    # Two-character literals: the second character in the scope of the first.
    lex.PI_start = lex.LAngle_scope & lex.QMark
    lex.CtCD_start = lex.LAngle_scope & lex.Exclam
    lex.EndTag_start = lex.LAngle_scope & lex.Slash
    # "]]>" needs one more step: a "]]" pair, advanced, landing on ">".
    lex.CD_end = bitutil.Advance(lex.RBracket_scope & lex.RBracket) & lex.RAngle
    lex.DoubleHyphen = lex.Hyphen_scope & lex.Hyphen
    lex.PI_end = lex.QMark_scope & lex.RAngle
    return lex
105
def demo_multiliterals(u8data):
    """Print u8data aligned with the multiliteral streams of add_multiliterals.

    Doctest driver for add_multiliterals.
    """
    lgth = len(u8data)
    (bit, EOF_mask) = bitutil.transpose_streams(u8data)
    (u8, control, lex) = byteclass.classify_bytes(bit)
    lex = add_multiliterals(lex)
    bitutil.print_aligned_u8_byte_streams([
        ('input data', u8data),
        ('PI_start', bitutil.bitstream2string(lex.PI_start, lgth)),
        ('CtCD_start', bitutil.bitstream2string(lex.CtCD_start, lgth)),
        ('EndTag_start', bitutil.bitstream2string(lex.EndTag_start, lgth)),
        ('CD_end', bitutil.bitstream2string(lex.CD_end, lgth)),
        ('DoubleHyphen', bitutil.bitstream2string(lex.DoubleHyphen, lgth)),
        ('PI_end', bitutil.bitstream2string(lex.PI_end, lgth))])
118
class CtCDPI_callouts:
    """Result record for comment / CDATA / processing-instruction parsing.

    Every field is a bitstream held as a Python integer and defaults to 0.
    """
    CD_span = 0      # extent of CDATA sections
    Ct_span = 0      # extent of comments
    PI_span = 0      # extent of processing instructions (set by parse_CtCDPI)
    PI_mask = 0      # retained for backward compatibility; not written by parse_CtCDPI
    CtCDPI_mask = 0  # combined mask over all three kinds of markup
    error = 0        # error positions
125       
def parse_CtCDPI(lex, EOF_mask):
    """Parse all comments, CDATA sections and processing instructions.

    Return bitstreams marking the extent of these markup items,
    excluding initial and final bracketting.

    Parameters:
        lex      -- lexical item streams, already extended by add_multiliterals
                    (CtCD_start, PI_start, DoubleHyphen, CD_end, PI_end are read)
        EOF_mask -- bitstream used to confine scans and cursors to the input

    Returns a CtCDPI_callouts instance with CD_span, Ct_span, PI_span,
    CtCDPI_mask and error filled in.

    >>> demo_CtCDPI(' <?php?>  <!-- example -->  <![CDATA[  shift: a<<1 ]]> ')
    input data :  <?php?>  <!-- example -->  <![CDATA[  shift: a<<1 ]]>
    CD_span    : ______________________________11111111111111111111111__
    Ct_span    : _____________111111111111______________________________
    PI_span    : __11111________________________________________________
    CtCDPI_mask: __111111___111111111111111___1111111111111111111111111_
    error      : ________________________________________________________

    Comments are terminated by double-hyphen; immediately require closing ">".

    >>> demo_CtCDPI(' <!--  <?php?>  --   <!-- -->')
    input data :  <!--  <?php?>  --   <!-- -->
    CD_span    : _____________________________
    Ct_span    : ____11111111111111______1111_
    PI_span    : _____________________________
    CtCDPI_mask: __11111111111111111___1111111
    error      : __________________1___________
    """
    callouts = CtCDPI_callouts()
    # Accumulators for the start/end positions found on each iteration.
    PI_starts = 0
    PI_ends = 0
    Ct_starts = 0
    Ct_ends = 0
    CD_starts = 0
    CD_ends = 0
    CtCDPI_starts = 0
    # Scanning streams
    CtCDPI_scan = ~(lex.CtCD_start | lex.PI_start) & EOF_mask
    Ct_end_scan = ~lex.DoubleHyphen & EOF_mask
    CD_end_scan = ~lex.CD_end & EOF_mask
    PI_end_scan = ~lex.PI_end & EOF_mask
    #
    # Initiate the scan
    CtCDPI_Cursor = 1
    CtCDPI_Cursor = bitutil.ScanThru(CtCDPI_Cursor, CtCDPI_scan)
    CtCDPI_Cursor &= EOF_mask
    # Each iteration handles the next markup item; the loop ends when the
    # cursor is cleared by the EOF mask.
    while CtCDPI_Cursor:
        CtCDPI_starts |= CtCDPI_Cursor
        # Split the cursor by the kind of item it has reached.
        PI_Cursor = CtCDPI_Cursor & lex.PI_start
        CD_Ct_Cursor = bitutil.Advance(CtCDPI_Cursor & ~PI_Cursor)
        CD_Cursor = CD_Ct_Cursor & lex.LBracket
        Ct_Cursor = bitutil.Advance(CD_Ct_Cursor & lex.Hyphen)
        PI_starts |= PI_Cursor
        CD_starts |= CD_Cursor
        Ct_starts |= Ct_Cursor
        Ct_Cursor = bitutil.Advance(Ct_Cursor)
        # Adding the cursor position to the scan stream lets the scan below
        # pass through it even if a DoubleHyphen is marked there.
        Ct_end_scan |= Ct_Cursor
        # Scan each kind of item to its terminator.
        PI_Cursor = bitutil.ScanThru(PI_Cursor, PI_end_scan)
        CD_Cursor = bitutil.ScanThru(CD_Cursor, CD_end_scan)
        Ct_Cursor = bitutil.Advance(bitutil.ScanThru(Ct_Cursor, Ct_end_scan))
        PI_ends |= PI_Cursor
        CD_ends |= CD_Cursor
        Ct_ends |= Ct_Cursor
        # Resume the search for the next item from wherever this one ended.
        CtCDPI_Cursor = PI_Cursor | CD_Cursor | Ct_Cursor
        CtCDPI_Cursor = bitutil.ScanThru(CtCDPI_Cursor, CtCDPI_scan)
        CtCDPI_Cursor &= EOF_mask
    # End of loop: no remaining CtCDPI_Cursor
    # Subtracting start marks from end marks yields the runs between them.
    callouts.CD_span = CD_ends - CD_starts
    callouts.Ct_span = Ct_ends - Ct_starts
    callouts.PI_span = PI_ends - PI_starts

    callouts.CtCDPI_mask |= bitutil.Advance(CD_ends | Ct_ends | PI_ends) - CtCDPI_starts
    # A comment's "--" must be followed by ">" and its opener must be "<!--".
    callouts.error = (Ct_ends & ~lex.RAngle) | (Ct_starts & ~lex.Hyphen)
    # If any of the Comment, CDATA or PI markups are unterminated, it is an error.
    callouts.error |= callouts.CtCDPI_mask & ~EOF_mask
    return callouts
201
def demo_CtCDPI(u8data):
    """Print u8data aligned with the callout streams produced by parse_CtCDPI.

    Doctest driver for parse_CtCDPI.
    """
    lgth = len(u8data)
    (bit, EOF_mask) = bitutil.transpose_streams(u8data)
    (u8, control, lex) = byteclass.classify_bytes(bit)
    lex = add_multiliterals(lex)
    markup = parse_CtCDPI(lex, EOF_mask)
    # The error stream is rendered with one extra position (lgth+1).
    bitutil.print_aligned_u8_byte_streams([
        ('input data', u8data),
        ('CD_span', bitutil.bitstream2string(markup.CD_span, lgth)),
        ('Ct_span', bitutil.bitstream2string(markup.Ct_span, lgth)),
        ('PI_span', bitutil.bitstream2string(markup.PI_span, lgth)),
        ('CtCDPI_mask', bitutil.bitstream2string(markup.CtCDPI_mask, lgth)),
        ('error', bitutil.bitstream2string(markup.error, lgth+1))])
214
215
class ref_callouts:
    """Result record for reference parsing (see parse_refs).

    Every field is a bitstream held as a Python integer and defaults to 0.
    """
    GenRefs = 0  # general (entity) references
    DecRefs = 0  # decimal character references
    HexRefs = 0  # hexadecimal character references
    delmask = 0  # positions marked for deletion
    error = 0    # error positions
222
def parse_refs(lex, CtCDPI_mask):
    """Parse and call out all general and character references.
    Mark all but the closing semicolon for deletion.

    NOTE: in this version only GenRefs and error are assigned. The DecRefs,
    HexRefs and delmask assignments are commented out as not needed for
    xmlwf, so those fields keep their ref_callouts default of 0; the
    corresponding rows in the examples below presumably show the output
    with those assignments enabled.

    Parameters:
        lex         -- lexical item streams (RefStart_scope, Hash, x, NameScan,
                       Digit, Hex and Semicolon are read)
        CtCDPI_mask -- bitstream of positions inside comments, CDATA sections
                       and PIs; references starting there are ignored

    Returns a ref_callouts instance.

    >>> demo_refs(" &gt;  &#13;  &#x0a;  ")
    input data       :  &gt;  &#13;  &#x0a; 
    entity refs      : __11__________________
    decimal char refs: _________11___________
    hex char refs    : _________________11___
    ref delmask      : _111___1111___11111___
    errors           : _______________________

    Empty numeric references are reported as errors.
    >>> demo_refs(" &#;       &#x; ")
    input data       :  &#;       &#x;
    entity refs      : ________________
    decimal char refs: ________________
    hex char refs    : ________________
    ref delmask      : _11________111__
    errors           : ___1__________1__

    Improperly terminated or unterminated references (lacking ";") are also errors.
    >>> demo_refs("  &gt:  &#456a;  &#xab:  &unterminated")
    input data       :   &gt:  &#456a;  &#xab:  &unterminated
    entity refs      : ___111____________________111111111111
    decimal char refs: __________111_________________________
    hex char refs    : ____________________11________________
    ref delmask      : __1111__11111____11111___1111111111111
    errors           : ______1______1________1_______________1
    """
    CallOuts = ref_callouts()
    # The numeric suffix in each name is the character position within the
    # reference: "&" is 1, the character after it is 2, and so on.
    Ref2 = lex.RefStart_scope & ~CtCDPI_mask
    NumRef2 = Ref2 & lex.Hash
    GenRef2 = Ref2 & ~lex.Hash
    NumRef3 = bitutil.Advance(NumRef2)
    HexRef3 = NumRef3 & lex.x
    DecRef3 = NumRef3 & ~lex.x
    HexRef4 = bitutil.Advance(HexRef3)
    GenRefEnds = bitutil.ScanThru(GenRef2, lex.NameScan)
    DecRefEnds = bitutil.ScanThru(DecRef3, lex.Digit)
    HexRefEnds = bitutil.ScanThru(HexRef4, lex.Hex)
    # Error checks
    # At least one digit required for DecRef, one hex digit for HexRef.
    Error = DecRef3 & ~lex.Digit
    Error |= HexRef4 & ~lex.Hex
    # Semicolon terminator required (also covers unterminated at EOF).
    Error |= (GenRefEnds | DecRefEnds | HexRefEnds) & ~lex.Semicolon
    CallOuts.GenRefs = GenRefEnds - GenRef2
    # Not needed for xmlwf:
    #CallOuts.DecRefs = DecRefEnds - DecRef3
    #CallOuts.HexRefs = HexRefEnds - HexRef4
    # Mark references for deletion, but leave the trailing semicolon as
    # the point for insertion of the "expansion" text (most often a
    # single character).
    #CallOuts.delmask = (GenRefEnds | DecRefEnds | HexRefEnds) - lex.RefStart
    CallOuts.error = Error
    return CallOuts
280
def demo_refs(u8data):
    """Print u8data aligned with the callout streams produced by parse_refs.

    Doctest driver for parse_refs; no CtCDPI mask is applied (0 is passed).
    """
    lgth = len(u8data)
    (bit, EOF_mask) = bitutil.transpose_streams(u8data)
    (u8, control, lex) = byteclass.classify_bytes(bit)
    callouts = parse_refs(lex, 0)
    # The error stream is rendered with one extra position (lgth+1).
    bitutil.print_aligned_u8_byte_streams([
        ('input data', u8data),
        ('entity refs', bitutil.bitstream2string(callouts.GenRefs, lgth)),
        ('decimal char refs', bitutil.bitstream2string(callouts.DecRefs, lgth)),
        ('hex char refs', bitutil.bitstream2string(callouts.HexRefs, lgth)),
        ('ref delmask', bitutil.bitstream2string(callouts.delmask, lgth)),
        ('errors', bitutil.bitstream2string(callouts.error, lgth+1))])
292
293
class tag_callouts:
    """Result record for tag parsing (see parse_tags).

    Every field is a bitstream held as a Python integer and defaults to 0.
    """
    ElemNames = 0      # element names
    AttNames = 0       # attribute names
    AttVals = 0        # attribute values
    Tags = 0           # extent of start and empty-element tags
    EmptyTagMarks = 0  # set by parse_tags from the "/" of empty-element tags
    EmptyTagEnds = 0   # declared but not written by parse_tags
    EndTags = 0        # extent of end tags
    error = 0          # error positions

    # POTENTIAL ADDITIONAL FIELDS
    # StartTagEnds = 0
    # EmptyTagEnds = 0
    # EndTagEnds = 0
307
def parse_tags(lex, CtCDPI_mask, EOF_mask):
    """Parse start, empty and end tags, calling out element names, attribute
    names and values, empty tag positions, and tag extents.

    Parameters:
        lex         -- lexical item streams (LAngle_scope, LAngle, RAngle, Slash,
                       DQuote, SQuote, Equals, WS and NameScan are read)
        CtCDPI_mask -- bitstream of positions inside comments, CDATA sections
                       and PIs; "<" occurrences there do not start tags
        EOF_mask    -- bitstream used to confine the attribute-value scans

    Returns a tag_callouts instance with ElemNames, AttNames, AttVals,
    EmptyTagMarks, Tags, EndTags and error filled in.

    >>> demo_tags("<root><t1>text</t1><t2 a1='foo' a2 = 'fie'>more</t2><tag3 att3='b'/></root>")
    input data      : <root><t1>text</t1><t2 a1='foo' a2 = 'fie'>more</t2><tag3 att3='b'/></root>
    element names   : _1111__11___________11_______________________________1111__________________
    attribute names : _______________________11_______11________________________1111_____________
    attribute values: __________________________11111______11111_____________________111_________
    empty tag marks : ___________________________________________________________________1_______
    end tags        : _______________111______________________________111__________________11111_
    start/empty tags: _1111__11___________1111111111111111111111___________11111111111111________
    errors          : ____________________________________________________________________________

    Attributes can use double quotes.

    >>> demo_tags('<dquote_atts a1="1234" attribute2="4321"/>')
    input data      : <dquote_atts a1="1234" attribute2="4321"/>
    element names   : _11111111111______________________________
    attribute names : _____________11________1111111111_________
    attribute values: ________________111111____________111111__
    empty tag marks : _________________________________________1
    end tags        : __________________________________________
    start/empty tags: _1111111111111111111111111111111111111111_
    errors          : ___________________________________________

    Syntax errors of various types are identified with the error stream.

    1. Element name missing errors.

    >>> demo_tags("< noname='flawed'/> ")
    input data      : < noname='flawed'/>
    element names   : ____________________
    attribute names : __111111____________
    attribute values: _________11111111___
    empty tag marks : __________________1_
    end tags        : ____________________
    start/empty tags: _11111111111111111__
    errors          : _1___________________

    2. Missing attribute names.

    >>> demo_tags("<noatt ='flawed'/>  <one_att a1='good' = 'bad'> oops </one_att>")
    input data      : <noatt ='flawed'/>  <one_att a1='good' = 'bad'> oops </one_att>
    element names   : _11111_______________1111111___________________________________
    attribute names : _____________________________11________________________________
    attribute values: ________11111111________________111111___11111_________________
    empty tag marks : _________________1_____________________________________________
    end tags        : ______________________________________________________11111111_
    start/empty tags: _1111111111111111____1111111111111111111111111_________________
    errors          : _______1_______________________________1________________________

    3. Missing or incorrect = sign.

    >>> demo_tags('<errata plusforeq+"5678" noequals"90" />')
    input data      : <errata plusforeq+"5678" noequals"90" />
    element names   : _111111_________________________________
    attribute names : ________111111111________11111111_______
    attribute values: __________________111111__________111111
    empty tag marks : ________________________________________
    end tags        : ________________________________________
    start/empty tags: _111111111111111111111111111111111111111
    errors          : _________________1_______________11______

    4.  Missing whitespace

    >>> demo_tags("<jammed att='value'att2='v2' />")
    input data      : <jammed att='value'att2='v2' />
    element names   : _111111________________________
    attribute names : ________111________1111________
    attribute values: ____________1111111_____1111___
    empty tag marks : ______________________________1
    end tags        : _______________________________
    start/empty tags: _11111111111111111111111111111_
    errors          : ___________________1____________

    5.  Extra whitespace in an empty tag.

    >>> demo_tags("<extrawhite / >")
    input data      : <extrawhite / >
    element names   : _1111111111____
    attribute names : _______________
    attribute values: _______________
    empty tag marks : _____________1_
    end tags        : _______________
    start/empty tags: _111111111111__
    errors          : _____________1__

    6.  Unterminated or incorrectly terminated attribute values

    >>> demo_tags("<badattvalues a='blud<   b='455>   ")
    input data      : <badattvalues a='blud<   b='455>   
    element names   : _111111111111______________________
    attribute names : ______________1__________1_________
    attribute values: ________________111111_____11111111
    empty tag marks : ___________________________________
    end tags        : ___________________________________
    start/empty tags: _111111111111111111111_111111111111
    errors          : _____________________11____________1

    7.  Unterminated tags

    >>> demo_tags("<unterminated a='245'  ")
    input data      : <unterminated a='245' 
    element names   : _111111111111__________
    attribute names : ______________1________
    attribute values: ________________11111__
    empty tag marks : _______________________
    end tags        : _______________________
    start/empty tags: _1111111111111111111111
    errors          : _______________________1

    """
    callouts = tag_callouts()

    # Delimiters for scans.
    DQuoteScan = ~(lex.DQuote | lex.LAngle) & EOF_mask
    SQuoteScan = ~(lex.SQuote | lex.LAngle) & EOF_mask
    AttListDelim = lex.Slash | lex.RAngle

    # Start the parallel parsing by inspecting the character
    # after the opening "<" of a tag.
    LAngleFollow = lex.LAngle_scope & ~CtCDPI_mask
    ElemNamePositions = LAngleFollow & ~lex.Slash
    EndTagSeconds = LAngleFollow & lex.Slash

    # Start Tag/Empty Element Tag Parsing

    # Advance all cursors by scanning through the tag name.
    ElemNameFollows = bitutil.ScanThru(ElemNamePositions, lex.NameScan)
    # Must have at least one name character for a legal start tag.
    # Mark any occurrences of null names as errors.
    ParseError = ElemNamePositions & ElemNameFollows
    callouts.ElemNames = ElemNameFollows - ElemNamePositions

    # Initialize the accumulators for attribute name and value positions.
    AttNameStarts = 0
    AttNameFollows = 0
    EqToCheck = 0
    AttValStarts = 0
    AttValEnds = 0
    AttValFollows = 0

    # After the element name, there may or may not be an attlist.
    AfterWS = bitutil.ScanThru(ElemNameFollows, lex.WS)
    AttListEnd = AfterWS & AttListDelim
    AttNameStart = AfterWS & ~AttListDelim
    # At least one WS character is required between ElemNames and AttNames.
    ParseError |= ElemNameFollows & AttNameStart

    #
    # The following loop iterates through attributes within a start tag.
    # Because all start tags are processed in parallel, the number of
    # iterations is the maximum number of attributes found in any one
    # start tag, plus one.
    while AttNameStart:
        AttNameStarts |= AttNameStart
        AttNameFollow = bitutil.ScanThru(AttNameStart, lex.NameScan)
        AttNameFollows |= AttNameFollow
        # Scan through WS to the expected '=' delimiter.
        EqExpected = bitutil.ScanThru(AttNameFollow, lex.WS)
        EqToCheck |= EqExpected
        AttValPos = bitutil.ScanThru(bitutil.Advance(EqExpected), lex.WS)
        AttValStarts |= AttValPos
        # Scan each value to its matching quote; the scan streams exclude
        # "<", so a "<" inside a value also stops the scan.
        DQuoteAttVal = AttValPos & lex.DQuote
        SQuoteAttVal = AttValPos & lex.SQuote
        DQuoteAttEnd = bitutil.ScanThru(bitutil.Advance(DQuoteAttVal), DQuoteScan)
        SQuoteAttEnd = bitutil.ScanThru(bitutil.Advance(SQuoteAttVal), SQuoteScan)
        AttValEnd = DQuoteAttEnd | SQuoteAttEnd
        AttValEnds |= AttValEnd
        AttValFollow = bitutil.Advance(AttValEnd)
        AttValFollows |= AttValFollow
        # Either the attribute list ends here or another attribute begins.
        AfterWS = bitutil.ScanThru(AttValFollow, lex.WS)
        AttListEnd |= AfterWS & AttListDelim
        AttNameStart = AfterWS & ~AttListDelim

    # No more attribute values to process when AttNameStart == 0.

    callouts.AttNames = AttNameFollows - AttNameStarts
    callouts.AttVals = AttValFollows - AttValStarts
    STagEnds = AttListEnd & lex.RAngle
    # Mark any "/" characters found as the ends of empty element tags.
    callouts.EmptyTagMarks = bitutil.Advance(AttListEnd & lex.Slash)
    callouts.Tags = (STagEnds | callouts.EmptyTagMarks) - ElemNamePositions

    # Check for errors.
    ParseError |= AttValFollows & AttNameStarts  # No intervening WS.
    ParseError |= AttNameStarts & AttNameFollows  # Null AttName
    ParseError |= EqToCheck & ~lex.Equals  # = not found where expected.
    ParseError |= AttValStarts & ~(lex.DQuote | lex.SQuote)
    ParseError |= AttValEnds & ~(lex.DQuote | lex.SQuote)
    ParseError |= callouts.EmptyTagMarks & ~lex.RAngle

    # End Tag Parsing
    EndTagEnds = bitutil.ScanThru(
        bitutil.ScanThru(bitutil.Advance(EndTagSeconds), lex.NameScan), lex.WS)
    ParseError |= EndTagEnds & ~lex.RAngle
    callouts.EndTags = EndTagEnds - EndTagSeconds
    callouts.error = ParseError

    # POTENTIAL ADDITIONAL FIELDS
    # callouts.StartTagEnds = STagEnds
    # callouts.EmptyTagEnds = bitutil.Advance(callouts.EmptyTagMarks)
    # callouts.EndTagEnds = EndTagEnds

    return callouts
513
def demo_tags(u8data):
    """Print u8data aligned with the callout streams produced by parse_tags.

    Doctest driver for parse_tags; comments, CDATA sections and PIs are
    parsed first so that their mask can be passed to parse_tags.
    """
    lgth = len(u8data)
    (bit, EOF_mask) = bitutil.transpose_streams(u8data)
    (u8, control, lex) = byteclass.classify_bytes(bit)
    lex = add_multiliterals(lex)
    markup1 = parse_CtCDPI(lex, EOF_mask)
    callouts = parse_tags(lex, markup1.CtCDPI_mask, EOF_mask)
    # The error stream is rendered with one extra position (lgth+1).
    bitutil.print_aligned_u8_byte_streams([
        ('input data', u8data),
        ('element names', bitutil.bitstream2string(callouts.ElemNames, lgth)),
        ('attribute names', bitutil.bitstream2string(callouts.AttNames, lgth)),
        ('attribute values', bitutil.bitstream2string(callouts.AttVals, lgth)),
        ('empty tag marks', bitutil.bitstream2string(callouts.EmptyTagMarks, lgth)),
        ('end tags', bitutil.bitstream2string(callouts.EndTags, lgth)),
        ('start/empty tags', bitutil.bitstream2string(callouts.Tags, lgth)),
        ('errors', bitutil.bitstream2string(callouts.error, lgth+1))])
529
530
531
def validate_no_CD_end(lex, markup1, tags):
    """Find illegal occurrences of ]]> in text (outside of markup).

    Parameters:
        lex     -- object supplying the CD_end bitstream
        markup1 -- object supplying CtCDPI_mask (comments, CDATA sections, PIs)
        tags    -- object supplying Tags (extent of start and empty tags)

    Returns the bitstream of CD_end positions covered by neither mask.

    >>> demo_validate_no_CD_end(' <!-- OK: ]]>  --> <![CDATA OK  ]]>  ]]> <tag att=" ]]> "/> ]]>  <?php ]]> ?> ')
    input data :  <!-- OK: ]]>  --> <![CDATA OK  ]]>  ]]> <tag att=" ]]> "/> ]]>  <?php ]]> ?>
    CtCDPI_mask: __1111111111111111__111111111111111_______________________________11111111111_
    tags       : __________________________________________1111111111111111____________________
    illegal ]]>: _______________________________________1______________________1_______________
    """
    inside_markup = markup1.CtCDPI_mask | tags.Tags
    return lex.CD_end & ~inside_markup
543
def demo_validate_no_CD_end(u8data):
    """Print u8data aligned with its markup masks and illegal "]]>" positions.

    Doctest driver for validate_no_CD_end.
    """
    lgth = len(u8data)
    (bit, EOF_mask) = bitutil.transpose_streams(u8data)
    (u8, control, lex) = byteclass.classify_bytes(bit)
    lex = add_multiliterals(lex)
    markup1 = parse_CtCDPI(lex, EOF_mask)
    tags = parse_tags(lex, markup1.CtCDPI_mask, EOF_mask)
    error = validate_no_CD_end(lex, markup1, tags)
    bitutil.print_aligned_u8_byte_streams([
        ('input data', u8data),
        ('CtCDPI_mask', bitutil.bitstream2string(markup1.CtCDPI_mask, lgth)),
        ('tags', bitutil.bitstream2string(tags.Tags, lgth)),
        ('illegal ]]>', bitutil.bitstream2string(error, lgth))])
556
557
558
559def main(u8data):
560        # Transpose to parallel bit streams and prepare an EOF mask.
561        (bit, EOF_mask) = bitutil.transpose_streams(u8data)
562
563        # Classify bytes for UTF-8 processing, whitespace and control
564        # processing and XML lexical analysis.
565        (u8, control, lex) = byteclass.classify_bytes(bit)
566        #(u8, control, lex) = byteclass.classify_bytes_with_shift1opt(bit)
567
568        # Validate UTF-8 multibyte sequences and determine the UTF-8 scope streams.
569        u8 = u8u16.validate_utf8(u8)
570
571        # Rule out the illegal characters for XML.
572        xmlchar_error = validate_xmlchar(u8, control, lex, EOF_mask)
573
574        # Find and normalize bare CR or CRLF combinations.
575        # Not needed for xmlwf
576        #(control, bit) = normalize_line_breaks(control, bit)
577
578        # Compute XML multilterals such as <?, </, --, ]]>.
579        lex = add_multiliterals(lex)
580       
581       
582        # THE FOLLOWING FUNCTIONAL CALL IS MANUALLY INLINED
583        # Parse all comments, CDATA sections and processing instructions.
584        #markup1 = parse_CtCDPI(lex, EOF_mask)
585        #CT_callouts = CtCDPI_callouts()
586        PI_starts = 0
587        PI_ends = 0
588        Ct_starts = 0
589        Ct_ends = 0
590        CD_starts = 0
591        CD_ends = 0
592        CtCDPI_starts = 0
593        PI_name_ends = 0
594        # Scanning streams
595        CtCDPI_scan = ~(lex.CtCD_start | lex.PI_start) & EOF_mask
596        Ct_end_scan = ~lex.DoubleHyphen & EOF_mask
597        CD_end_scan = ~lex.CD_end & EOF_mask
598        PI_end_scan = ~lex.PI_end & EOF_mask
599        #
600        # Initiate the scan
601        CtCDPI_Cursor = 1
602        CtCDPI_Cursor = bitutil.ScanThru(CtCDPI_Cursor, CtCDPI_scan)
603        CtCDPI_Cursor &= EOF_mask
604        while CtCDPI_Cursor:
605                CtCDPI_starts |= CtCDPI_Cursor
606                PI_Cursor = CtCDPI_Cursor & lex.PI_start
607                CD_Ct_Cursor = bitutil.Advance(CtCDPI_Cursor & ~PI_Cursor)
608                CD_Cursor = CD_Ct_Cursor & lex.LBracket
609                Ct_Cursor = bitutil.Advance(CD_Ct_Cursor & lex.Hyphen) 
610                PI_starts |= PI_Cursor
611                CD_starts |= CD_Cursor
612                Ct_starts |= Ct_Cursor
613                Ct_Cursor = bitutil.Advance(Ct_Cursor)
614                Ct_end_scan |= Ct_Cursor
615                #PI_Cursor = bitutil.ScanThru(PI_Cursor, PI_end_scan)
616                PI_name_end = bitutil.ScanThru( bitutil.Advance(PI_Cursor), lex.NameScan)
617                PI_name_ends |= PI_name_end
618                CT_callouts.PI_name |= PI_name_end - PI_Cursor
619                PI_Cursor = bitutil.ScanThru(PI_name_end, PI_end_scan)
620                CD_Cursor = bitutil.ScanThru(CD_Cursor, CD_end_scan)
621                Ct_Cursor = bitutil.Advance(bitutil.ScanThru(Ct_Cursor, Ct_end_scan))
622                PI_ends |= PI_Cursor
623                CD_ends |= CD_Cursor
624                Ct_ends |= Ct_Cursor
625                CtCDPI_Cursor = PI_Cursor | CD_Cursor | Ct_Cursor
626                CtCDPI_Cursor = bitutil.ScanThru(CtCDPI_Cursor, CtCDPI_scan)
627                CtCDPI_Cursor &= EOF_mask
628        # End of loop: no remaining CtCDPI_Cursor
629        #Not needed for xmlwf
630        #CT_callouts.CD_span = CD_ends - CD_starts
631        #CT_callouts.Ct_span = Ct_ends - Ct_starts
632        #CT_callouts.PI_span = PI_ends - PI_starts
633       
634        CT_callouts.CtCDPI_mask = bitutil.Advance(CD_ends | Ct_ends | PI_ends) - CtCDPI_starts
635        CT_callouts.error = Ct_ends & ~lex.RAngle | Ct_starts & ~ lex.Hyphen
636        CT_callouts.error |= bitutil.Advance(PI_name_ends & ~ lex.WS) & ~ lex.PI_end
637        # If any of the Comment, CDATA or PI markups are unterminated, it is an error.
638        CT_callouts.error |= CT_callouts.CtCDPI_mask &~ EOF_mask
639        ########## END OF MANUAL INLINING
640       
641        # THE FOLLOWING FUNCTIONAL CALL IS MANUALLY INLINED
642        # All remaining "<" must be tag start characters; parse tags.
643        #tags = parse_tags(lex, CT_callouts.CtCDPI_mask, EOF_mask)
644
645        #callouts = tag_callouts()
646       
647        # Delimiters for scans.
648        DQuoteScan = ~(lex.DQuote | lex.LAngle) & EOF_mask
649        SQuoteScan = ~(lex.SQuote | lex.LAngle) & EOF_mask
650        AttListDelim = lex.Slash | lex.RAngle
651       
652        # Start the parallel parsing by inspecting the character
653        # after the opening "<" of a tag.
654        LAngleFollow = bitutil.Advance(lex.LAngle) &~ CT_callouts.CtCDPI_mask
655        ElemNamePositions = LAngleFollow & ~lex.Slash
656        EndTagSeconds = LAngleFollow & lex.Slash
657       
658        # Start Tag/Empty Element Tag Parsing
659
660        # Advance all cursors by scanning through the tag name.
661        ElemNameFollows = bitutil.ScanThru(ElemNamePositions, lex.NameScan)
662        # Must have at least one name character for a legal start tag.
663        # Mark any occurrences of null names as errors.
664        ParseError = ElemNamePositions & ElemNameFollows
665        callouts.ElemNames = ElemNameFollows - ElemNamePositions
666       
667        # Initialize the accumulators for attribute name and value positions.
668        AttNameStarts = 0 
669        AttNameFollows = 0
670        EqToCheck = 0
671        AttValStarts = 0
672        AttValEnds = 0
673        AttValFollows = 0
674
675        # After the element name, there may or may not be an attlist.
676        AfterWS = bitutil.ScanThru(ElemNameFollows, lex.WS)
677        AttListEnd = AfterWS & AttListDelim
678        AttNameStart = AfterWS & ~AttListDelim
679        # At least one WS character is required between ElemNames and AttNames.
680        ParseError |= ElemNameFollows & AttNameStart
681
682        #
683        # The following loop iterates through attributes within a start tag.
684        # Because all start tags are processed in parallel, the number of
685        # iterations is the maximum number of attributes found in any one
686        # start tag, plus one.
687        while AttNameStart:
688                AttNameStarts |= AttNameStart
689                AttNameFollow = bitutil.ScanThru(AttNameStart, lex.NameScan)
690                AttNameFollows |= AttNameFollow
691                # Scan through WS to the expected '=' delimiter.
692                EqExpected = bitutil.ScanThru(AttNameFollow, lex.WS)
693                EqToCheck |= EqExpected
694                AttValPos = bitutil.ScanThru(bitutil.Advance(EqExpected), lex.WS)
695                AttValStarts |= AttValPos
696                DQuoteAttVal = AttValPos & lex.DQuote
697                SQuoteAttVal = AttValPos & lex.SQuote
698                DQuoteAttEnd = bitutil.ScanThru(bitutil.Advance(DQuoteAttVal), DQuoteScan)
699                SQuoteAttEnd = bitutil.ScanThru(bitutil.Advance(SQuoteAttVal), SQuoteScan)
700                AttValEnd = DQuoteAttEnd | SQuoteAttEnd
701                AttValEnds |= AttValEnd
702                AttValFollow = bitutil.Advance(AttValEnd)
703                AttValFollows |= AttValFollow
704                AfterWS = bitutil.ScanThru(AttValFollow, lex.WS)
705                AttListEnd |= AfterWS & AttListDelim
706                AttNameStart = AfterWS & ~AttListDelim
707
708        # No more attribute values to process when AttNameStart == 0.
709        # Not needed for xmlwf
710        callouts.AttNames = AttNameFollows - AttNameStarts
711        #callouts.AttVals = AttValFollows - AttValStarts
712        STagEnds = AttListEnd & lex.RAngle
713        # Mark any "/" characters found as the ends of empty element tags.
714        callouts.EmptyTagMarks = bitutil.Advance(AttListEnd & lex.Slash)
715        # Not needed for xmlwf
716        #callouts.Tags = (STagEnds | callouts.EmptyTagMarks) - ElemNamePositions
717       
718        # Check for errors.
719        ParseError |= AttValFollows & AttNameStarts # No intervening WS.
720        ParseError |= AttNameStarts & AttNameFollows # Null AttName
721        ParseError |= EqToCheck & ~lex.Equals # = not found where expected.
722        ParseError |= AttValStarts & ~ (lex.DQuote | lex.SQuote)
723        ParseError |= AttValEnds & ~ (lex.DQuote | lex.SQuote)
724        ParseError |= callouts.EmptyTagMarks & ~lex.RAngle
725
726        # End Tag Parsing
727        EndTagEnds = bitutil.ScanThru(bitutil.ScanThru(bitutil.Advance(EndTagSeconds), lex.NameScan), lex.WS)
728        ParseError |= EndTagEnds & ~lex.RAngle
729        # Not needed for xmlwf
730        #callouts.EndTags = EndTagEnds - EndTagSeconds
731        callouts.error = ParseError
732        ########## END OF MANUAL INLINING
733
734
735
736
737        # All remaining "&" must be reference start characters; parse them.
738        refs = parse_refs(lex, CT_callouts.CtCDPI_mask)
739
740        # Ensure that no occurrence of ]]> occurs outside of markup.
741        CD_end_error = validate_no_CD_end(lex, CT_callouts, callouts)
742
743        # Convert to UTF-16 bit streams.
744        #(u16hi, u16lo, u16delmask) = u8u16.u8u16(u8, bit)
745
746        # Consolidate and check for errors
747        error_mask = u8.error | xmlchar_error | CT_callouts.error | callouts.error | CD_end_error | refs.error
748
749        # Consolidate the deletion_masks
750        #delmask = control.CRLF | refs.delmask | u16delmask # | CT_callouts.CDATA_delimiters
751        #Not needed for xmlwf
752        delmask = control.CRLF | refs.delmask  # | CT_callouts.CDATA_delimiters
753       
754        qname_stream =  callouts.ElemNames | callouts.AttNames
755        ncname_stream = CT_callouts.PI_name | refs.GenRefs
756        name_stream = qname_stream | ncname_stream
757        name_start = name_stream &~ bitutil.Advance(name_stream)
758        name_start_check = name_start & ~lex.ASCII_name_start
759        name_check = (name_stream &~ name_start | nmtoken_stream) & ~lex.ASCII_name_char & ~u8.suffix
760
761        #return (CT_callouts, callouts, refs, u16hi, u16lo, delmask, error, lex, u16delmask, EOF_mask)
762        return (CT_callouts, callouts, refs, delmask, error, lex, EOF_mask, name_check, name_start_check, control)
763
def demo_parabix(u8data):
        """Parse u8data with parabix_parse and print the resulting streams.

        Every displayed bit stream is rendered with bitutil.bitstream2string
        and handed, together with the raw input and its high/low nybble
        streams, to bitutil.print_aligned_u8_byte_streams.

        u8data -- the input data to parse.
        Returns None.
        """
        lgth = len(u8data)

        # The unpacking order must mirror the return statement of
        # parabix_parse:
        #   (CT_callouts, callouts, refs, delmask, error, lex, EOF_mask,
        #    name_check, name_start_check, control)
        # The UTF-16 conversion is commented out in parabix_parse, so there
        # is no u16hi/u16lo/u16delmask to receive or to display here.
        # Names with a leading underscore are returned but not displayed.
        (markup1, tags, refs, delmask, error, _lex, _EOF_mask,
         _name_check, _name_start_check, _control) = parabix_parse(u8data)

        # NOTE(review): the assignments of CD_span, Ct_span, PI_span, AttVals,
        # EndTags and Tags are commented out in parabix_parse ("Not needed
        # for xmlwf"); the rows below show whatever the callout objects
        # already hold for them -- confirm those attributes are still defined.
        bitutil.print_aligned_u8_byte_streams([('input data', u8data),
                              ('input high nybbles', bitutil.high_nybble_stream(u8data)),
                              ('input low nybbles', bitutil.low_nybble_stream(u8data)),
                              ('CD_span', bitutil.bitstream2string(markup1.CD_span, lgth)),
                              ('Ct_span', bitutil.bitstream2string(markup1.Ct_span, lgth)),
                              ('PI_span', bitutil.bitstream2string(markup1.PI_span, lgth)),
                              ('CtCDPI_mask', bitutil.bitstream2string(markup1.CtCDPI_mask, lgth)),
                              ('entity refs', bitutil.bitstream2string(refs.GenRefs, lgth)),
                              ('decimal char refs', bitutil.bitstream2string(refs.DecRefs, lgth)),
                              ('hex char refs', bitutil.bitstream2string(refs.HexRefs, lgth)),
                              ('element names', bitutil.bitstream2string(tags.ElemNames, lgth)),
                              ('attribute names', bitutil.bitstream2string(tags.AttNames, lgth)),
                              ('attribute values', bitutil.bitstream2string(tags.AttVals, lgth)),
                              ('empty tag marks', bitutil.bitstream2string(tags.EmptyTagMarks, lgth)),
                              ('end tags', bitutil.bitstream2string(tags.EndTags, lgth)),
                              ('start/empty tags', bitutil.bitstream2string(tags.Tags, lgth)),
                              ('delmask', bitutil.bitstream2string(delmask, lgth)),
                              # One extra position so that an error flagged
                              # just past the last input byte is visible.
                              ('errors', bitutil.bitstream2string(error, lgth+1))])
788
def demo_u16delmask(u8data):
        """Run the UTF-8 to UTF-16 conversion on u8data and print its deletion mask.

        The input is transposed to parallel bit streams, classified,
        validated as UTF-8 and converted to UTF-16 bit streams.  The input
        data is then printed together with the 'u16delmask' and 'errors'
        streams.

        u8data -- the input data to convert.
        Returns None.
        """
        n_bytes = len(u8data)

        # Parallel bit streams for the input; the EOF mask that comes with
        # them is not needed by this demo.
        bit, _eof_mask = bitutil.transpose_streams(u8data)

        # Byte classification; only the UTF-8 class streams are used below.
        u8, _control, _lex = byteclass.classify_bytes(bit)

        # UTF-8 validation replaces u8 with the validated stream set.
        u8 = u8u16.validate_utf8(u8)

        # UTF-16 high/low bit streams plus the mask used to filter bytes.
        u16hi, u16lo, delmask = u8u16.u8u16(u8, bit)

        # Back to byte form, high half first and then low half, each
        # filtered with the deletion mask.
        hi_unfiltered = bitutil.inverse_transpose(u16hi, n_bytes)
        hi_bytes = bitutil.filter_bytes(hi_unfiltered, delmask)
        lo_unfiltered = bitutil.inverse_transpose(u16lo, n_bytes)
        lo_bytes = bitutil.filter_bytes(lo_unfiltered, delmask)

        # Merge the two halves into one UTF-16 buffer and decode it.  The
        # decoded text is not used further; the call is kept so that a
        # decoding failure still surfaces here.
        u16_buffer = bitutil.merge_bytes(lo_bytes, hi_bytes)
        _u16_text = u16_buffer.decode('utf16')

        # The error stream is rendered one position longer than the input.
        report = [('input data', u8data),
                  ('u16delmask', bitutil.bitstream2string(delmask, n_bytes)),
                  ('errors', bitutil.bitstream2string(u8.error, n_bytes + 1))]
        bitutil.print_aligned_u8_byte_streams(report)
819
# Script entry point: run the module's doctests, then, when a file name is
# given on the command line, read that file and run the full parsing demo.
if __name__ == "__main__":
        import doctest
        # sys is imported here because the file-level "import sys" is
        # commented out; without it the sys.argv accesses below raise
        # NameError.
        import sys
        doctest.testmod()

        if len(sys.argv) > 1:
                # NOTE(review): the file-level "import bitutil" is commented
                # out as well -- confirm how bitutil is brought into scope.
                u8data = bitutil.readfile(sys.argv[1])
                # Alternative demo calls, left disabled:
#               demo_validate_xmlchar(u8data)
#               demo_line_breaks(u8data)
#               demo_multiliterals(u8data)
#               demo_CtCDPI(u8data)
#               demo_refs(u8data)
#               demo_tags(u8data)
#               demo_validate_no_CD_end(u8data)
#               demo_u16delmask(u8data)
                demo_parabix(u8data)
        else:
                print("Usage: python parabix2.py <file>")
838               
839 
840       
841       
Note: See TracBrowser for help on using the repository browser.