source: proto/pabloj/trunk/input/test/pabloS/proto/parabix2_pablo_r2797.py @ 3235

Last change on this file since 3235 was 3235, checked in by ksherdy, 6 years ago

Minor fix. Suppressed warnings.

File size: 21.0 KB
Line 
# -*- coding: utf-8 -*-
#
# parabix2_compilable.py
#
# Parallel XML Parsing with Bitstream Addition
#
# - Complete prototype for all bitstream computations in Parabix2
# - Optimized for compilation
# - Separate compilation

# Robert D. Cameron
# July 29, 2010
#

#import bitutil
16
17class Basis_bits():   
18    bit_0 = 0
19    bit_1 = 0
20    bit_2 = 0
21    bit_3 = 0
22    bit_4 = 0
23    bit_5 = 0
24    bit_6 = 0
25    bit_7 = 0
26
27
28class U8 ():
29  unibyte = 0
30  prefix = 0
31  prefix2 = 0
32  prefix3 = 0
33  prefix4 = 0
34  suffix = 0
35  badprefix = 0
36  xE0 = 0
37  xED = 0
38  xF0 = 0
39  xF4 = 0
40  xA0_xBF = 0
41  x80_x9F = 0
42  x90_xBF = 0
43  x80_x8F = 0
44  xEF = 0
45  xBF = 0
46  xBE = 0
47  scope22 = 0
48  scope32 = 0
49  scope33 = 0
50  scope42 = 0
51  scope43 = 0
52  scope44 = 0
53  xE0_scope = 0
54  xED_scope = 0
55  xF0_scope = 0
56  xF4_scope = 0
57  xEF_scope = 0
58
59class Lex ():
60    CR = 0
61    LF = 0
62    HT = 0
63    SP = 0
64    CRLF = 0
65    RefStart = 0
66    Semicolon = 0 
67    Colon = 0
68    LAngle = 0
69    RAngle = 0
70    LBracket = 0
71    RBracket = 0
72    Exclam = 0
73    QMark = 0
74    Hyphen = 0
75    Equals = 0
76    SQuote = 0
77    DQuote = 0
78    Slash = 0
79    Hash = 0
80    x = 0
81    ASCII_name_start = 0
82    ASCII_name_char = 0
83    NameScan = 0
84    Digit = 0
85    Hex = 0
86    WS = 0
87
88class Marker ():
89    LAngle_scope = 0
90    Ref_opener = 0
91    CD_closer = 0
92
93class CtCDPI_Callouts():
94    Ct_starts = 0
95    Ct_ends = 0
96    CD_starts = 0
97    CD_ends = 0
98    PI_starts = 0
99    PI_name_starts = 0
100    PI_name_ends = 0
101    PI_ends = 0
102
103class Ref_Callouts():
104    GenRef_starts = 0
105    GenRef_ends = 0
106    DecRef_starts = 0
107    DecRef_ends = 0
108    HexRef_starts = 0
109    HexRef_ends = 0
110
111class Tag_Callouts():
112    ElemName_starts = 0
113    ElemName_ends = 0
114    AttName_starts = 0
115    AttName_ends = 0
116    AttVal_starts = 0
117    AttVal_ends = 0
118    AttVal_spans = 0
119    EmptyTag_marks = 0
120    EndTag_marks = 0
121   
122class Check_streams():
123    misc_mask = 0
124    non_ascii_name_starts = 0
125    non_ascii_names = 0
126    tag_marks = 0 
127    name_follows = 0 
128    att_refs = 0 
129
130def Classify_bytes_Validate_utf8(basis_bits, lex, u8): 
131    temp1 = (basis_bits.bit_0 | basis_bits.bit_1);
132    temp2 = (basis_bits.bit_2 &~ basis_bits.bit_3);
133    temp3 = (temp2 &~ temp1);
134    temp4 = (basis_bits.bit_5 &~ basis_bits.bit_4);
135    temp5 = (basis_bits.bit_6 &~ basis_bits.bit_7);
136    temp6 = (temp4 & temp5);
137    lex.RefStart = (temp3 & temp6);
138    temp7 = (basis_bits.bit_2 & basis_bits.bit_3);
139    temp8 = (temp7 &~ temp1);
140    temp9 = (basis_bits.bit_4 &~ basis_bits.bit_5);
141    temp10 = (basis_bits.bit_6 & basis_bits.bit_7);
142    temp11 = (temp9 & temp10);
143    lex.Semicolon = (temp8 & temp11);
144    temp12 = (basis_bits.bit_4 & basis_bits.bit_5);
145    temp13 = (basis_bits.bit_6 | basis_bits.bit_7);
146    temp14 = (temp12 &~ temp13);
147    lex.LAngle = (temp8 & temp14);
148    temp15 = (temp12 & temp5);
149    lex.RAngle = (temp8 & temp15);
150    temp16 = (basis_bits.bit_1 &~ basis_bits.bit_0);
151    temp17 = (basis_bits.bit_3 &~ basis_bits.bit_2);
152    temp18 = (temp16 & temp17);
153    lex.LBracket = (temp18 & temp11);
154    temp19 = (basis_bits.bit_7 &~ basis_bits.bit_6);
155    temp20 = (temp12 & temp19);
156    lex.RBracket = (temp18 & temp20);
157    temp21 = (basis_bits.bit_4 | basis_bits.bit_5);
158    temp22 = (temp19 &~ temp21);
159    lex.Exclam = (temp3 & temp22);
160    temp23 = (temp12 & temp10);
161    lex.QMark = (temp8 & temp23);
162    lex.Hyphen = (temp3 & temp20);
163    lex.Equals = (temp8 & temp20);
164    temp24 = (temp4 & temp10);
165    lex.SQuote = (temp3 & temp24);
166    temp25 = (temp5 &~ temp21);
167    lex.DQuote = (temp3 & temp25);
168    lex.Slash = (temp3 & temp23);
169    temp26 = (temp10 &~ temp21);
170    lex.Hash = (temp3 & temp26);
171    temp27 = (temp16 & temp7);
172    temp28 = (temp9 &~ temp13);
173    lex.x = (temp27 & temp28);
174    temp29 = (temp9 & temp5);
175    lex.Colon = (temp8 & temp29);
176    temp30 = (temp18 & temp23);
177    temp31 = (temp30 | lex.Colon);
178    temp32 = (temp16 &~ basis_bits.bit_2);
179    temp33 = (basis_bits.bit_5 | temp10);
180    temp34 = (basis_bits.bit_4 & temp33);
181    temp35 = (~temp34);
182    temp36 = (temp21 | temp13);
183    temp37 = ((basis_bits.bit_3 & temp35)|(~(basis_bits.bit_3) & temp36));
184    temp38 = (temp32 & temp37);
185    temp39 = (temp31 | temp38);
186    temp40 = (temp16 & basis_bits.bit_2);
187    temp41 = (temp40 & temp37);
188    lex.ASCII_name_start = (temp39 | temp41);
189    temp42 = (temp30 | lex.Hyphen);
190    temp43 = (temp3 & temp15);
191    temp44 = (temp42 | temp43);
192    temp45 = (temp8 &~ temp34);
193    temp46 = (temp44 | temp45);
194    temp47 = (temp46 | temp38);
195    lex.ASCII_name_char = (temp47 | temp41);
196    lex.NameScan = (lex.ASCII_name_char | basis_bits.bit_0);
197    temp48 = (temp1 | basis_bits.bit_2);
198    x00_x1F = (~temp48);
199    temp49 = (basis_bits.bit_2 | basis_bits.bit_3);
200    temp50 = (temp1 | temp49);
201    lex.CR = (temp20 &~ temp50);
202    lex.LF = (temp29 &~ temp50);
203    temp51 = (temp9 & temp19);
204    lex.HT = (temp51 &~ temp50);
205    lex.SP = (temp3 &~ temp36);
206    temp52 = (temp20 | temp29);
207    temp53 = (temp52 | temp51);
208    temp54 = (temp53 &~ temp50);
209    lex.WS = (temp54 | lex.SP);
210    temp55 = (basis_bits.bit_5 | basis_bits.bit_6);
211    temp56 = (basis_bits.bit_4 & temp55);
212    lex.Digit = (temp8 &~ temp56);
213    temp57 = (temp16 &~ temp49);
214    temp58 = (temp57 &~ basis_bits.bit_4);
215    temp59 = (~temp10);
216    temp60 = ((basis_bits.bit_5 & temp59)|(~(basis_bits.bit_5) & temp13));
217    temp61 = (temp58 & temp60);
218    temp62 = (lex.Digit | temp61);
219    temp63 = (temp16 & temp2);
220    temp64 = (temp63 &~ basis_bits.bit_4);
221    temp65 = (temp64 & temp60);
222    lex.Hex = (temp62 | temp65);
223    lex_error = x00_x1F &~ lex.WS
224    pablo.assert_0(pablo.inFile(lex_error), "Error: illegal character")
225       
226   
227    ### Validate_utf8(basis_bits, u8):
228    u8.unibyte = (~basis_bits.bit_0);
229    u8.suffix = 0
230    u8_error = 0
231    u8_FFFE_FFFF = 0
232    u8anyscope = 0 #local
233    if basis_bits.bit_0:
234        u8.prefix = (basis_bits.bit_0 & basis_bits.bit_1);
235        u8.prefix2 = (u8.prefix &~ basis_bits.bit_2);
236        u8.prefix3 = (u8.prefix & temp2);
237        u8.prefix4 = (u8.prefix & temp7);
238        u8.suffix = (basis_bits.bit_0 &~ basis_bits.bit_1);
239        temp66 = (u8.prefix &~ temp49);
240        temp67 = (temp21 | basis_bits.bit_6);
241        temp68 = (temp66 &~ temp67);
242        temp69 = (basis_bits.bit_5 & temp13);
243        temp70 = (basis_bits.bit_4 | temp69);
244        temp71 = (u8.prefix4 & temp70);
245        u8.badprefix = (temp68 | temp71);
246        u8_error = u8.badprefix
247        u8.scope22 = pablo.Advance(u8.prefix2)
248        u8anyscope = u8.scope22
249        if u8.prefix3 | u8.prefix4:
250            xE0 = (u8.prefix3 &~ temp36);
251            xED = (u8.prefix3 & temp20);
252            xF0 = (u8.prefix4 &~ temp36);
253            temp72 = (temp4 &~ temp13);
254            xF4 = (u8.prefix4 & temp72);
255            u8.xA0_xBF = (u8.suffix & basis_bits.bit_2);
256            u8.x80_x9F = (u8.suffix &~ basis_bits.bit_2);
257            u8.x90_xBF = (u8.suffix & temp49);
258            u8.x80_x8F = (u8.suffix &~ temp49);
259            xEF = (u8.prefix3 & temp23);
260            temp73 = (u8.suffix & temp7);
261            u8.xBF = (temp73 & temp23);
262            u8.xBE = (temp73 & temp15);
263#
264
265#            scope3_32 = pablo.Advance32(u8.prefix3)
266#            scope4_32 = pablo.Advance32(u8.prefix4)
267#            u8.scope32 = interpose32(u8.prefix3, scope3_32, 1)
268#            u8.scope33 = interpose32(u8.prefix3, scope3_32, 2)
269#            u8.scope42 = interpose32(u8.prefix4, scope4_32, 1)
270#            u8.scope43 = interpose32(u8.prefix4, scope4_32, 2)
271#            u8.scope44 = interpose32(u8.prefix4, scope4_32, 3)
272#
273            u8.scope32 = pablo.Advance(u8.prefix3)
274            u8.scope33 = pablo.Advance(u8.scope32)
275            u8.scope42 = pablo.Advance(u8.prefix4)
276            u8.scope43 = pablo.Advance(u8.scope42)
277            u8.scope44 = pablo.Advance(u8.scope43)
278#
279#            u8.xE0_scope = pablo.Advance(xE0);
280#            u8.xED_scope = pablo.Advance(xED);
281#            u8.xF0_scope = pablo.Advance(xF0);
282#            u8.xF4_scope = pablo.Advance(xF4);
283            E0_F0_scope = pablo.Advance(xE0 | xF0)
284            ED_F4_scope = pablo.Advance(xED | xF4)
285            u8.xE0_scope = u8.scope32 & E0_F0_scope
286            u8.xED_scope = u8.scope32 & ED_F4_scope
287            u8.xF0_scope = u8.scope42 & E0_F0_scope
288            u8.xF4_scope = u8.scope42 & ED_F4_scope
289            u8.xEF_scope = pablo.Advance(xEF);
290
291            u8lastscope = u8.scope22 | u8.scope33 | u8.scope44
292            u8anyscope = u8lastscope | u8.scope32 | u8.scope42 | u8.scope43
293       
294            u8error1 = u8.xE0_scope & u8.x80_x9F
295            u8error2 = u8.xED_scope & u8.xA0_xBF
296            u8error3 = u8.xF0_scope & u8.x80_x8F
297            u8error4 = u8.xF4_scope & u8.x90_xBF
298   
299            u8_error |= u8error1 | u8error2 | u8error3 | u8error4
300
301            EF_BF_pending = pablo.Advance(u8.xEF_scope & u8.xBF)
302
303            u8_FFFE_FFFF = (EF_BF_pending & (u8.xBE | u8.xBF))
304        u8mismatch = u8anyscope ^ u8.suffix
305        pablo.assert_0(u8_error | u8mismatch | u8_FFFE_FFFF, "UTF-8 error found")
306           
307   
308def Parse_CtCDPI(lex, marker, ctCDPI_Callouts, check_streams):
309    ctCDPI_Callouts.Ct_starts = 0
310    ctCDPI_Callouts.Ct_ends = 0
311    ctCDPI_Callouts.CD_starts = 0
312    ctCDPI_Callouts.CD_ends = 0
313    ctCDPI_Callouts.PI_starts = 0
314    ctCDPI_Callouts.PI_name_starts = 0
315    ctCDPI_Callouts.PI_name_ends = 0
316    ctCDPI_Callouts.PI_ends = 0
317    CtCDPI_starts = 0
318    CtCDPI_ends = 0
319    ctCDPI_mask = 0
320
321    v = lex.LAngle | lex.Hyphen
322    w = lex.Hyphen | lex.QMark
323    v1 = pablo.Advance(v,1)
324    w1 = pablo.Advance(w,1)
325   
326    LAngle_scope = v1 &~ w1  #pablo.Advance(lex.LAngle)
327    PI_opener = LAngle_scope & lex.QMark
328    CtCD_opener= LAngle_scope & lex.Exclam
329    CtCDPI_opener = PI_opener | CtCD_opener
330
331    #DoubleHyphen = 0
332    CD_closer = 0
333    #PI_closer = 0
334   
335    #if lex.Hyphen: DoubleHyphen = pablo.Advance(lex.Hyphen) & lex.Hyphen
336    DoubleHyphen = v1 & w1 & lex.Hyphen
337    if lex.RBracket:
338        DoubleRBracket = pablo.Advance(lex.RBracket) & lex.RBracket
339        CD_closer = pablo.Advance(DoubleRBracket) & lex.RAngle
340    #if lex.QMark: PI_closer = pablo.Advance(lex.QMark) & lex.RAngle
341    PI_closer = w1 & ~v1 & lex.RAngle
342
343    #
344    # Initiate the scan
345    CtCDPI_Cursor = pablo.ScanToFirst(CtCDPI_opener)
346    while CtCDPI_Cursor:
347        CtCDPI_starts |= CtCDPI_Cursor
348        PI_Cursor = CtCDPI_Cursor & PI_opener
349        CD_Ct_Cursor = pablo.Advance(CtCDPI_Cursor & ~PI_Cursor)
350        CD_Cursor = CD_Ct_Cursor & lex.LBracket
351        Ct_Cursor = CD_Ct_Cursor & lex.Hyphen
352        # PI processing
353        if PI_Cursor:
354            ctCDPI_Callouts.PI_starts |= PI_Cursor
355            PI_Cursor = pablo.Advance(PI_Cursor)
356            ctCDPI_Callouts.PI_name_starts |= PI_Cursor
357            PI_name_end = pablo.ScanThru(PI_Cursor, lex.NameScan)
358            PI_error = PI_Cursor & PI_name_end
359            PI_noWS = PI_name_end & ~ lex.WS
360            PI_error |= PI_noWS &~ lex.QMark | pablo.Advance(PI_noWS) &~ PI_closer
361            pablo.assert_0(PI_error, "Error in PI syntax")
362            ctCDPI_Callouts.PI_name_ends |= PI_name_end
363            PI_Cursor = pablo.ScanTo(PI_name_end, PI_closer)
364            ctCDPI_Callouts.PI_ends |= PI_Cursor
365            CtCDPI_ends |= PI_Cursor
366
367        # CDATA section processing
368        if CD_Cursor:
369            ctCDPI_Callouts.CD_starts |= CD_Cursor
370            CD_Cursor = pablo.ScanTo(CD_Cursor, CD_closer)
371            ctCDPI_Callouts.CD_ends |= CD_Cursor
372            CtCDPI_ends |= CD_Cursor
373
374        # Comment processing
375        if Ct_Cursor:
376            ctCDPI_Callouts.Ct_starts |= Ct_Cursor
377            Ct_Cursor = pablo.Advance(Ct_Cursor) 
378            Ct_error = Ct_Cursor & ~ lex.Hyphen
379            # Advance twice past <!--, so that we don't treat <!---
380                # as being a terminated comment.
381            Ct_Cursor = pablo.Advance(pablo.Advance(Ct_Cursor))
382            Ct_Cursor = pablo.Advance(pablo.ScanTo(Ct_Cursor, DoubleHyphen))
383            pablo.assert_0(Ct_error | Ct_Cursor & ~ lex.RAngle, "Error in comment syntax")
384            ctCDPI_Callouts.Ct_ends |= Ct_Cursor
385            CtCDPI_ends |= Ct_Cursor
386
387        # Common processing
388        CtCDPI_Cursor = PI_Cursor | CD_Cursor | Ct_Cursor
389        ctCDPI_mask = pablo.InclusiveSpan(CtCDPI_starts, CtCDPI_ends)
390#        ctCDPI_mask |= (CtCDPI_ends - CtCDPI_starts) | CtCDPI_ends
391        # If any of the Comment, CDATA or PI markups are unterminated, it is an error.
392        pablo.assert_0(pablo.atEOF(ctCDPI_mask), "Error in comment, CDATA or processing instruction syntax")
393        CtCDPI_Cursor = pablo.ScanTo(CtCDPI_Cursor, CtCDPI_opener)   
394       
395#    check_streams.misc_mask = (lex.WS | lex.LAngle | (pablo.Advance(ctCDPI_Callouts.Ct_ends | ctCDPI_Callouts.PI_ends)  -(ctCDPI_Callouts.Ct_starts | ctCDPI_Callouts.PI_starts)) | CtCDPI_starts) & EOF_mask
396#  Following is slow
397    check_streams.misc_mask = (lex.WS | lex.LAngle | pablo.InclusiveSpan(ctCDPI_Callouts.Ct_starts | ctCDPI_Callouts.PI_starts, ctCDPI_Callouts.Ct_ends | ctCDPI_Callouts.PI_ends) | CtCDPI_starts) & EOF_mask
398
399
400    # Identify the remaining significant markers for XML processing.
401    marker.LAngle_scope = LAngle_scope &~ ctCDPI_mask
402    marker.Ref_opener = lex.RefStart &~ ctCDPI_mask
403    marker.CD_closer = CD_closer &~ ctCDPI_mask
404
405def Parse_tags(lex, marker, tag_Callouts):
406   
407    EqExpected = 0
408    AttListEnd = 0
409   
410    # Delimiters for scans.
411    DQuoteDelim = lex.DQuote | lex.LAngle
412    SQuoteDelim = lex.SQuote | lex.LAngle
413    AttListDelim = lex.Slash | lex.RAngle
414   
415    # Start the parallel parsing by inspecting the character
416    # after the opening "<" of a tag.
417    tag_Callouts.ElemName_starts = marker.LAngle_scope & ~lex.Slash
418    tag_Callouts.EndTag_marks = marker.LAngle_scope & lex.Slash
419   
420    # Start Tag/Empty Element Tag Parsing
421
422    # Advance all cursors by scanning through the tag name.
423    tag_Callouts.ElemName_ends = pablo.ScanThru(tag_Callouts.ElemName_starts, lex.NameScan)
424    # Must have at least one name character for a legal start tag.
425    # Mark any occurrences of null names as errors.
426    ParseError = tag_Callouts.ElemName_starts & tag_Callouts.ElemName_ends
427   
428    # Initialize the accumulators for attribute name and value positions.
429    tag_Callouts.AttName_starts = 0 
430    tag_Callouts.AttName_ends = 0
431    tag_Callouts.AttVal_starts = 0
432    tag_Callouts.AttVal_ends = 0
433    # After the element name, there may or may not be an attlist.
434    if tag_Callouts.ElemName_ends & lex.WS:
435        AfterWS = pablo.ScanThru(tag_Callouts.ElemName_ends, lex.WS)
436        AttListEnd = AfterWS & AttListDelim
437        AttNameStart = AfterWS & ~AttListDelim
438        #
439        # The following loop iterates through attributes within a start tag.
440        # Because all start tags are processed in parallel, the number of
441        # iterations is the maximum number of attributes found in any one
442        # start tag, plus one.
443        while AttNameStart:
444            ParseError |= AttNameStart &~ lex.NameScan
445            tag_Callouts.AttName_starts |= AttNameStart
446            AttNameFollow = pablo.ScanThru(AttNameStart, lex.NameScan)
447            tag_Callouts.AttName_ends |= AttNameFollow
448            # Scan through WS to the expected '=' delimiter.
449            # EqExpected = pablo.ScanThru(AttNameFollow, lex.WS)
450            # But use if test to optimize.
451            if AttNameFollow & lex.WS: 
452                EqExpected = pablo.ScanThru(AttNameFollow, lex.WS)
453            else: EqExpected = AttNameFollow
454            ParseError |= EqExpected &~ lex.Equals
455            AttValPos = pablo.AdvanceThenScanThru(EqExpected, lex.WS)
456#            AttValPos = pablo.ScanThru(EqExpected, EqExpected | lex.WS)
457            tag_Callouts.AttVal_starts |= AttValPos
458            DQuoteAttVal = AttValPos & lex.DQuote
459            SQuoteAttVal = AttValPos & lex.SQuote
460#            DQuoteAttEnd = pablo.ScanTo(DQuoteAttVal, DQuoteDelim &~ DQuoteAttVal)
461#            SQuoteAttEnd = pablo.ScanTo(SQuoteAttVal, SQuoteDelim &~ SQuoteAttVal)
462            DQuoteAttEnd = pablo.AdvanceThenScanTo(DQuoteAttVal, DQuoteDelim)
463            SQuoteAttEnd = pablo.AdvanceThenScanTo(SQuoteAttVal, SQuoteDelim)
464            AttValEnd = DQuoteAttEnd | SQuoteAttEnd
465            ParseError |= (AttValPos | AttValEnd) &~ (lex.DQuote | lex.SQuote)
466            AttValFollow = pablo.Advance(AttValEnd)
467            tag_Callouts.AttVal_ends |= AttValFollow
468            #  AfterWS = pablo.ScanThru(AttValFollow, lex.WS)
469            if AttValFollow & lex.WS: 
470                AfterWS = pablo.ScanThru(AttValFollow, lex.WS)
471                AttListEnd |= AfterWS & AttListDelim
472                AttNameStart = AfterWS & ~AttListDelim
473            else: 
474                AttListEnd |= AttValFollow & AttListDelim   
475                AttNameStart = AttValFollow & ~AttListDelim
476            ParseError |= AttValFollow & AttNameStart
477    else:
478        # No WS character after ElemName; must be at the end
479        AttListEnd = tag_Callouts.ElemName_ends & AttListDelim
480        ParseError |= tag_Callouts.ElemName_ends & ~AttListDelim
481
482    STagEnds = AttListEnd & lex.RAngle
483    # Mark any "/" characters found as the ends of empty element tags.
484    tag_Callouts.EmptyTag_marks = pablo.Advance(AttListEnd & lex.Slash)
485   
486    ParseError |= tag_Callouts.EmptyTag_marks & ~lex.RAngle
487
488    # End Tag Parsing
489
490    EndTagEnds = pablo.AdvanceThenScanThru(tag_Callouts.EndTag_marks, lex.NameScan)
491    if EndTagEnds & lex.WS:
492        EndTagEnds = pablo.ScanThru(EndTagEnds, lex.WS)
493    ParseError |= EndTagEnds & ~lex.RAngle
494    pablo.assert_0(ParseError, "Tag parsing error found")
495       
496       
497    # Attribute value spans
498#    tag_Callouts.AttVal_spans = tag_Callouts.AttVal_ends - tag_Callouts.AttVal_starts
499    tag_Callouts.AttVal_spans = pablo.SpanUpTo(tag_Callouts.AttVal_starts, tag_Callouts.AttVal_ends)
500
501def Parse_refs(lex, marker, ref_Callouts):
502    ref_Callouts.GenRef_starts = 0
503    ref_Callouts.GenRef_ends = 0
504    ref_Callouts.DecRef_starts = 0
505    ref_Callouts.DecRef_ends = 0
506    ref_Callouts.HexRef_starts = 0
507    ref_Callouts.HexRef_ends = 0
508    ref_error = 0
509
510    # All remaining "&" must be reference start characters; parse them.
511    if marker.Ref_opener:
512        Ref_scope = pablo.Advance(marker.Ref_opener)
513        NumRef2 = Ref_scope & lex.Hash
514        ref_Callouts.GenRef_starts = Ref_scope &~ lex.Hash
515        NumRef3 = pablo.Advance(NumRef2)
516        HexRef3 = NumRef3 & lex.x
517        ref_Callouts.DecRef_starts = NumRef3 &~ lex.x
518        ref_Callouts.HexRef_starts = pablo.Advance(HexRef3) 
519        ref_Callouts.GenRef_ends = pablo.ScanThru(ref_Callouts.GenRef_starts, lex.NameScan)
520        ref_Callouts.DecRef_ends = pablo.ScanThru(ref_Callouts.DecRef_starts, lex.Digit)
521        ref_Callouts.HexRef_ends = pablo.ScanThru(ref_Callouts.HexRef_starts, lex.Hex)
522        # Error checks
523        # At least one digit required for DecRef, one hex digit for HexRef.
524        ref_error1 = ref_Callouts.DecRef_starts &~ lex.Digit
525        ref_error2 = ref_Callouts.HexRef_starts &~ lex.Hex
526        # Semicolon terminator required (also covers unterminated at EOF).
527        ref_ends = ref_Callouts.GenRef_ends | ref_Callouts.DecRef_ends | ref_Callouts.HexRef_ends
528        ref_error3 = ref_ends &~ lex.Semicolon
529        pablo.assert_0(ref_error1 | ref_error2 | ref_error3, "Reference error found")
530           
531
532
533def Validate_xml_names(ctCDPI_Callouts, ref_Callouts, tag_Callouts, lex, u8, check_streams):
534#    PI_names = ctCDPI_Callouts.PI_name_ends - ctCDPI_Callouts.PI_name_starts
535#    GenRefs = ref_Callouts.GenRef_ends - ref_Callouts.GenRef_starts
536#    ElemNames = tag_Callouts.ElemName_ends - tag_Callouts.ElemName_starts
537#    AttNames = tag_Callouts.AttName_ends - tag_Callouts.AttName_starts
538    PI_names = pablo.SpanUpTo(ctCDPI_Callouts.PI_name_starts, ctCDPI_Callouts.PI_name_ends)
539    GenRefs = pablo.SpanUpTo(ref_Callouts.GenRef_starts, ref_Callouts.GenRef_ends)
540    ElemNames = pablo.SpanUpTo(tag_Callouts.ElemName_starts, tag_Callouts.ElemName_ends)
541    AttNames = pablo.SpanUpTo(tag_Callouts.AttName_starts, tag_Callouts.AttName_ends)
542    qname_stream =     ElemNames | AttNames
543    ncname_stream = PI_names | GenRefs
544    name_stream = qname_stream | ncname_stream
545    name_start = name_stream &~ pablo.Advance(name_stream)
546    name_cursor = name_stream & ~pablo.Advance(name_stream)
547    void_prefix_err = name_cursor & lex.Colon
548    namespace_sep = pablo.ScanThru(name_cursor, lex.NameScan &~ lex.Colon) & lex.Colon
549    local_part_start = pablo.Advance(namespace_sep)
550    local_part_err = local_part_start &~ lex.NameScan
551    colon2_err = pablo.ScanThru(local_part_start, lex.NameScan &~ lex.Colon) & lex.Colon
552    ncname_err = ncname_stream & lex.Colon
553    pablo.assert_0(void_prefix_err | local_part_err | colon2_err | ncname_err, "name syntax error")
554       
555             
556    check_streams.non_ascii_name_starts = name_start &~lex.ASCII_name_start
557    check_streams.non_ascii_names = (name_stream &~ name_start) & ~lex.ASCII_name_char & ~u8.suffix
558   
559def Do_check_streams(marker, tag_Callouts, check_streams):
560    pablo.assert_0(marker.CD_closer & ~tag_Callouts.AttVal_spans, "Error: ]]> in text")
561    check_streams.tag_marks = tag_Callouts.EmptyTag_marks | tag_Callouts.ElemName_starts | tag_Callouts.EndTag_marks | tag_Callouts.AttName_starts
562    check_streams.name_follows = tag_Callouts.ElemName_ends | tag_Callouts.AttName_ends
563    check_streams.att_refs = tag_Callouts.AttVal_spans & marker.Ref_opener
564     
Note: See TracBrowser for help on using the repository browser.