source: proto/parabix2/parabix2_pablo.py @ 899

Last change on this file since 899 was 899, checked in by cameron, 8 years ago

Multiplex scope streams

File size: 18.9 KB
RevLine 
[757]1# -*- coding: utf-8 -*-
2#
3# parabix2_compilable.py
4#
5# Parallel XML Parsing with Bitstream Addition
[881]6#
[757]7# - Complete prototype for all bitstream computations in Parabix2
[881]8# - Optimized for compilation
9# - Separate compilation
10
[757]11# Robert D. Cameron
12# July 29, 2010
13#
14
15#import bitutil
16
class u8:
    """Holder for the UTF-8 classification and validation bit streams.

    Every field starts at 0.  Classify_bytes_Validate_utf8 assigns the
    streams it computes onto the object it is handed.
    """

    # Byte classes by position within a UTF-8 sequence.
    unibyte = 0
    prefix = 0
    prefix2 = 0
    prefix3 = 0
    prefix4 = 0
    suffix = 0
    badprefix = 0

    # NOTE(review): Classify_bytes_Validate_utf8 computes xE0, xED, xF0,
    # xF4 and xEF as local variables and does not store them here, so
    # these five fields keep their initial value in the visible code.
    xE0 = 0
    xED = 0
    xF0 = 0
    xF4 = 0

    # Suffix-byte sub-ranges.
    xA0_xBF = 0
    x80_x9F = 0
    x90_xBF = 0
    x80_x8F = 0
    xEF = 0
    xBF = 0
    xBE = 0

    # Scope streams: prefix streams advanced by one or more positions.
    scope22 = 0
    scope32 = 0
    scope33 = 0
    scope42 = 0
    scope43 = 0
    scope44 = 0
    xE0_scope = 0
    xED_scope = 0
    xF0_scope = 0
    xF4_scope = 0
    xEF_scope = 0

    FFFE_FFFF = 0
    error = 0
50
class Lex:
    """Holder for the lexical-item bit streams.

    Every field starts at 0.  Classify_bytes_Validate_utf8 assigns each
    field below except CRLF, which no visible code in this file writes.
    """

    # Control and whitespace characters.
    x00_x1F = 0
    CR = 0
    LF = 0
    HT = 0
    SP = 0
    CRLF = 0

    # Markup punctuation.
    RefStart = 0
    Semicolon = 0
    Colon = 0
    LAngle = 0
    RAngle = 0
    LBracket = 0
    RBracket = 0
    Exclam = 0
    QMark = 0
    Hyphen = 0
    Equals = 0
    SQuote = 0
    DQuote = 0
    Slash = 0
    Hash = 0
    x = 0

    # Character classes used when scanning names and numbers.
    ASCII_name_start = 0
    ASCII_name_char = 0
    NameScan = 0
    Digit = 0
    Hex = 0
    WS = 0
[757]80
class Scope1:
    """Holder for one-position-advanced copies of selected Lex streams.

    Every field starts at 0.  LAngle, Hyphen and QMark are written by
    Add_scope_streams, RBracket by Parse_CtCDPI and RefStart by Parse_refs.
    """

    RefStart = 0
    LAngle = 0
    Hyphen = 0
    QMark = 0
    RBracket = 0
[859]87
class CtCDPI_Callouts:
    """Holder for the comment / CDATA / processing-instruction callouts.

    Every field starts at 0; Parse_CtCDPI resets and then fills all of
    them.
    """

    CD_end = 0

    # Comment and CDATA boundaries.
    Ct_starts = 0
    Ct_ends = 0
    CD_starts = 0
    CD_ends = 0

    # Processing-instruction boundaries.
    PI_starts = 0
    PI_name_starts = 0
    PI_name_ends = 0
    PI_ends = 0

    CtCDPI_mask = 0
    error = 0
100
class Ref_Callouts:
    """Holder for the reference callout streams.

    Every field starts at 0; Parse_refs resets and then fills all of them.
    """

    # General references.
    GenRef_starts = 0
    GenRef_ends = 0

    # Decimal character references.
    DecRef_starts = 0
    DecRef_ends = 0

    # Hexadecimal character references.
    HexRef_starts = 0
    HexRef_ends = 0

    error = 0
109
class Tag_Callouts:
    """Holder for the tag-parsing callout streams.

    Every field starts at 0; Parse_tags assigns all of them.
    """

    # Element and attribute name boundaries.
    ElemName_starts = 0
    ElemName_ends = 0
    AttName_starts = 0
    AttName_ends = 0

    # Attribute value boundaries and the spans between them.
    AttVal_starts = 0
    AttVal_ends = 0
    AttVal_spans = 0

    EmptyTag_marks = 0
    EndTag_marks = 0
    LAngleFollow = 0
    error = 0
[881]122       
class Xml_char:
    """Holder for the character-level error stream (assigned in Main)."""

    error = 0
125       
class Basis_bits:
    """Holder for the eight basis bit streams read by the classifier.

    Every field starts at 0.  Classify_bytes_Validate_utf8 only reads
    these; it derives u8.unibyte as ~bit_0, i.e. bit_0 is treated as the
    high-order bit of each byte.
    """

    bit_0 = 0
    bit_1 = 0
    bit_2 = 0
    bit_3 = 0
    bit_4 = 0
    bit_5 = 0
    bit_6 = 0
    bit_7 = 0
135       
class Masks:
    """Holder for EOF_mask, which the parsing functions AND into their
    scan and result streams.  Starts at 0; nothing in this file writes it.
    """

    EOF_mask = 0
[757]138
class Post_process:
    """Holder for the streams handed on to post-processing.

    Every field starts at 0.  misc_mask is written by Parse_CtCDPI, the
    two non_ascii_* fields by Validate_xml_names, and the rest by Main.
    """

    misc_mask = 0
    non_ascii_name_starts = 0
    non_ascii_names = 0
    tag_marks = 0
    name_follows = 0
    att_refs = 0
    error_mask = 0
[757]147
class Xml_names:
    """Holder for the namespace error stream written by Validate_xml_names."""

    namespace_error = 0
150
def Classify_bytes_Validate_utf8(basis_bits, lex, u8):
    """Compute the lexical-item streams and validate UTF-8 in one pass.

    Reads basis_bits.bit_0 .. bit_7, stores one stream per lexical class
    on ``lex`` and stores the UTF-8 classification, scope and error
    streams on ``u8``.  The tempN equations are kept one-to-one with the
    original numbering so they can be cross-checked.

    The multi-byte branches run only when bit_0 is non-zero; they call
    bitutil.Advance, which must be available in the module namespace.
    """
    # Bind the basis streams once; they are only read below.
    b0 = basis_bits.bit_0
    b1 = basis_bits.bit_1
    b2 = basis_bits.bit_2
    b3 = basis_bits.bit_3
    b4 = basis_bits.bit_4
    b5 = basis_bits.bit_5
    b6 = basis_bits.bit_6
    b7 = basis_bits.bit_7

    # ---- Lexical item streams ----
    temp1 = b0 | b1
    temp2 = b2 & ~b3
    temp3 = temp2 & ~temp1
    temp4 = b5 & ~b4
    temp5 = b6 & ~b7
    temp6 = temp4 & temp5
    lex.RefStart = temp3 & temp6
    temp7 = b2 & b3
    temp8 = temp7 & ~temp1
    temp9 = b4 & ~b5
    temp10 = b6 & b7
    temp11 = temp9 & temp10
    lex.Semicolon = temp8 & temp11
    temp12 = b4 & b5
    temp13 = b6 | b7
    temp14 = temp12 & ~temp13
    lex.LAngle = temp8 & temp14
    temp15 = temp12 & temp5
    lex.RAngle = temp8 & temp15
    temp16 = b1 & ~b0
    temp17 = b3 & ~b2
    temp18 = temp16 & temp17
    lex.LBracket = temp18 & temp11
    temp19 = b7 & ~b6
    temp20 = temp12 & temp19
    lex.RBracket = temp18 & temp20
    temp21 = b4 | b5
    temp22 = temp19 & ~temp21
    lex.Exclam = temp3 & temp22
    temp23 = temp12 & temp10
    lex.QMark = temp8 & temp23
    lex.Hyphen = temp3 & temp20
    lex.Equals = temp8 & temp20
    temp24 = temp4 & temp10
    lex.SQuote = temp3 & temp24
    temp25 = temp5 & ~temp21
    lex.DQuote = temp3 & temp25
    lex.Slash = temp3 & temp23
    temp26 = temp10 & ~temp21
    lex.Hash = temp3 & temp26
    temp27 = temp16 & temp7
    temp28 = temp9 & ~temp13
    lex.x = temp27 & temp28
    temp29 = temp9 & temp5
    lex.Colon = temp8 & temp29
    temp30 = temp18 & temp23
    temp31 = temp30 | lex.Colon
    temp32 = temp16 & ~b2
    temp33 = b5 | temp10
    temp34 = b4 & temp33
    temp35 = ~temp34
    temp36 = temp21 | temp13
    # Bitwise select: temp35 where b3 is set, temp36 elsewhere.
    temp37 = (b3 & temp35) | (~b3 & temp36)
    temp38 = temp32 & temp37
    temp39 = temp31 | temp38
    temp40 = temp16 & b2
    temp41 = temp40 & temp37
    lex.ASCII_name_start = temp39 | temp41
    temp42 = temp30 | lex.Hyphen
    temp43 = temp3 & temp15
    temp44 = temp42 | temp43
    temp45 = temp8 & ~temp34
    temp46 = temp44 | temp45
    temp47 = temp46 | temp38
    lex.ASCII_name_char = temp47 | temp41
    # Every byte with bit_0 set also counts as scannable inside a name.
    lex.NameScan = lex.ASCII_name_char | b0
    temp48 = temp1 | b2
    lex.x00_x1F = ~temp48
    temp49 = b2 | b3
    temp50 = temp1 | temp49
    lex.CR = temp20 & ~temp50
    lex.LF = temp29 & ~temp50
    temp51 = temp9 & temp19
    lex.HT = temp51 & ~temp50
    lex.SP = temp3 & ~temp36
    temp52 = temp20 | temp29
    temp53 = temp52 | temp51
    temp54 = temp53 & ~temp50
    lex.WS = temp54 | lex.SP
    temp55 = b5 | b6
    temp56 = b4 & temp55
    lex.Digit = temp8 & ~temp56
    temp57 = temp16 & ~temp49
    temp58 = temp57 & ~b4
    temp59 = ~temp10
    # Bitwise select: temp59 where b5 is set, temp13 elsewhere.
    temp60 = (b5 & temp59) | (~b5 & temp13)
    temp61 = temp58 & temp60
    temp62 = lex.Digit | temp61
    temp63 = temp16 & temp2
    temp64 = temp63 & ~b4
    temp65 = temp64 & temp60
    lex.Hex = temp62 | temp65

    # ---- UTF-8 validation ----
    u8.unibyte = ~b0
    u8.suffix = 0
    u8.error = 0
    u8.FFFE_FFFF = 0
    any_scope = 0
    if b0:
        # At least one byte has bit_0 set, so multi-byte checks are needed.
        u8.prefix = b0 & b1
        u8.prefix2 = u8.prefix & ~b2
        u8.prefix3 = u8.prefix & temp2
        u8.prefix4 = u8.prefix & temp7
        u8.suffix = b0 & ~b1
        temp66 = u8.prefix & ~temp49
        temp67 = temp21 | b6
        temp68 = temp66 & ~temp67
        temp69 = b5 & temp13
        temp70 = b4 | temp69
        temp71 = u8.prefix4 & temp70
        u8.badprefix = temp68 | temp71
        u8.error = u8.badprefix
        u8.scope22 = bitutil.Advance(u8.prefix2)
        any_scope = u8.scope22
        if u8.prefix3 | u8.prefix4:
            # Three- and four-byte sequences: the lead-byte streams are
            # locals only; their advanced "scope" forms go onto u8.
            lead_E0 = u8.prefix3 & ~temp36
            lead_ED = u8.prefix3 & temp20
            lead_F0 = u8.prefix4 & ~temp36
            temp72 = temp4 & ~temp13
            lead_F4 = u8.prefix4 & temp72
            u8.xA0_xBF = u8.suffix & b2
            u8.x80_x9F = u8.suffix & ~b2
            u8.x90_xBF = u8.suffix & temp49
            u8.x80_x8F = u8.suffix & ~temp49
            lead_EF = u8.prefix3 & temp23
            temp73 = u8.suffix & temp7
            u8.xBF = temp73 & temp23
            u8.xBE = temp73 & temp15
            u8.xE0_scope = bitutil.Advance(lead_E0)
            u8.xED_scope = bitutil.Advance(lead_ED)
            u8.xF0_scope = bitutil.Advance(lead_F0)
            u8.xF4_scope = bitutil.Advance(lead_F4)
            u8.xEF_scope = bitutil.Advance(lead_EF)
            u8.scope32 = bitutil.Advance(u8.prefix3)
            u8.scope33 = bitutil.Advance(u8.scope32)
            u8.scope42 = bitutil.Advance(u8.prefix4)
            u8.scope43 = bitutil.Advance(u8.scope42)
            u8.scope44 = bitutil.Advance(u8.scope43)

            last_scope = u8.scope22 | u8.scope33 | u8.scope44
            any_scope = last_scope | u8.scope32 | u8.scope42 | u8.scope43

            # A suffix in the flagged sub-range right after these lead
            # bytes is an error.
            bad_after_E0 = u8.xE0_scope & u8.x80_x9F
            bad_after_ED = u8.xED_scope & u8.xA0_xBF
            bad_after_F0 = u8.xF0_scope & u8.x80_x8F
            bad_after_F4 = u8.xF4_scope & u8.x90_xBF
            u8.error |= bad_after_E0 | bad_after_ED | bad_after_F0 | bad_after_F4

            EF_BF_pending = bitutil.Advance(u8.xEF_scope & u8.xBF)
            u8.FFFE_FFFF = EF_BF_pending & (u8.xBE | u8.xBF)
    # A suffix byte must appear exactly where some scope expects one.
    scope_mismatch = any_scope ^ u8.suffix
    u8.error |= scope_mismatch
[881]307       
def Add_scope_streams(lex, scope1):
    """Store one-position-advanced LAngle, Hyphen and QMark on ``scope1``.

    The three results equal bitutil.Advance applied to lex.LAngle,
    lex.Hyphen and lex.QMark individually whenever those streams never
    share a position.  Two Advance calls are used instead of three by
    advancing the unions LAngle|Hyphen and Hyphen|QMark and then
    separating the results.
    """
    langle_or_hyphen = bitutil.Advance(lex.LAngle | lex.Hyphen)
    hyphen_or_qmark = bitutil.Advance(lex.Hyphen | lex.QMark)
    # Only in the first union -> LAngle; in both -> Hyphen;
    # only in the second -> QMark.
    scope1.LAngle = langle_or_hyphen & ~hyphen_or_qmark
    scope1.Hyphen = langle_or_hyphen & hyphen_or_qmark
    scope1.QMark = hyphen_or_qmark & ~langle_or_hyphen
[757]319
def Parse_CtCDPI(ctCDPI_Callouts, lex, scope1, masks, post_process):
    """Locate comments, CDATA sections and processing instructions.

    Resets and then fills the start/end callout streams, CtCDPI_mask and
    error on ``ctCDPI_Callouts``; sets scope1.RBracket when lex.RBracket
    is non-zero; and stores post_process.misc_mask.

    Reads scope1.LAngle, scope1.Hyphen and scope1.QMark, so those must be
    set beforehand (Main calls Add_scope_streams first).  Uses
    bitutil.Advance / bitutil.ScanThru from the module namespace.
    """
    ctCDPI_Callouts.CD_end = 0
    ctCDPI_Callouts.Ct_starts = 0
    ctCDPI_Callouts.Ct_ends = 0
    ctCDPI_Callouts.CD_starts = 0
    ctCDPI_Callouts.CD_ends = 0
    ctCDPI_Callouts.PI_starts = 0
    ctCDPI_Callouts.PI_name_starts = 0
    ctCDPI_Callouts.PI_name_ends = 0
    ctCDPI_Callouts.PI_ends = 0
    ctCDPI_Callouts.CtCDPI_mask = 0
    ctCDPI_Callouts.error = 0
    CtCDPI_starts = 0
    # Must be initialised: the loop accumulates into it with |=, which
    # otherwise raises UnboundLocalError on the first iteration.
    Ct_errors = 0

    if lex.RBracket:
        scope1.RBracket = bitutil.Advance(lex.RBracket)
        # Two consecutive ']' followed by '>'.
        ctCDPI_Callouts.CD_end = bitutil.Advance(scope1.RBracket & lex.RBracket) & lex.RAngle
    PI_start = scope1.LAngle & lex.QMark
    CtCD_start = scope1.LAngle & lex.Exclam
    CtCDPI_start = PI_start | CtCD_start

    DoubleHyphen = scope1.Hyphen & lex.Hyphen
    PI_end = scope1.QMark & lex.RAngle

    # Scanning streams: everything that is NOT the respective delimiter,
    # limited to positions inside EOF_mask.
    CtCDPI_scan = ~CtCDPI_start & masks.EOF_mask
    Ct_end_scan = ~DoubleHyphen & masks.EOF_mask
    CD_end_scan = ~ctCDPI_Callouts.CD_end & masks.EOF_mask
    PI_end_scan = ~PI_end & masks.EOF_mask

    # Initiate the scan from position 0 to the first markup start.
    CtCDPI_Cursor = 1
    CtCDPI_Cursor = bitutil.ScanThru(CtCDPI_Cursor, CtCDPI_scan)
    CtCDPI_Cursor &= masks.EOF_mask
    while CtCDPI_Cursor:
        CtCDPI_starts |= CtCDPI_Cursor
        PI_Cursor = CtCDPI_Cursor & PI_start
        CD_Ct_Cursor = bitutil.Advance(CtCDPI_Cursor & ~PI_Cursor)
        CD_Cursor = CD_Ct_Cursor & lex.LBracket
        Ct_Cursor = CD_Ct_Cursor & lex.Hyphen
        ctCDPI_Callouts.PI_starts |= PI_Cursor
        ctCDPI_Callouts.CD_starts |= CD_Cursor
        ctCDPI_Callouts.Ct_starts |= Ct_Cursor
        Ct_Cursor = bitutil.Advance(Ct_Cursor)
        # The position after the first '-' of a comment opener must be '-'.
        Ct_errors |= Ct_Cursor & ~lex.Hyphen
        Ct_Cursor = bitutil.Advance(Ct_Cursor)
        # Let the comment-end scan pass through the cursor position itself.
        Ct_end_scan |= Ct_Cursor
        PI_Cursor = bitutil.Advance(PI_Cursor)
        ctCDPI_Callouts.PI_name_starts |= PI_Cursor
        PI_name_end = bitutil.ScanThru(PI_Cursor, lex.NameScan)
        ctCDPI_Callouts.PI_name_ends |= PI_name_end
        PI_Cursor = bitutil.ScanThru(PI_name_end, PI_end_scan)
        CD_Cursor = bitutil.ScanThru(CD_Cursor, CD_end_scan)
        Ct_Cursor = bitutil.Advance(bitutil.ScanThru(Ct_Cursor, Ct_end_scan))
        ctCDPI_Callouts.PI_ends |= PI_Cursor
        ctCDPI_Callouts.CD_ends |= CD_Cursor
        ctCDPI_Callouts.Ct_ends |= Ct_Cursor
        CtCDPI_Cursor = PI_Cursor | CD_Cursor | Ct_Cursor
        CtCDPI_Cursor = bitutil.ScanThru(CtCDPI_Cursor, CtCDPI_scan)
        CtCDPI_Cursor &= masks.EOF_mask

        # Recomputed on every iteration; the values left after the final
        # iteration are the ones callers see.  If the loop body never
        # runs, the zeros assigned at the top remain.
        ctCDPI_Callouts.CtCDPI_mask = bitutil.Advance(ctCDPI_Callouts.CD_ends | ctCDPI_Callouts.Ct_ends | ctCDPI_Callouts.PI_ends) - CtCDPI_starts
        ctCDPI_Callouts.error = Ct_errors | ctCDPI_Callouts.Ct_ends & ~lex.RAngle
        ctCDPI_Callouts.error |= bitutil.Advance(ctCDPI_Callouts.PI_name_ends & ~lex.WS) & ~PI_end
        # A PI name that starts and ends at the same position is empty.
        ctCDPI_Callouts.error |= ctCDPI_Callouts.PI_name_starts & ctCDPI_Callouts.PI_name_ends
        # If any of the Comment, CDATA or PI markups are unterminated, it is an error.
        ctCDPI_Callouts.error |= ctCDPI_Callouts.CtCDPI_mask & ~masks.EOF_mask

    post_process.misc_mask = (lex.WS | lex.LAngle | (bitutil.Advance(ctCDPI_Callouts.Ct_ends | ctCDPI_Callouts.PI_ends) - (ctCDPI_Callouts.Ct_starts | ctCDPI_Callouts.PI_starts)) | CtCDPI_starts) & masks.EOF_mask
[757]392
def Parse_tags(lex, scope1, ctCDPI_Callouts, tag_Callouts, masks):
    """Parse start, empty-element and end tags in parallel.

    Fills every field of ``tag_Callouts`` (name / attribute / value
    boundaries, AttVal_spans, EmptyTag_marks, EndTag_marks, LAngleFollow
    and error).  Positions covered by ctCDPI_Callouts.CtCDPI_mask are
    excluded from tag parsing.  Uses bitutil.Advance / bitutil.ScanThru
    from the module namespace.
    """
    # Delimiters for scans.
    dquote_scan = ~(lex.DQuote | lex.LAngle) & masks.EOF_mask
    squote_scan = ~(lex.SQuote | lex.LAngle) & masks.EOF_mask
    attlist_delim = lex.Slash | lex.RAngle

    # Start the parallel parsing by inspecting the character after the
    # opening "<" of a tag.
    tag_Callouts.LAngleFollow = scope1.LAngle & ~ctCDPI_Callouts.CtCDPI_mask
    tag_Callouts.ElemName_starts = tag_Callouts.LAngleFollow & ~lex.Slash
    tag_Callouts.EndTag_marks = tag_Callouts.LAngleFollow & lex.Slash

    # --- Start tag / empty element tag parsing ---

    # Advance all cursors by scanning through the tag name.
    tag_Callouts.ElemName_ends = bitutil.ScanThru(tag_Callouts.ElemName_starts, lex.NameScan)
    # A legal start tag needs at least one name character; a name whose
    # start and end coincide is null and therefore an error.
    parse_error = tag_Callouts.ElemName_starts & tag_Callouts.ElemName_ends

    # Initialize the accumulators for attribute name and value positions.
    tag_Callouts.AttName_starts = 0
    tag_Callouts.AttName_ends = 0
    eq_to_check = 0
    tag_Callouts.AttVal_starts = 0
    att_val_ends = 0
    tag_Callouts.AttVal_ends = 0

    # After the element name, there may or may not be an attlist.
    after_ws = bitutil.ScanThru(tag_Callouts.ElemName_ends, lex.WS)
    attlist_end = after_ws & attlist_delim
    att_name_start = after_ws & ~attlist_delim
    # At least one WS character is required between ElemNames and AttNames.
    parse_error |= tag_Callouts.ElemName_ends & att_name_start

    # Each pass handles the next attribute of every start tag at once, so
    # the number of iterations is the maximum number of attributes found
    # in any one start tag, plus one.
    while att_name_start:
        tag_Callouts.AttName_starts |= att_name_start
        att_name_follow = bitutil.ScanThru(att_name_start, lex.NameScan)
        tag_Callouts.AttName_ends |= att_name_follow
        # Scan through WS to the expected '=' delimiter.
        eq_expected = bitutil.ScanThru(att_name_follow, lex.WS)
        eq_to_check |= eq_expected
        # Step over the '=' position itself plus any following WS.
        att_val_pos = bitutil.ScanThru(eq_expected, eq_expected | lex.WS)
        tag_Callouts.AttVal_starts |= att_val_pos
        dquote_val = att_val_pos & lex.DQuote
        squote_val = att_val_pos & lex.SQuote
        # Scan from the opening quote to the next quote of the same kind
        # (or a '<', which the scan streams also stop at).
        dquote_end = bitutil.ScanThru(dquote_val, dquote_val | dquote_scan)
        squote_end = bitutil.ScanThru(squote_val, squote_val | squote_scan)
        att_val_end = dquote_end | squote_end
        att_val_ends |= att_val_end
        att_val_follow = bitutil.Advance(att_val_end)
        tag_Callouts.AttVal_ends |= att_val_follow
        after_ws = bitutil.ScanThru(att_val_follow, lex.WS)
        attlist_end |= after_ws & attlist_delim
        att_name_start = after_ws & ~attlist_delim

    # No more attribute values to process once att_name_start is 0.
    # NOTE: stag_ends is computed but not used further in this function.
    stag_ends = attlist_end & lex.RAngle
    # Mark any "/" characters found as the ends of empty element tags.
    tag_Callouts.EmptyTag_marks = bitutil.Advance(attlist_end & lex.Slash)

    # --- Error checks ---
    # No intervening WS between a value and the next attribute name.
    parse_error |= tag_Callouts.AttVal_ends & tag_Callouts.AttName_starts
    # Null attribute name.
    parse_error |= tag_Callouts.AttName_starts & tag_Callouts.AttName_ends
    # '=' not found where expected.
    parse_error |= eq_to_check & ~lex.Equals
    # Values must open and close on a quote character.
    parse_error |= tag_Callouts.AttVal_starts & ~(lex.DQuote | lex.SQuote)
    parse_error |= att_val_ends & ~(lex.DQuote | lex.SQuote)
    # The '/' of an empty element tag must be followed by '>'.
    parse_error |= tag_Callouts.EmptyTag_marks & ~lex.RAngle

    # --- End tag parsing ---
    end_name_follow = bitutil.ScanThru(tag_Callouts.EndTag_marks, tag_Callouts.EndTag_marks | lex.NameScan)
    end_tag_ends = bitutil.ScanThru(end_name_follow, lex.WS)
    parse_error |= end_tag_ends & ~lex.RAngle
    tag_Callouts.error = parse_error

    # Attribute value spans
    tag_Callouts.AttVal_spans = tag_Callouts.AttVal_ends - tag_Callouts.AttVal_starts
476                       
def Parse_refs(lex, scope1, ctCDPI_Callouts, ref_Callouts):
    """Parse general, decimal and hexadecimal references.

    Resets every field of ``ref_Callouts`` to 0.  If any lex.RefStart
    position lies outside ctCDPI_Callouts.CtCDPI_mask, it also sets
    scope1.RefStart and fills the start/end/error streams; otherwise
    scope1 is left untouched.  Uses bitutil.Advance / bitutil.ScanThru
    only on that non-empty path.
    """
    ref_Callouts.GenRef_starts = 0
    ref_Callouts.GenRef_ends = 0
    ref_Callouts.DecRef_starts = 0
    ref_Callouts.DecRef_ends = 0
    ref_Callouts.HexRef_starts = 0
    ref_Callouts.HexRef_ends = 0
    ref_Callouts.error = 0

    # All remaining "&" must be reference start characters; parse them.
    amp_marks = lex.RefStart & ~ctCDPI_Callouts.CtCDPI_mask
    if amp_marks:
        scope1.RefStart = bitutil.Advance(amp_marks)
        hash_marks = scope1.RefStart & lex.Hash
        ref_Callouts.GenRef_starts = scope1.RefStart & ~lex.Hash
        after_hash = bitutil.Advance(hash_marks)
        x_marks = after_hash & lex.x
        ref_Callouts.DecRef_starts = after_hash & ~lex.x
        ref_Callouts.HexRef_starts = bitutil.Advance(x_marks)
        ref_Callouts.GenRef_ends = bitutil.ScanThru(ref_Callouts.GenRef_starts, lex.NameScan)
        ref_Callouts.DecRef_ends = bitutil.ScanThru(ref_Callouts.DecRef_starts, lex.Digit)
        ref_Callouts.HexRef_ends = bitutil.ScanThru(ref_Callouts.HexRef_starts, lex.Hex)

        # Error checks.
        # At least one digit required for DecRef, one hex digit for HexRef.
        missing_digit = ref_Callouts.DecRef_starts & ~lex.Digit
        missing_hex_digit = ref_Callouts.HexRef_starts & ~lex.Hex
        # Semicolon terminator required (also covers unterminated at EOF).
        all_ref_ends = ref_Callouts.GenRef_ends | ref_Callouts.DecRef_ends | ref_Callouts.HexRef_ends
        missing_semicolon = all_ref_ends & ~lex.Semicolon
        ref_Callouts.error = missing_digit | missing_hex_digit | missing_semicolon
[757]507
def Validate_xml_names(ctCDPI_Callouts, ref_Callouts, tag_Callouts, lex, u8, xml_names, post_process):
    """Check namespace well-formedness of names and flag non-ASCII names.

    Builds name spans as (ends - starts) from the PI-name, general
    reference, element-name and attribute-name callouts.  Stores
    xml_names.namespace_error, post_process.non_ascii_name_starts and
    post_process.non_ascii_names.  Uses bitutil.Advance /
    bitutil.ScanThru from the module namespace.
    """
    PI_names = ctCDPI_Callouts.PI_name_ends - ctCDPI_Callouts.PI_name_starts
    GenRefs = ref_Callouts.GenRef_ends - ref_Callouts.GenRef_starts
    ElemNames = tag_Callouts.ElemName_ends - tag_Callouts.ElemName_starts
    AttNames = tag_Callouts.AttName_ends - tag_Callouts.AttName_starts
    # Element and attribute names may carry one ':'; PI names and
    # general references may not (see ncname_err below).
    qname_stream = ElemNames | AttNames
    ncname_stream = PI_names | GenRefs
    name_stream = qname_stream | ncname_stream
    # First position of each name span.  Computed once; the scan cursor
    # starts from the same positions.
    name_start = name_stream & ~bitutil.Advance(name_stream)
    name_cursor = name_start
    # A name may not begin with ':'.
    void_prefix_err = name_cursor & lex.Colon
    # Scan through non-colon name positions; stopping on ':' marks a separator.
    namespace_sep = bitutil.ScanThru(name_cursor, lex.NameScan & ~lex.Colon) & lex.Colon
    local_part_start = bitutil.Advance(namespace_sep)
    # Something scannable as a name must follow the ':'.
    local_part_err = local_part_start & ~lex.NameScan
    # A second ':' within the same name is an error.
    colon2_err = bitutil.ScanThru(local_part_start, lex.NameScan & ~lex.Colon) & lex.Colon
    ncname_err = ncname_stream & lex.Colon
    xml_names.namespace_error = void_prefix_err | local_part_err | colon2_err | ncname_err

    post_process.non_ascii_name_starts = name_start & ~lex.ASCII_name_start
    post_process.non_ascii_names = (name_stream & ~name_start) & ~lex.ASCII_name_char & ~u8.suffix
528
529#def main(basis_bits, lex, u8, scope1, ctCDPI_Callouts, masks, post_process, tag_Callouts, ref_Callouts, xml_names):
def Main(basis_bits, lex, u8, xml_char, scope1, ctCDPI_Callouts, ref_Callouts, tag_Callouts, masks, xml_names, post_process):
    """Run the full pipeline over one set of basis bit streams.

    Calls, in order: Classify_bytes_Validate_utf8, Add_scope_streams,
    Parse_CtCDPI, Parse_tags, Parse_refs and Validate_xml_names; then
    consolidates the error streams into post_process.error_mask and
    stores post_process.tag_marks, name_follows and att_refs.  All
    results are written onto the objects passed in; nothing is returned.
    """
    # Classify bytes for UTF-8 processing, whitespace and control
    # processing and XML lexical analysis; validate UTF-8 multibyte
    # sequences and determine the UTF-8 scope streams.
    Classify_bytes_Validate_utf8(basis_bits, lex, u8)

    # Check for illegal control characters.
    illegal_controls = lex.x00_x1F & ~lex.WS & masks.EOF_mask
    xml_char.error = illegal_controls
    xml_char.error |= u8.FFFE_FFFF

    Add_scope_streams(lex, scope1)

    # Parse all comments, CDATA sections and processing instructions.
    Parse_CtCDPI(ctCDPI_Callouts, lex, scope1, masks, post_process)

    # All remaining '<' must be tag start characters; parse tags.
    Parse_tags(lex, scope1, ctCDPI_Callouts, tag_Callouts, masks)

    # All remaining '&' must be reference start characters; parse them.
    Parse_refs(lex, scope1, ctCDPI_Callouts, ref_Callouts)

    # Ensure that no occurrence of ]]> occurs outside of markup.
    markup_or_attval = ctCDPI_Callouts.CtCDPI_mask | tag_Callouts.AttVal_spans
    CD_end_error = ctCDPI_Callouts.CD_end & ~markup_or_attval

    # Validate XML namespaces and generate bit streams to post validate
    # non-ascii range XML names.
    Validate_xml_names(ctCDPI_Callouts, ref_Callouts, tag_Callouts, lex, u8, xml_names, post_process)

    # Consolidate and check for errors.
    post_process.error_mask = (
        u8.error
        | xml_char.error
        | ctCDPI_Callouts.error
        | tag_Callouts.error
        | CD_end_error
        | ref_Callouts.error
        | xml_names.namespace_error
    )

    post_process.tag_marks = (
        tag_Callouts.EmptyTag_marks
        | tag_Callouts.LAngleFollow
        | tag_Callouts.AttName_starts
    )
    post_process.name_follows = tag_Callouts.ElemName_ends | tag_Callouts.AttName_ends
    post_process.att_refs = tag_Callouts.AttVal_spans & scope1.RefStart
Note: See TracBrowser for help on using the repository browser.