source: proto/SymbolTable/parabix2_symtab_pbs_adv.py @ 1228

Last change on this file since 1228 was 1228, checked in by vla24, 8 years ago

Integrated symbol table with xmlwf. There are various implementations for the symbol table; please read /proto/SymbolTable/README_SymbolTable for more information.

File size: 22.7 KB
Line 
1# -*- coding: utf-8 -*-
2#
3# parabix2_compilable.py
4#
5# Parallel XML Parsing with Bitstream Addition
6#
7# - Complete prototype for all bitstream computations in Parabix2
8# - Optimized for compilation
9# - Separate compilation
10
11# Robert D. Cameron
12# July 29, 2010
13#
14
15#import bitutil
16
class u8:
    # UTF-8 classification and validation bit streams.
    # Every field starts out as the empty stream (0); the values are
    # filled in by Classify_bytes_Validate_utf8.

    # Byte classes.
    unibyte = 0      # bytes with bit_0 clear
    prefix = 0       # bytes with bit_0 and bit_1 set
    prefix2 = 0      # prefixes of 2-byte sequences
    prefix3 = 0      # prefixes of 3-byte sequences
    prefix4 = 0      # prefixes of 4-byte sequences
    suffix = 0       # bytes with bit_0 set and bit_1 clear
    badprefix = 0    # prefix bytes 0xC0, 0xC1 and 0xF5-0xFF

    # Individual prefix bytes.  NOTE(review): the visible code keeps these
    # four (and xEF below) in local variables and never writes the fields.
    xE0 = 0
    xED = 0
    xF0 = 0
    xF4 = 0

    # Suffix byte ranges used for the restricted-second-byte checks.
    xA0_xBF = 0
    x80_x9F = 0
    x90_xBF = 0
    x80_x8F = 0

    xEF = 0
    xBF = 0
    xBE = 0

    # scopeNM: position of the M-th byte of an N-byte sequence.
    scope22 = 0
    scope32 = 0
    scope33 = 0
    scope42 = 0
    scope43 = 0
    scope44 = 0

    # Position immediately following the named prefix byte.
    xE0_scope = 0
    xED_scope = 0
    xF0_scope = 0
    xF4_scope = 0
    xEF_scope = 0

    FFFE_FFFF = 0    # third byte of an EF BF BE / EF BF BF sequence
    error = 0        # accumulated UTF-8 validation errors
50
class Lex:
    # Lexical item bit streams: one stream per character class, all
    # initially empty (0).  Classify_bytes_Validate_utf8 assigns every
    # field except CRLF, which the visible code never writes.

    # Whitespace characters.
    CR = 0
    LF = 0
    HT = 0
    SP = 0
    CRLF = 0

    # Markup delimiters and punctuation.
    RefStart = 0     # '&'
    Semicolon = 0
    Colon = 0
    LAngle = 0
    RAngle = 0
    LBracket = 0
    RBracket = 0
    Exclam = 0
    QMark = 0
    Hyphen = 0
    Equals = 0
    SQuote = 0
    DQuote = 0
    Slash = 0
    Hash = 0
    x = 0            # the letter 'x' (0x78), used by hexadecimal references

    # Name character classes.
    ASCII_name_start = 0
    ASCII_name_char = 0
    NameScan = 0     # ASCII_name_char or any byte with bit_0 set

    # Digit classes and combined whitespace.
    Digit = 0
    Hex = 0
    WS = 0

    error = 0        # bytes 0x00-0x1F that are not whitespace
80
class Scope1:
    # "Scope" streams: each marks the position immediately after an
    # occurrence of the named lexical item.  All start empty (0).
    RefStart = 0     # written by Parse_refs
    LAngle = 0       # written by Add_scope_streams
    Hyphen = 0       # written by Add_scope_streams
    QMark = 0        # written by Add_scope_streams
    RBracket = 0     # written by Parse_CtCDPI
87
class CtCDPI_Callouts:
    # Callout streams for comments (Ct), CDATA sections (CD) and
    # processing instructions (PI).  All start empty (0) and are filled
    # in by Parse_CtCDPI.
    CD_end = 0           # candidate "]]>" terminators
    Ct_starts = 0
    Ct_ends = 0
    CD_starts = 0
    CD_ends = 0
    PI_starts = 0
    PI_name_starts = 0
    PI_name_ends = 0
    PI_ends = 0
    CtCDPI_mask = 0      # positions covered by comment/CDATA/PI markup
    error = 0
100
class Ref_Callouts:
    # Callout streams for general, decimal and hexadecimal references.
    # All start empty (0) and are filled in by Parse_refs.
    GenRef_starts = 0
    GenRef_ends = 0
    DecRef_starts = 0
    DecRef_ends = 0
    HexRef_starts = 0
    HexRef_ends = 0
    error = 0
109
class Hash_data:
    # Holder for the per-position hash bit stream computed by
    # Compute_Hash_Value_Bitstream.  Starts empty (0).
    Hash_value = 0
112
class Tag_Callouts:
    # Callout streams for start tags, empty-element tags and end tags.
    # All start empty (0).  Parse_tags fills in everything except the
    # ElemName_ends_<n> length groups, which are produced by
    # Form_Length_Group_Bitstreams.
    ElemName_starts = 0
    ElemName_ends = 0

    # Element-name end positions grouped by name length.
    ElemName_ends_1 = 0
    ElemName_ends_2 = 0
    ElemName_ends_3 = 0
    ElemName_ends_4 = 0
    ElemName_ends_5 = 0
    ElemName_ends_6 = 0
    ElemName_ends_7 = 0
    ElemName_ends_8 = 0
    ElemName_ends_9 = 0
    ElemName_ends_10 = 0
    ElemName_ends_11 = 0
    ElemName_ends_12 = 0
    ElemName_ends_13 = 0
    ElemName_ends_14 = 0
    ElemName_ends_15 = 0
    ElemName_ends_16 = 0
    ElemName_ends_17_and_longer = 0

    AttName_starts = 0
    AttName_ends = 0
    AttVal_starts = 0
    AttVal_ends = 0
    AttVal_spans = 0
    EmptyTag_marks = 0
    EndTag_marks = 0
    LAngleFollow = 0     # positions after '<' that lie outside the CtCDPI mask
    error = 0
142
class Basis_bits:
    # The eight basis bit streams, one per bit of each input byte.
    # The classification equations in Classify_bytes_Validate_utf8 treat
    # bit_0 as the most significant bit of a byte and bit_7 as the least
    # significant (e.g. a byte is ASCII exactly when bit_0 is clear).
    bit_0 = 0
    bit_1 = 0
    bit_2 = 0
    bit_3 = 0
    bit_4 = 0
    bit_5 = 0
    bit_6 = 0
    bit_7 = 0
152       
class Check_streams:
    # Streams handed on for post-processing and error checking.
    # All start empty (0).
    misc_mask = 0                # written by Parse_CtCDPI
    non_ascii_name_starts = 0    # written by Validate_xml_names
    non_ascii_names = 0          # written by Validate_xml_names
    tag_marks = 0                # written by Do_check_streams
    name_follows = 0             # written by Do_check_streams
    att_refs = 0                 # written by Do_check_streams
    error_mask = 0               # union of all error streams (Do_check_streams)
161
class Xml_names:
    # Result of namespace validation; written by Validate_xml_names.
    namespace_error = 0
164
def Classify_bytes_Validate_utf8(basis_bits, lex, u8):
    # Compute the lexical item streams and validate UTF-8 in one pass.
    #
    # basis_bits: object with bit_0 .. bit_7 (bit_0 is the high bit of a byte).
    # lex:        receives one stream per character class (see below).
    # u8:         receives the UTF-8 byte-class, scope and error streams.
    # Nothing is returned; results are stored on lex and u8.
    #
    # Byte values selected by the equations in the first half:
    #   RefStart 0x26 '&'    Semicolon 0x3B ';'   Colon 0x3A ':'
    #   LAngle 0x3C '<'      RAngle 0x3E '>'
    #   LBracket 0x5B '['    RBracket 0x5D ']'
    #   Exclam 0x21 '!'      QMark 0x3F '?'       Hyphen 0x2D '-'
    #   Equals 0x3D '='      SQuote 0x27 "'"      DQuote 0x22 '"'
    #   Slash 0x2F '/'       Hash 0x23 '#'        x 0x78 'x'
    #   CR 0x0D   LF 0x0A   HT 0x09   SP 0x20     Digit 0x30-0x39
    #
    # The tempN equations share common subexpressions and several of them
    # are reused by the UTF-8 section, so the statement order is significant.
    #
    # NOTE(review): `bitutil` is used below but its module-level import is
    # commented out in this file; presumably it is supplied by the
    # compilation toolchain - confirm.
    temp1 = (basis_bits.bit_0 | basis_bits.bit_1)
    temp2 = (basis_bits.bit_2 &~ basis_bits.bit_3)
    temp3 = (temp2 &~ temp1)
    temp4 = (basis_bits.bit_5 &~ basis_bits.bit_4)
    temp5 = (basis_bits.bit_6 &~ basis_bits.bit_7)
    temp6 = (temp4 & temp5)
    lex.RefStart = (temp3 & temp6)
    temp7 = (basis_bits.bit_2 & basis_bits.bit_3)
    temp8 = (temp7 &~ temp1)
    temp9 = (basis_bits.bit_4 &~ basis_bits.bit_5)
    temp10 = (basis_bits.bit_6 & basis_bits.bit_7)
    temp11 = (temp9 & temp10)
    lex.Semicolon = (temp8 & temp11)
    temp12 = (basis_bits.bit_4 & basis_bits.bit_5)
    temp13 = (basis_bits.bit_6 | basis_bits.bit_7)
    temp14 = (temp12 &~ temp13)
    lex.LAngle = (temp8 & temp14)
    temp15 = (temp12 & temp5)
    lex.RAngle = (temp8 & temp15)
    temp16 = (basis_bits.bit_1 &~ basis_bits.bit_0)
    temp17 = (basis_bits.bit_3 &~ basis_bits.bit_2)
    temp18 = (temp16 & temp17)
    lex.LBracket = (temp18 & temp11)
    temp19 = (basis_bits.bit_7 &~ basis_bits.bit_6)
    temp20 = (temp12 & temp19)
    lex.RBracket = (temp18 & temp20)
    temp21 = (basis_bits.bit_4 | basis_bits.bit_5)
    temp22 = (temp19 &~ temp21)
    lex.Exclam = (temp3 & temp22)
    temp23 = (temp12 & temp10)
    lex.QMark = (temp8 & temp23)
    lex.Hyphen = (temp3 & temp20)
    lex.Equals = (temp8 & temp20)
    temp24 = (temp4 & temp10)
    lex.SQuote = (temp3 & temp24)
    temp25 = (temp5 &~ temp21)
    lex.DQuote = (temp3 & temp25)
    lex.Slash = (temp3 & temp23)
    temp26 = (temp10 &~ temp21)
    lex.Hash = (temp3 & temp26)
    temp27 = (temp16 & temp7)
    temp28 = (temp9 &~ temp13)
    lex.x = (temp27 & temp28)
    temp29 = (temp9 & temp5)
    lex.Colon = (temp8 & temp29)
    temp30 = (temp18 & temp23)
    temp31 = (temp30 | lex.Colon)
    temp32 = (temp16 &~ basis_bits.bit_2)
    temp33 = (basis_bits.bit_5 | temp10)
    temp34 = (basis_bits.bit_4 & temp33)
    temp35 = (~temp34)
    temp36 = (temp21 | temp13)
    temp37 = ((basis_bits.bit_3 & temp35)|(~(basis_bits.bit_3) & temp36))
    temp38 = (temp32 & temp37)
    temp39 = (temp31 | temp38)
    temp40 = (temp16 & basis_bits.bit_2)
    temp41 = (temp40 & temp37)
    lex.ASCII_name_start = (temp39 | temp41)
    temp42 = (temp30 | lex.Hyphen)
    temp43 = (temp3 & temp15)
    temp44 = (temp42 | temp43)
    temp45 = (temp8 &~ temp34)
    temp46 = (temp44 | temp45)
    temp47 = (temp46 | temp38)
    lex.ASCII_name_char = (temp47 | temp41)
    # Any byte with the high bit set is scanned through as part of a name;
    # Validate_xml_names emits streams for post-validating such bytes.
    lex.NameScan = (lex.ASCII_name_char | basis_bits.bit_0)
    temp48 = (temp1 | basis_bits.bit_2)
    x00_x1F = (~temp48)
    temp49 = (basis_bits.bit_2 | basis_bits.bit_3)
    temp50 = (temp1 | temp49)
    lex.CR = (temp20 &~ temp50)
    lex.LF = (temp29 &~ temp50)
    temp51 = (temp9 & temp19)
    lex.HT = (temp51 &~ temp50)
    lex.SP = (temp3 &~ temp36)
    temp52 = (temp20 | temp29)
    temp53 = (temp52 | temp51)
    temp54 = (temp53 &~ temp50)
    lex.WS = (temp54 | lex.SP)
    temp55 = (basis_bits.bit_5 | basis_bits.bit_6)
    temp56 = (basis_bits.bit_4 & temp55)
    lex.Digit = (temp8 &~ temp56)
    temp57 = (temp16 &~ temp49)
    temp58 = (temp57 &~ basis_bits.bit_4)
    temp59 = (~temp10)
    temp60 = ((basis_bits.bit_5 & temp59)|(~(basis_bits.bit_5) & temp13))
    temp61 = (temp58 & temp60)
    temp62 = (lex.Digit | temp61)
    temp63 = (temp16 & temp2)
    temp64 = (temp63 &~ basis_bits.bit_4)
    temp65 = (temp64 & temp60)
    lex.Hex = (temp62 | temp65)
    # Control characters (0x00-0x1F) other than whitespace are illegal.
    lex.error = x00_x1F &~ lex.WS

    ### Validate_utf8(basis_bits, u8):
    u8.unibyte = (~basis_bits.bit_0)
    u8.suffix = 0
    u8.error = 0
    u8.FFFE_FFFF = 0
    u8anyscope = 0  # local
    # Skip all multibyte processing when no byte has its high bit set.
    if basis_bits.bit_0:
        u8.prefix = (basis_bits.bit_0 & basis_bits.bit_1)
        u8.prefix2 = (u8.prefix &~ basis_bits.bit_2)
        u8.prefix3 = (u8.prefix & temp2)
        u8.prefix4 = (u8.prefix & temp7)
        u8.suffix = (basis_bits.bit_0 &~ basis_bits.bit_1)
        # temp68 selects 0xC0/0xC1, temp71 selects 0xF5-0xFF.
        temp66 = (u8.prefix &~ temp49)
        temp67 = (temp21 | basis_bits.bit_6)
        temp68 = (temp66 &~ temp67)
        temp69 = (basis_bits.bit_5 & temp13)
        temp70 = (basis_bits.bit_4 | temp69)
        temp71 = (u8.prefix4 & temp70)
        u8.badprefix = (temp68 | temp71)
        u8.error = u8.badprefix
        u8.scope22 = bitutil.Advance(u8.prefix2)
        u8anyscope = u8.scope22
        # Three- and four-byte sequences need the extra second-byte checks.
        if u8.prefix3 | u8.prefix4:
            xE0 = (u8.prefix3 &~ temp36)
            xED = (u8.prefix3 & temp20)
            xF0 = (u8.prefix4 &~ temp36)
            temp72 = (temp4 &~ temp13)
            xF4 = (u8.prefix4 & temp72)
            u8.xA0_xBF = (u8.suffix & basis_bits.bit_2)
            u8.x80_x9F = (u8.suffix &~ basis_bits.bit_2)
            u8.x90_xBF = (u8.suffix & temp49)
            u8.x80_x8F = (u8.suffix &~ temp49)
            xEF = (u8.prefix3 & temp23)
            temp73 = (u8.suffix & temp7)
            u8.xBF = (temp73 & temp23)
            u8.xBE = (temp73 & temp15)
            u8.xE0_scope = bitutil.Advance(xE0)
            u8.xED_scope = bitutil.Advance(xED)
            u8.xF0_scope = bitutil.Advance(xF0)
            u8.xF4_scope = bitutil.Advance(xF4)
            u8.xEF_scope = bitutil.Advance(xEF)
            u8.scope32 = bitutil.Advance(u8.prefix3)
            u8.scope33 = bitutil.Advance(u8.scope32)
            u8.scope42 = bitutil.Advance(u8.prefix4)
            u8.scope43 = bitutil.Advance(u8.scope42)
            u8.scope44 = bitutil.Advance(u8.scope43)

            u8lastscope = u8.scope22 | u8.scope33 | u8.scope44
            u8anyscope = u8lastscope | u8.scope32 | u8.scope42 | u8.scope43

            # Second bytes that fall in the range excluded for their prefix.
            u8error1 = u8.xE0_scope & u8.x80_x9F
            u8error2 = u8.xED_scope & u8.xA0_xBF
            u8error3 = u8.xF0_scope & u8.x80_x8F
            u8error4 = u8.xF4_scope & u8.x90_xBF

            u8.error |= u8error1 | u8error2 | u8error3 | u8error4

            EF_BF_pending = bitutil.Advance(u8.xEF_scope & u8.xBF)

            u8.FFFE_FFFF = (EF_BF_pending & (u8.xBE | u8.xBF))
    # A suffix byte must appear exactly where some prefix's scope expects it.
    u8mismatch = u8anyscope ^ u8.suffix
    u8.error |= u8mismatch
322       
def Add_scope_streams(lex, scope1):
    # Compute scope1.LAngle, scope1.Hyphen and scope1.QMark, i.e. the
    # positions immediately following '<', '-' and '?'.
    #
    # The direct formulation would be three advances:
    #   scope1.LAngle = bitutil.Advance(lex.LAngle)
    #   scope1.Hyphen = bitutil.Advance(lex.Hyphen)
    #   scope1.QMark  = bitutil.Advance(lex.QMark)
    # The code below gets the same three streams from two advances by
    # advancing the overlapping unions (LAngle|Hyphen) and (Hyphen|QMark)
    # and separating the results with AND / AND-NOT.  This relies on the
    # three classes being mutually exclusive per byte and on Advance
    # distributing over '|' - presumably a plain one-position shift; confirm
    # against bitutil.
    v = lex.LAngle | lex.Hyphen
    w = lex.Hyphen | lex.QMark
    v1 = bitutil.Advance(v)
    w1 = bitutil.Advance(w)
    scope1.LAngle = v1 &~ w1     # follows '<' only
    scope1.Hyphen = v1 & w1      # in both unions: follows '-'
    scope1.QMark = w1 &~ v1      # follows '?' only
334
def Parse_CtCDPI(ctCDPI_Callouts, lex, scope1, check_streams):
    # Parse comments (Ct), CDATA sections (CD) and processing instructions
    # (PI), recording their start/end positions, the mask of positions they
    # cover and any errors on ctCDPI_Callouts.  Also sets
    # check_streams.misc_mask and, when a ']' is present, scope1.RBracket.
    #
    # NOTE(review): EOF_mask is read as a global but is not defined anywhere
    # in this file; presumably provided by the compilation toolchain - confirm.
    ctCDPI_Callouts.CD_end = 0
    ctCDPI_Callouts.Ct_starts = 0
    ctCDPI_Callouts.Ct_ends = 0
    ctCDPI_Callouts.CD_starts = 0
    ctCDPI_Callouts.CD_ends = 0
    ctCDPI_Callouts.PI_starts = 0
    ctCDPI_Callouts.PI_name_starts = 0
    ctCDPI_Callouts.PI_name_ends = 0
    ctCDPI_Callouts.PI_ends = 0
    ctCDPI_Callouts.CtCDPI_mask = 0
    ctCDPI_Callouts.error = 0
    CtCDPI_starts = 0
    Ct_errors = 0

    # "]]>" can only occur if there is at least one ']'.
    if lex.RBracket:
        scope1.RBracket = bitutil.Advance(lex.RBracket)
        ctCDPI_Callouts.CD_end = bitutil.Advance(scope1.RBracket & lex.RBracket) & lex.RAngle
    PI_start = scope1.LAngle & lex.QMark        # '?' of "<?"
    CtCD_start = scope1.LAngle & lex.Exclam     # '!' of "<!"
    CtCDPI_start = PI_start | CtCD_start

    DoubleHyphen = scope1.Hyphen & lex.Hyphen   # second '-' of "--"
    PI_end = scope1.QMark & lex.RAngle          # '>' of "?>"

    #
    # Initiate the scan
    CtCDPI_Cursor = bitutil.ScanToFirst(CtCDPI_start)
    while CtCDPI_Cursor:
        CtCDPI_starts |= CtCDPI_Cursor
        PI_Cursor = CtCDPI_Cursor & PI_start
        CD_Ct_Cursor = bitutil.Advance(CtCDPI_Cursor & ~PI_Cursor)
        CD_Cursor = CD_Ct_Cursor & lex.LBracket
        Ct_Cursor = CD_Ct_Cursor & lex.Hyphen
        ctCDPI_Callouts.PI_starts |= PI_Cursor
        ctCDPI_Callouts.CD_starts |= CD_Cursor
        ctCDPI_Callouts.Ct_starts |= Ct_Cursor
        Ct_Cursor = bitutil.Advance(Ct_Cursor)
        Ct_errors |= Ct_Cursor & ~ lex.Hyphen
        # Advance twice past <!--, so that we don't treat <!---
        # as being a terminated comment.
        Ct_Cursor = bitutil.Advance(bitutil.Advance(Ct_Cursor))
        PI_Cursor = bitutil.Advance(PI_Cursor)
        ctCDPI_Callouts.PI_name_starts |= PI_Cursor
        PI_name_end = bitutil.ScanThru(PI_Cursor, lex.NameScan)
        ctCDPI_Callouts.PI_name_ends |= PI_name_end
        PI_Cursor = bitutil.ScanTo(PI_name_end, PI_end)
        CD_Cursor = bitutil.ScanTo(CD_Cursor, ctCDPI_Callouts.CD_end)
        Ct_Cursor = bitutil.Advance(bitutil.ScanTo(Ct_Cursor, DoubleHyphen))
        ctCDPI_Callouts.PI_ends |= PI_Cursor
        ctCDPI_Callouts.CD_ends |= CD_Cursor
        ctCDPI_Callouts.Ct_ends |= Ct_Cursor
        CtCDPI_Cursor = PI_Cursor | CD_Cursor | Ct_Cursor
        CtCDPI_Cursor = bitutil.ScanTo(CtCDPI_Cursor, CtCDPI_start)

        # NOTE(review): the mask and error streams below are recomputed from
        # the accumulators on every iteration, so only the last iteration's
        # values survive.  They look hoistable out of the loop, but they are
        # left in place in case the compiled form depends on it - confirm.
        ctCDPI_Callouts.CtCDPI_mask = bitutil.Advance(ctCDPI_Callouts.CD_ends | ctCDPI_Callouts.Ct_ends | ctCDPI_Callouts.PI_ends) - CtCDPI_starts
        # A comment's "--" must be followed by '>', and "<!-" by a second '-'.
        ctCDPI_Callouts.error = Ct_errors | (ctCDPI_Callouts.Ct_ends & ~lex.RAngle)
        # A PI name not followed by whitespace must be followed directly by "?>".
        ctCDPI_Callouts.error |= bitutil.Advance(ctCDPI_Callouts.PI_name_ends & ~ lex.WS) & ~ PI_end
        # Empty PI name.
        ctCDPI_Callouts.error |= ctCDPI_Callouts.PI_name_starts & ctCDPI_Callouts.PI_name_ends
        # If any of the Comment, CDATA or PI markups are unterminated, it is an error.
        ctCDPI_Callouts.error |= ctCDPI_Callouts.CtCDPI_mask &~ EOF_mask

    # Spans of comments and PIs only (CDATA sections are excluded here).
    Ct_PI_spans = (bitutil.Advance(ctCDPI_Callouts.Ct_ends | ctCDPI_Callouts.PI_ends)
                   - (ctCDPI_Callouts.Ct_starts | ctCDPI_Callouts.PI_starts))
    check_streams.misc_mask = (lex.WS | lex.LAngle | Ct_PI_spans | CtCDPI_starts) & EOF_mask
400
def Parse_tags(lex, scope1, ctCDPI_Callouts, tag_Callouts):
    # Parse start tags, empty-element tags and end tags in parallel and
    # record element/attribute name and value positions, tag marks and
    # errors on tag_Callouts.  Every '<' outside the comment/CDATA/PI mask
    # is treated as the start of a tag.

    # Delimiters for scans.
    DQuoteDelim = lex.DQuote | lex.LAngle
    SQuoteDelim = lex.SQuote | lex.LAngle
    AttListDelim = lex.Slash | lex.RAngle

    # Start the parallel parsing by inspecting the character
    # after the opening "<" of a tag.
    tag_Callouts.LAngleFollow = scope1.LAngle &~ ctCDPI_Callouts.CtCDPI_mask
    tag_Callouts.ElemName_starts = tag_Callouts.LAngleFollow & ~lex.Slash
    tag_Callouts.EndTag_marks = tag_Callouts.LAngleFollow & lex.Slash

    # Start Tag/Empty Element Tag Parsing

    # Advance all cursors by scanning through the tag name.
    tag_Callouts.ElemName_ends = bitutil.ScanThru(tag_Callouts.ElemName_starts, lex.NameScan)
    # Must have at least one name character for a legal start tag.
    # Mark any occurrences of null names as errors.
    ParseError = tag_Callouts.ElemName_starts & tag_Callouts.ElemName_ends

    # Initialize the accumulators for attribute name and value positions.
    tag_Callouts.AttName_starts = 0
    tag_Callouts.AttName_ends = 0
    EqToCheck = 0
    tag_Callouts.AttVal_starts = 0
    AttValEnds = 0
    tag_Callouts.AttVal_ends = 0

    # After the element name, there may or may not be an attlist.
    AfterWS = bitutil.ScanThru(tag_Callouts.ElemName_ends, lex.WS)
    AttListEnd = AfterWS & AttListDelim
    AttNameStart = AfterWS & ~AttListDelim
    # At least one WS character is required between ElemNames and AttNames.
    ParseError |= tag_Callouts.ElemName_ends & AttNameStart

    #
    # The following loop iterates through attributes within a start tag.
    # Because all start tags are processed in parallel, the number of
    # iterations is the maximum number of attributes found in any one
    # start tag, plus one.
    while AttNameStart:
        tag_Callouts.AttName_starts |= AttNameStart
        AttNameFollow = bitutil.ScanThru(AttNameStart, lex.NameScan)
        tag_Callouts.AttName_ends |= AttNameFollow
        # Scan through WS to the expected '=' delimiter.
        EqExpected = bitutil.ScanThru(AttNameFollow, lex.WS)
        EqToCheck |= EqExpected
        # Step over the '=' position itself and any whitespace after it.
        AttValPos = bitutil.ScanThru(EqExpected, EqExpected | lex.WS)
        tag_Callouts.AttVal_starts |= AttValPos
        DQuoteAttVal = AttValPos & lex.DQuote
        SQuoteAttVal = AttValPos & lex.SQuote
        # Scan to the matching closing quote (or a '<', which is checked as
        # an error below); the opening quote itself is excluded as a target.
        DQuoteAttEnd = bitutil.ScanTo(DQuoteAttVal, DQuoteDelim &~ DQuoteAttVal)
        SQuoteAttEnd = bitutil.ScanTo(SQuoteAttVal, SQuoteDelim &~ SQuoteAttVal)
        AttValEnd = DQuoteAttEnd | SQuoteAttEnd
        AttValEnds |= AttValEnd
        AttValFollow = bitutil.Advance(AttValEnd)
        tag_Callouts.AttVal_ends |= AttValFollow
        AfterWS = bitutil.ScanThru(AttValFollow, lex.WS)
        AttListEnd |= AfterWS & AttListDelim
        AttNameStart = AfterWS & ~AttListDelim

    # No more attribute values to process when AttNameStart == 0.
    STagEnds = AttListEnd & lex.RAngle
    # Mark any "/" characters found as the ends of empty element tags.
    tag_Callouts.EmptyTag_marks = bitutil.Advance(AttListEnd & lex.Slash)

    # Check for errors.
    ParseError |= tag_Callouts.AttVal_ends & tag_Callouts.AttName_starts # No intervening WS.
    ParseError |= tag_Callouts.AttName_starts & tag_Callouts.AttName_ends # Null AttName
    ParseError |= EqToCheck & ~lex.Equals # = not found where expected.
    ParseError |= tag_Callouts.AttVal_starts & ~ (lex.DQuote | lex.SQuote)
    ParseError |= AttValEnds & ~ (lex.DQuote | lex.SQuote)
    ParseError |= tag_Callouts.EmptyTag_marks & ~lex.RAngle

    # End Tag Parsing
    EndTagEnds = bitutil.ScanThru(bitutil.ScanThru(tag_Callouts.EndTag_marks, tag_Callouts.EndTag_marks | lex.NameScan), lex.WS)
    ParseError |= EndTagEnds & ~lex.RAngle
    tag_Callouts.error = ParseError

    # Attribute value spans
    tag_Callouts.AttVal_spans = tag_Callouts.AttVal_ends - tag_Callouts.AttVal_starts
484                       
def Parse_refs(lex, scope1, ctCDPI_Callouts, ref_Callouts):
    # Parse general (&name;), decimal (&#n;) and hexadecimal (&#xh;)
    # references, recording start/end positions and errors on ref_Callouts
    # and the position after each '&' on scope1.RefStart.
    ref_Callouts.GenRef_starts = 0
    ref_Callouts.GenRef_ends = 0
    ref_Callouts.DecRef_starts = 0
    ref_Callouts.DecRef_ends = 0
    ref_Callouts.HexRef_starts = 0
    ref_Callouts.HexRef_ends = 0
    ref_Callouts.error = 0
    # Reset alongside the other outputs: Do_check_streams reads
    # scope1.RefStart unconditionally, so a value left over from an earlier
    # call must not survive when this input contains no references.
    scope1.RefStart = 0

    Ref1 = lex.RefStart &~ ctCDPI_Callouts.CtCDPI_mask
    # All remaining "&" must be reference start characters; parse them.
    if Ref1:
        scope1.RefStart = bitutil.Advance(Ref1)
        NumRef2 = scope1.RefStart & lex.Hash
        ref_Callouts.GenRef_starts = scope1.RefStart &~ lex.Hash
        NumRef3 = bitutil.Advance(NumRef2)
        HexRef3 = NumRef3 & lex.x
        ref_Callouts.DecRef_starts = NumRef3 &~ lex.x
        ref_Callouts.HexRef_starts = bitutil.Advance(HexRef3)
        ref_Callouts.GenRef_ends = bitutil.ScanThru(ref_Callouts.GenRef_starts, lex.NameScan)
        ref_Callouts.DecRef_ends = bitutil.ScanThru(ref_Callouts.DecRef_starts, lex.Digit)
        ref_Callouts.HexRef_ends = bitutil.ScanThru(ref_Callouts.HexRef_starts, lex.Hex)
        # Error checks
        # At least one digit required for DecRef, one hex digit for HexRef.
        ref_error1 = ref_Callouts.DecRef_starts &~ lex.Digit
        ref_error2 = ref_Callouts.HexRef_starts &~ lex.Hex
        # Semicolon terminator required (also covers unterminated at EOF).
        ref_ends = ref_Callouts.GenRef_ends | ref_Callouts.DecRef_ends | ref_Callouts.HexRef_ends
        ref_error3 = ref_ends &~ lex.Semicolon
        ref_Callouts.error = ref_error1 | ref_error2 | ref_error3
515
def Validate_xml_names(ctCDPI_Callouts, ref_Callouts, tag_Callouts, lex, u8, xml_names, check_streams):
    # Check namespace well-formedness of names (colon placement) and emit
    # the streams needed to post-validate names containing non-ASCII bytes.
    # Writes xml_names.namespace_error, check_streams.non_ascii_name_starts
    # and check_streams.non_ascii_names.

    # Spans covering each kind of name (end marker minus start marker).
    PI_names = ctCDPI_Callouts.PI_name_ends - ctCDPI_Callouts.PI_name_starts
    GenRefs = ref_Callouts.GenRef_ends - ref_Callouts.GenRef_starts
    ElemNames = tag_Callouts.ElemName_ends - tag_Callouts.ElemName_starts
    AttNames = tag_Callouts.AttName_ends - tag_Callouts.AttName_starts
    qname_stream = ElemNames | AttNames      # may contain one prefix colon
    ncname_stream = PI_names | GenRefs       # may not contain any colon
    name_stream = qname_stream | ncname_stream
    # First position of every name.  The same stream serves both as the
    # name-start marker and as the initial scanning cursor, so it is
    # computed once.
    name_start = name_stream &~ bitutil.Advance(name_stream)
    name_cursor = name_start
    # A name may not begin with a colon.
    void_prefix_err = name_cursor & lex.Colon
    # Scan to the first colon, if any, then require a local part after it
    # that contains no further colon.
    namespace_sep = bitutil.ScanThru(name_cursor, lex.NameScan &~ lex.Colon) & lex.Colon
    local_part_start = bitutil.Advance(namespace_sep)
    local_part_err = local_part_start &~ lex.NameScan
    colon2_err = bitutil.ScanThru(local_part_start, lex.NameScan &~ lex.Colon) & lex.Colon
    ncname_err = ncname_stream & lex.Colon
    xml_names.namespace_error = void_prefix_err | local_part_err | colon2_err | ncname_err

    check_streams.non_ascii_name_starts = name_start &~lex.ASCII_name_start
    check_streams.non_ascii_names = (name_stream &~ name_start) & ~lex.ASCII_name_char & ~u8.suffix
536   
def Do_check_streams(ctCDPI_Callouts, tag_Callouts, lex, u8, scope1, ref_Callouts, xml_names, check_streams):
    # Consolidate all error streams into check_streams.error_mask and
    # build the tag_marks, name_follows and att_refs streams.
    #
    # NOTE(review): EOF_mask is read as a global but is not defined anywhere
    # in this file; presumably provided by the compilation toolchain - confirm.

    # Ensure that no occurrence of ]]> occurs outside of markup.
    CD_end_error = ctCDPI_Callouts.CD_end & ~(ctCDPI_Callouts.CtCDPI_mask | tag_Callouts.AttVal_spans)

    # Consolidate and check for errors.  Only lex.error is limited to
    # EOF_mask; '&' binds tighter than '|', made explicit here.
    check_streams.error_mask = ((lex.error & EOF_mask)
                                | u8.error
                                | u8.FFFE_FFFF
                                | ctCDPI_Callouts.error
                                | tag_Callouts.error
                                | CD_end_error
                                | ref_Callouts.error
                                | xml_names.namespace_error)

    check_streams.tag_marks = tag_Callouts.EmptyTag_marks | tag_Callouts.LAngleFollow | tag_Callouts.AttName_starts
    check_streams.name_follows = tag_Callouts.ElemName_ends | tag_Callouts.AttName_ends
    # Reference starts that fall inside attribute values.
    check_streams.att_refs = tag_Callouts.AttVal_spans & scope1.RefStart
547
def Form_Length_Group_Bitstreams(tag_Callouts):
    # Partition tag_Callouts.ElemName_ends into the length-group streams
    # ElemName_ends_1 .. ElemName_ends_16 and ElemName_ends_17_and_longer.
    #
    # Each group takes the still-unclaimed end markers selected by its
    # candidate stream and removes them from remaining_ends, so every end
    # marker lands in at most one numbered group; whatever is left goes to
    # the "17 and longer" group.
    #
    # NOTE(review): interpose32 and bitutil.Advance32 are not defined in
    # this file.  From the way the results are used, interpose32(s, s32, k)
    # presumably yields the stream s moved forward by k positions, with s32
    # supplying bits that cross a 32-bit boundary - confirm.
    # The groups are left unrolled rather than generated in a loop.

    remaining_starts = tag_Callouts.ElemName_starts  # not used below
    remaining_ends = tag_Callouts.ElemName_ends
    temp = tag_Callouts.ElemName_starts
    temp32 = bitutil.Advance32(temp)

    # Group symbols of length 1
    tag_Callouts.ElemName_ends_1 = interpose32(temp, temp32, 1) & remaining_ends
    remaining_ends = remaining_ends & ~tag_Callouts.ElemName_ends_1

    # Group symbols of length 2
    tag_Callouts.ElemName_ends_2 = interpose32(temp, temp32, 2) & remaining_ends
    remaining_ends = remaining_ends & ~tag_Callouts.ElemName_ends_2

    # Group symbols of length 3
    tag_Callouts.ElemName_ends_3 = interpose32(temp, temp32, 3) & remaining_ends
    remaining_ends = remaining_ends & ~tag_Callouts.ElemName_ends_3

    # Group symbols of length 4
    tag_Callouts.ElemName_ends_4 = interpose32(temp, temp32, 4) & remaining_ends
    remaining_ends = remaining_ends & ~tag_Callouts.ElemName_ends_4

    # Group symbols of length 5
    tag_Callouts.ElemName_ends_5 = interpose32(temp, temp32, 5) & remaining_ends
    remaining_ends = remaining_ends & ~tag_Callouts.ElemName_ends_5

    # Group symbols of length 6
    tag_Callouts.ElemName_ends_6 = interpose32(temp, temp32, 6) & remaining_ends
    remaining_ends = remaining_ends & ~tag_Callouts.ElemName_ends_6

    # Group symbols of length 7
    tag_Callouts.ElemName_ends_7 = interpose32(temp, temp32, 7) & remaining_ends
    remaining_ends = remaining_ends & ~tag_Callouts.ElemName_ends_7

    # Group symbols of length 8
    tag_Callouts.ElemName_ends_8 = interpose32(temp, temp32, 8) & remaining_ends
    remaining_ends = remaining_ends & ~tag_Callouts.ElemName_ends_8

    # Group symbols of length 9
    tag_Callouts.ElemName_ends_9 = interpose32(temp, temp32, 9) & remaining_ends
    remaining_ends = remaining_ends & ~tag_Callouts.ElemName_ends_9

    # Group symbols of length 10
    tag_Callouts.ElemName_ends_10 = interpose32(temp, temp32, 10) & remaining_ends
    remaining_ends = remaining_ends & ~tag_Callouts.ElemName_ends_10

    # Group symbols of length 11
    tag_Callouts.ElemName_ends_11 = interpose32(temp, temp32, 11) & remaining_ends
    remaining_ends = remaining_ends & ~tag_Callouts.ElemName_ends_11

    # Group symbols of length 12
    tag_Callouts.ElemName_ends_12 = interpose32(temp, temp32, 12) & remaining_ends
    remaining_ends = remaining_ends & ~tag_Callouts.ElemName_ends_12

    # Group symbols of length 13
    tag_Callouts.ElemName_ends_13 = interpose32(temp, temp32, 13) & remaining_ends
    remaining_ends = remaining_ends & ~tag_Callouts.ElemName_ends_13

    # Group symbols of length 14
    tag_Callouts.ElemName_ends_14 = interpose32(temp, temp32, 14) & remaining_ends
    remaining_ends = remaining_ends & ~tag_Callouts.ElemName_ends_14

    # Group symbols of length 15
    temp15 = interpose32(temp, temp32, 15)
    tag_Callouts.ElemName_ends_15 = temp15 & remaining_ends
    remaining_ends = remaining_ends & ~tag_Callouts.ElemName_ends_15

    # Group symbols of length 16
    # The length-16 candidates are derived by advancing the length-15
    # candidates once instead of calling interpose32 again; note that this
    # rebinds temp, which no longer holds ElemName_starts from here on.
    temp = bitutil.Advance(temp15)
    tag_Callouts.ElemName_ends_16 = temp & remaining_ends
    remaining_ends = remaining_ends & ~tag_Callouts.ElemName_ends_16

    # Group symbols of length 17 and longer
    tag_Callouts.ElemName_ends_17_and_longer = remaining_ends
623
624
def Compute_Hash_Value_Bitstream(hash_data, basis_bits):
    # Store on hash_data.Hash_value a one-bit-per-byte hash stream: the XOR
    # of basis bits 2, 4 and 6.
    # (An alternative using bits 3, 5 and 7 was tried and left disabled.)
    hash_data.Hash_value = basis_bits.bit_2 ^ basis_bits.bit_4 ^ basis_bits.bit_6
628
629#def main(basis_bits, lex, u8, scope1, ctCDPI_Callouts, masks, check_streams, tag_Callouts, ref_Callouts, xml_names):
def Main(basis_bits, lex, u8, xml_char, scope1, ctCDPI_Callouts, ref_Callouts, tag_Callouts, masks, xml_names, check_streams, hash_data):
    # Run the full bit stream pipeline over one set of basis bit streams.
    # Each stage stores its results on the structure objects passed in;
    # nothing is returned.  The stages must run in this order because each
    # one reads streams produced by the stages before it.
    #
    # NOTE(review): the xml_char and masks parameters are not used by any
    # stage in this function; they are kept so the signature is unchanged.

    # Classify bytes for UTF-8 processing, whitespace and control
    # processing and XML lexical analysis, then validate UTF-8 multibyte
    # sequences and determine the UTF-8 scope streams.  Both steps are
    # fused into a single function.
    Classify_bytes_Validate_utf8(basis_bits, lex, u8)

    Add_scope_streams(lex, scope1)

    # Parse all comments, CDATA sections and processing instructions.
    Parse_CtCDPI(ctCDPI_Callouts, lex, scope1, check_streams)

    # All remaining '<' must be tag start characters; parse tags.
    Parse_tags(lex, scope1, ctCDPI_Callouts, tag_Callouts)

    # All remaining '&' must be reference start characters; parse them.
    Parse_refs(lex, scope1, ctCDPI_Callouts, ref_Callouts)

    # Validate XML namespaces and generate bit streams to post validate non-ascii range XML names
    Validate_xml_names(ctCDPI_Callouts, ref_Callouts, tag_Callouts, lex, u8, xml_names, check_streams)

    Do_check_streams(ctCDPI_Callouts, tag_Callouts, lex, u8, scope1, ref_Callouts, xml_names, check_streams)

    # These methods are needed to do Parallel Bitstream Based Length Sorting
    Form_Length_Group_Bitstreams(tag_Callouts)

    Compute_Hash_Value_Bitstream(hash_data, basis_bits)
662               
Note: See TracBrowser for help on using the repository browser.