source: proto/SymbolTable/parabix2_symtab_pbs_div.py @ 1387

Last change on this file since 1387 was 1387, checked in by vla24, 8 years ago

Symbol Table: Implemented division by 2 grouping strategy

File size: 21.3 KB
Line 
# -*- coding: utf-8 -*-
#
# parabix2_compilable.py
#
# Parallel XML Parsing with Bitstream Addition
#
# - Complete prototype for all bitstream computations in Parabix2
# - Optimized for compilation
# - Separate compilation

# Robert D. Cameron
# July 29, 2010
#

#import bitutil
16
class u8:
    # Record of UTF-8 related streams.  Every field defaults to 0;
    # Classify_bytes_Validate_utf8 assigns many of them.

    # Byte-class streams.
    unibyte = 0
    prefix = 0
    prefix2 = 0
    prefix3 = 0
    prefix4 = 0
    suffix = 0
    badprefix = 0

    # Streams for specific prefix bytes.
    xE0 = 0
    xED = 0
    xF0 = 0
    xF4 = 0

    # Suffix-byte range streams.
    xA0_xBF = 0
    x80_x9F = 0
    x90_xBF = 0
    x80_x8F = 0

    # Streams used by the EF BF BE / EF BF BF check.
    xEF = 0
    xBF = 0
    xBE = 0

    # Scope streams (prefix streams advanced by one or more positions).
    scope22 = 0
    scope32 = 0
    scope33 = 0
    scope42 = 0
    scope43 = 0
    scope44 = 0
    xE0_scope = 0
    xED_scope = 0
    xF0_scope = 0
    xF4_scope = 0
    xEF_scope = 0
47
class Lex:
    # Record of lexical item streams, one field per character class.
    # Every field defaults to 0.

    # Whitespace-related characters.
    CR = 0
    LF = 0
    HT = 0
    SP = 0
    CRLF = 0

    # Markup punctuation.
    RefStart = 0
    Semicolon = 0
    Colon = 0
    LAngle = 0
    RAngle = 0
    LBracket = 0
    RBracket = 0
    Exclam = 0
    QMark = 0
    Hyphen = 0
    Equals = 0
    SQuote = 0
    DQuote = 0
    Slash = 0
    Hash = 0
    x = 0

    # Name, digit and whitespace classes.
    ASCII_name_start = 0
    ASCII_name_char = 0
    NameScan = 0
    Digit = 0
    Hex = 0
    WS = 0
76
class Scope1:
    # Record of "one position after" streams for selected lexical items.
    # Every field defaults to 0.
    RefStart = 0
    LAngle = 0
    Hyphen = 0
    QMark = 0
    RBracket = 0
83
class CtCDPI_Callouts:
    # Record of callout streams for comments (Ct), CDATA sections (CD)
    # and processing instructions (PI).  Every field defaults to 0.
    CD_end = 0

    # Comment and CDATA boundaries.
    Ct_starts = 0
    Ct_ends = 0
    CD_starts = 0
    CD_ends = 0

    # Processing instruction boundaries.
    PI_starts = 0
    PI_name_starts = 0
    PI_name_ends = 0
    PI_ends = 0

    # Combined mask over the three kinds of markup.
    CtCDPI_mask = 0
95
class Ref_Callouts:
    # Record of start/end callout streams for general, decimal and
    # hexadecimal references.  Every field defaults to 0.
    GenRef_starts = 0
    GenRef_ends = 0
    DecRef_starts = 0
    DecRef_ends = 0
    HexRef_starts = 0
    HexRef_ends = 0
103
class Hash_data:
    # Record holding the hash-value stream; defaults to 0.
    Hash_value = 0
106
class Tag_Callouts:
    # Record of callout streams produced while parsing tags.
    # Every field defaults to 0.

    # Element name boundaries.
    ElemName_starts = 0
    ElemName_ends = 0

    # Element name ends grouped by name length in pairs (1-2, 3-4, ...),
    # plus a stream for names longer than 16.
    ElemName_ends_1_to_2 = 0
    ElemName_ends_3_to_4 = 0
    ElemName_ends_5_to_6 = 0
    ElemName_ends_7_to_8 = 0
    ElemName_ends_9_to_10 = 0
    ElemName_ends_11_to_12 = 0
    ElemName_ends_13_to_14 = 0
    ElemName_ends_15_to_16 = 0
    ElemName_remaining_ends = 0

    # Attribute name and value boundaries.
    AttName_starts = 0
    AttName_ends = 0
    AttVal_starts = 0
    AttVal_ends = 0
    AttVal_spans = 0

    # Empty-element and end-tag markers.
    EmptyTag_marks = 0
    EndTag_marks = 0
126
class Basis_bits:
    # Record of the eight basis bit streams, bit_0 through bit_7.
    # Every field defaults to 0.
    bit_0 = 0
    bit_1 = 0
    bit_2 = 0
    bit_3 = 0
    bit_4 = 0
    bit_5 = 0
    bit_6 = 0
    bit_7 = 0
136
class Check_streams:
    # Record of streams handed on for later checking.
    # Every field defaults to 0.
    misc_mask = 0
    non_ascii_name_starts = 0
    non_ascii_names = 0
    tag_marks = 0
    name_follows = 0
    att_refs = 0
144
class Xml_names:
    # Record holding the namespace error stream; defaults to 0.
    namespace_error = 0
147
def Classify_bytes_Validate_utf8(basis_bits, lex, u8):
    # Compute the lexical item streams in `lex` and the UTF-8 streams in
    # `u8` from the eight basis bit streams, reporting illegal control
    # characters and UTF-8 errors through error_tracker.NoteError.
    #
    # The tempNN values are shared sub-expressions of the boolean network;
    # their numbering and order are kept as in the original.

    # --- Lexical item classification -----------------------------------
    temp1 = basis_bits.bit_0 | basis_bits.bit_1
    temp2 = basis_bits.bit_2 & ~basis_bits.bit_3
    temp3 = temp2 & ~temp1
    temp4 = basis_bits.bit_5 & ~basis_bits.bit_4
    temp5 = basis_bits.bit_6 & ~basis_bits.bit_7
    temp6 = temp4 & temp5
    lex.RefStart = temp3 & temp6
    temp7 = basis_bits.bit_2 & basis_bits.bit_3
    temp8 = temp7 & ~temp1
    temp9 = basis_bits.bit_4 & ~basis_bits.bit_5
    temp10 = basis_bits.bit_6 & basis_bits.bit_7
    temp11 = temp9 & temp10
    lex.Semicolon = temp8 & temp11
    temp12 = basis_bits.bit_4 & basis_bits.bit_5
    temp13 = basis_bits.bit_6 | basis_bits.bit_7
    temp14 = temp12 & ~temp13
    lex.LAngle = temp8 & temp14
    temp15 = temp12 & temp5
    lex.RAngle = temp8 & temp15
    temp16 = basis_bits.bit_1 & ~basis_bits.bit_0
    temp17 = basis_bits.bit_3 & ~basis_bits.bit_2
    temp18 = temp16 & temp17
    lex.LBracket = temp18 & temp11
    temp19 = basis_bits.bit_7 & ~basis_bits.bit_6
    temp20 = temp12 & temp19
    lex.RBracket = temp18 & temp20
    temp21 = basis_bits.bit_4 | basis_bits.bit_5
    temp22 = temp19 & ~temp21
    lex.Exclam = temp3 & temp22
    temp23 = temp12 & temp10
    lex.QMark = temp8 & temp23
    lex.Hyphen = temp3 & temp20
    lex.Equals = temp8 & temp20
    temp24 = temp4 & temp10
    lex.SQuote = temp3 & temp24
    temp25 = temp5 & ~temp21
    lex.DQuote = temp3 & temp25
    lex.Slash = temp3 & temp23
    temp26 = temp10 & ~temp21
    lex.Hash = temp3 & temp26
    temp27 = temp16 & temp7
    temp28 = temp9 & ~temp13
    lex.x = temp27 & temp28
    temp29 = temp9 & temp5
    lex.Colon = temp8 & temp29

    # --- ASCII name start / name character classes ---------------------
    temp30 = temp18 & temp23
    temp31 = temp30 | lex.Colon
    temp32 = temp16 & ~basis_bits.bit_2
    temp33 = basis_bits.bit_5 | temp10
    temp34 = basis_bits.bit_4 & temp33
    temp35 = ~temp34
    temp36 = temp21 | temp13
    # Select temp35 where bit_3 is set, temp36 where it is clear.
    temp37 = (basis_bits.bit_3 & temp35) | (~basis_bits.bit_3 & temp36)
    temp38 = temp32 & temp37
    temp39 = temp31 | temp38
    temp40 = temp16 & basis_bits.bit_2
    temp41 = temp40 & temp37
    lex.ASCII_name_start = temp39 | temp41
    temp42 = temp30 | lex.Hyphen
    temp43 = temp3 & temp15
    temp44 = temp42 | temp43
    temp45 = temp8 & ~temp34
    temp46 = temp44 | temp45
    temp47 = temp46 | temp38
    lex.ASCII_name_char = temp47 | temp41
    # Every byte with bit_0 set is also scanned through as part of a name.
    lex.NameScan = lex.ASCII_name_char | basis_bits.bit_0

    # --- Control characters and whitespace -----------------------------
    temp48 = temp1 | basis_bits.bit_2
    x00_x1F = ~temp48
    temp49 = basis_bits.bit_2 | basis_bits.bit_3
    temp50 = temp1 | temp49
    lex.CR = temp20 & ~temp50
    lex.LF = temp29 & ~temp50
    temp51 = temp9 & temp19
    lex.HT = temp51 & ~temp50
    lex.SP = temp3 & ~temp36
    temp52 = temp20 | temp29
    temp53 = temp52 | temp51
    temp54 = temp53 & ~temp50
    lex.WS = temp54 | lex.SP

    # --- Decimal and hexadecimal digits --------------------------------
    temp55 = basis_bits.bit_5 | basis_bits.bit_6
    temp56 = basis_bits.bit_4 & temp55
    lex.Digit = temp8 & ~temp56
    temp57 = temp16 & ~temp49
    temp58 = temp57 & ~basis_bits.bit_4
    temp59 = ~temp10
    # Select temp59 where bit_5 is set, temp13 where it is clear.
    temp60 = (basis_bits.bit_5 & temp59) | (~basis_bits.bit_5 & temp13)
    temp61 = temp58 & temp60
    temp62 = lex.Digit | temp61
    temp63 = temp16 & temp2
    temp64 = temp63 & ~basis_bits.bit_4
    temp65 = temp64 & temp60
    lex.Hex = temp62 | temp65

    # Any x00-x1F byte that is not whitespace is illegal; only positions
    # selected by EOF_mask are reported.
    lex_error = x00_x1F & ~lex.WS
    if lex_error & EOF_mask:
        error_tracker.NoteError("Error: illegal character", lex_error)

    # --- UTF-8 validation ----------------------------------------------
    u8.unibyte = ~basis_bits.bit_0
    u8.suffix = 0
    u8_error = 0
    u8_FFFE_FFFF = 0
    u8anyscope = 0  # local
    # All of the multibyte work is skipped when no byte has bit_0 set.
    if basis_bits.bit_0:
        u8.prefix = basis_bits.bit_0 & basis_bits.bit_1
        u8.prefix2 = u8.prefix & ~basis_bits.bit_2
        u8.prefix3 = u8.prefix & temp2
        u8.prefix4 = u8.prefix & temp7
        u8.suffix = basis_bits.bit_0 & ~basis_bits.bit_1
        temp66 = u8.prefix & ~temp49
        temp67 = temp21 | basis_bits.bit_6
        temp68 = temp66 & ~temp67
        temp69 = basis_bits.bit_5 & temp13
        temp70 = basis_bits.bit_4 | temp69
        temp71 = u8.prefix4 & temp70
        u8.badprefix = temp68 | temp71
        u8_error = u8.badprefix
        u8.scope22 = bitutil.Advance(u8.prefix2)
        u8anyscope = u8.scope22
        # Three- and four-byte sequences need the extra scope streams and
        # the range checks on their first suffix byte.
        if u8.prefix3 | u8.prefix4:
            xE0 = u8.prefix3 & ~temp36
            xED = u8.prefix3 & temp20
            xF0 = u8.prefix4 & ~temp36
            temp72 = temp4 & ~temp13
            xF4 = u8.prefix4 & temp72
            u8.xA0_xBF = u8.suffix & basis_bits.bit_2
            u8.x80_x9F = u8.suffix & ~basis_bits.bit_2
            u8.x90_xBF = u8.suffix & temp49
            u8.x80_x8F = u8.suffix & ~temp49
            xEF = u8.prefix3 & temp23
            temp73 = u8.suffix & temp7
            u8.xBF = temp73 & temp23
            u8.xBE = temp73 & temp15
            u8.xE0_scope = bitutil.Advance(xE0)
            u8.xED_scope = bitutil.Advance(xED)
            u8.xF0_scope = bitutil.Advance(xF0)
            u8.xF4_scope = bitutil.Advance(xF4)
            u8.xEF_scope = bitutil.Advance(xEF)
            u8.scope32 = bitutil.Advance(u8.prefix3)
            u8.scope33 = bitutil.Advance(u8.scope32)
            u8.scope42 = bitutil.Advance(u8.prefix4)
            u8.scope43 = bitutil.Advance(u8.scope42)
            u8.scope44 = bitutil.Advance(u8.scope43)

            u8lastscope = u8.scope22 | u8.scope33 | u8.scope44
            u8anyscope = u8lastscope | u8.scope32 | u8.scope42 | u8.scope43

            # Suffix ranges that are not allowed after particular prefixes.
            u8error1 = u8.xE0_scope & u8.x80_x9F
            u8error2 = u8.xED_scope & u8.xA0_xBF
            u8error3 = u8.xF0_scope & u8.x80_x8F
            u8error4 = u8.xF4_scope & u8.x90_xBF

            u8_error |= u8error1 | u8error2 | u8error3 | u8error4

            # EF BF followed by BE or BF.
            EF_BF_pending = bitutil.Advance(u8.xEF_scope & u8.xBF)

            u8_FFFE_FFFF = EF_BF_pending & (u8.xBE | u8.xBF)
        # Every scope position must hold a suffix byte, and vice versa.
        u8mismatch = u8anyscope ^ u8.suffix
        u8_error |= u8mismatch | u8_FFFE_FFFF
        if u8_error:
            error_tracker.NoteError("UTF-8 error found", u8_error)
310
311
def Add_scope_streams(lex, scope1):
    # Fill scope1.LAngle, scope1.Hyphen and scope1.QMark from two Advance
    # calls on combined streams, then split the results apart again.
    # scope1.RefStart is reset to 0.
    langle_or_hyphen = lex.LAngle | lex.Hyphen
    hyphen_or_qmark = lex.Hyphen | lex.QMark
    after_first = bitutil.Advance(langle_or_hyphen)
    after_second = bitutil.Advance(hyphen_or_qmark)
    # Hyphen is in both combined streams; LAngle and QMark in one each.
    scope1.LAngle = after_first & ~after_second
    scope1.Hyphen = after_first & after_second
    scope1.QMark = after_second & ~after_first
    scope1.RefStart = 0  # default
324
def Parse_CtCDPI(ctCDPI_Callouts, lex, scope1, check_streams):
    # Parse comments (Ct), CDATA sections (CD) and processing instructions
    # (PI).  Fills the callout streams in ctCDPI_Callouts, sets
    # scope1.RBracket when any "]" is present, sets
    # check_streams.misc_mask, and reports syntax errors through
    # error_tracker.NoteError.

    # Reset every callout stream and the local accumulators.
    ctCDPI_Callouts.CD_end = 0
    ctCDPI_Callouts.Ct_starts = 0
    ctCDPI_Callouts.Ct_ends = 0
    ctCDPI_Callouts.CD_starts = 0
    ctCDPI_Callouts.CD_ends = 0
    ctCDPI_Callouts.PI_starts = 0
    ctCDPI_Callouts.PI_name_starts = 0
    ctCDPI_Callouts.PI_name_ends = 0
    ctCDPI_Callouts.PI_ends = 0
    ctCDPI_Callouts.CtCDPI_mask = 0
    ctCDPI_error = 0
    opens_seen = 0
    Ct_errors = 0

    # "]]>" candidates only need computing when a "]" exists at all.
    if lex.RBracket:
        scope1.RBracket = bitutil.Advance(lex.RBracket)
        double_rbracket = scope1.RBracket & lex.RBracket
        ctCDPI_Callouts.CD_end = bitutil.Advance(double_rbracket) & lex.RAngle

    # Candidate openers "<?" and "<!", marked on the second character.
    pi_open = scope1.LAngle & lex.QMark
    ctcd_open = scope1.LAngle & lex.Exclam
    markup_open = pi_open | ctcd_open

    # Candidate closers "--" and "?>", marked on the second character.
    double_hyphen = scope1.Hyphen & lex.Hyphen
    pi_close = scope1.QMark & lex.RAngle

    #
    # Initiate the scan
    cursor = bitutil.ScanToFirst(markup_open)
    while cursor:
        opens_seen |= cursor
        pi_cursor = cursor & pi_open
        cd_ct_cursor = bitutil.Advance(cursor & ~pi_cursor)
        cd_cursor = cd_ct_cursor & lex.LBracket
        ct_cursor = cd_ct_cursor & lex.Hyphen
        ctCDPI_Callouts.PI_starts |= pi_cursor
        ctCDPI_Callouts.CD_starts |= cd_cursor
        ctCDPI_Callouts.Ct_starts |= ct_cursor
        ct_cursor = bitutil.Advance(ct_cursor)
        Ct_errors |= ct_cursor & ~lex.Hyphen
        # Advance twice past <!--, so that we don't treat <!---
        # as being a terminated comment.
        ct_cursor = bitutil.Advance(ct_cursor)
        ct_cursor = bitutil.Advance(ct_cursor)
        pi_cursor = bitutil.Advance(pi_cursor)
        ctCDPI_Callouts.PI_name_starts |= pi_cursor
        pi_name_end = bitutil.ScanThru(pi_cursor, lex.NameScan)
        ctCDPI_Callouts.PI_name_ends |= pi_name_end
        pi_cursor = bitutil.ScanTo(pi_name_end, pi_close)
        cd_cursor = bitutil.ScanTo(cd_cursor, ctCDPI_Callouts.CD_end)
        ct_at_double_hyphen = bitutil.ScanTo(ct_cursor, double_hyphen)
        ct_cursor = bitutil.Advance(ct_at_double_hyphen)
        ctCDPI_Callouts.PI_ends |= pi_cursor
        ctCDPI_Callouts.CD_ends |= cd_cursor
        ctCDPI_Callouts.Ct_ends |= ct_cursor
        # Continue from the end of this markup to the next opener.
        cursor = pi_cursor | cd_cursor | ct_cursor
        cursor = bitutil.ScanTo(cursor, markup_open)

        # The mask and error streams are recomputed from the accumulated
        # callouts on every iteration.
        all_ends = ctCDPI_Callouts.CD_ends | ctCDPI_Callouts.Ct_ends | ctCDPI_Callouts.PI_ends
        ctCDPI_Callouts.CtCDPI_mask = bitutil.Advance(all_ends) - opens_seen
        # A comment must end with ">" directly after its closing "--".
        ctCDPI_error = Ct_errors | (ctCDPI_Callouts.Ct_ends & ~lex.RAngle)
        # A PI name not followed by whitespace must be followed by "?>".
        name_end_not_ws = ctCDPI_Callouts.PI_name_ends & ~lex.WS
        ctCDPI_error |= bitutil.Advance(name_end_not_ws) & ~pi_close
        # Empty PI name.
        ctCDPI_error |= ctCDPI_Callouts.PI_name_starts & ctCDPI_Callouts.PI_name_ends
        # If any of the Comment, CDATA or PI markups are unterminated, it is an error.
        ctCDPI_error |= ctCDPI_Callouts.CtCDPI_mask & ~EOF_mask

    if ctCDPI_error:
        error_tracker.NoteError("Error in comment, CDATA or processing instruction syntax", ctCDPI_error)

    ct_pi_ends = ctCDPI_Callouts.Ct_ends | ctCDPI_Callouts.PI_ends
    ct_pi_starts = ctCDPI_Callouts.Ct_starts | ctCDPI_Callouts.PI_starts
    ct_pi_spans = bitutil.Advance(ct_pi_ends) - ct_pi_starts
    check_streams.misc_mask = (lex.WS | lex.LAngle | ct_pi_spans | opens_seen) & EOF_mask
393
def Parse_tags(lex, scope1, ctCDPI_Callouts, tag_Callouts):
    # Parse start tags, empty-element tags and end tags in parallel.
    #
    # Reads the lexical streams in `lex`, scope1.LAngle and
    # ctCDPI_Callouts.CtCDPI_mask; writes the element-name, attribute-name,
    # attribute-value, empty-tag and end-tag callouts in tag_Callouts.
    # Syntax errors are reported through error_tracker.NoteError.

    # Delimiters for scans.
    DQuoteDelim = lex.DQuote | lex.LAngle
    SQuoteDelim = lex.SQuote | lex.LAngle
    AttListDelim = lex.Slash | lex.RAngle

    # Start the parallel parsing by inspecting the character
    # after the opening "<" of a tag.
    LAngleFollow = scope1.LAngle & ~ctCDPI_Callouts.CtCDPI_mask
    tag_Callouts.ElemName_starts = LAngleFollow & ~lex.Slash
    tag_Callouts.EndTag_marks = LAngleFollow & lex.Slash

    # Start Tag/Empty Element Tag Parsing

    # Advance all cursors by scanning through the tag name.
    tag_Callouts.ElemName_ends = bitutil.ScanThru(tag_Callouts.ElemName_starts, lex.NameScan)
    # Must have at least one name character for a legal start tag.
    # Mark any occurrences of null names as errors.
    ParseError = tag_Callouts.ElemName_starts & tag_Callouts.ElemName_ends

    # Initialize the accumulators for attribute name and value positions.
    tag_Callouts.AttName_starts = 0
    tag_Callouts.AttName_ends = 0
    EqToCheck = 0
    tag_Callouts.AttVal_starts = 0
    AttValEnds = 0
    tag_Callouts.AttVal_ends = 0

    # After the element name, there may or may not be an attlist.
    AfterWS = bitutil.ScanThru(tag_Callouts.ElemName_ends, lex.WS)
    AttListEnd = AfterWS & AttListDelim
    AttNameStart = AfterWS & ~AttListDelim
    # At least one WS character is required between ElemNames and AttNames.
    ParseError |= tag_Callouts.ElemName_ends & AttNameStart

    #
    # The following loop iterates through attributes within a start tag.
    # Because all start tags are processed in parallel, the number of
    # iterations is the maximum number of attributes found in any one
    # start tag, plus one.
    while AttNameStart:
        tag_Callouts.AttName_starts |= AttNameStart
        AttNameFollow = bitutil.ScanThru(AttNameStart, lex.NameScan)
        tag_Callouts.AttName_ends |= AttNameFollow
        # Scan through WS to the expected '=' delimiter; the scan is
        # skipped when no cursor is sitting on whitespace.
        if AttNameFollow & lex.WS:
            EqExpected = bitutil.ScanThru(AttNameFollow, lex.WS)
        else:
            EqExpected = AttNameFollow
        EqToCheck |= EqExpected
        # Step over the '=' position and any following whitespace.
        AttValPos = bitutil.ScanThru(EqExpected, EqExpected | lex.WS)
        tag_Callouts.AttVal_starts |= AttValPos
        DQuoteAttVal = AttValPos & lex.DQuote
        SQuoteAttVal = AttValPos & lex.SQuote
        # Scan to the matching quote (or an illegal "<"), excluding the
        # opening quote itself from the set of delimiters.
        DQuoteAttEnd = bitutil.ScanTo(DQuoteAttVal, DQuoteDelim & ~DQuoteAttVal)
        SQuoteAttEnd = bitutil.ScanTo(SQuoteAttVal, SQuoteDelim & ~SQuoteAttVal)
        AttValEnd = DQuoteAttEnd | SQuoteAttEnd
        AttValEnds |= AttValEnd
        AttValFollow = bitutil.Advance(AttValEnd)
        tag_Callouts.AttVal_ends |= AttValFollow
        # Scan through WS after the value, again skipping the scan when no
        # cursor is on whitespace.  (The original followed this if/else
        # with an unconditional ScanThru that overwrote its result, making
        # the optimization dead code; that duplicate has been removed.)
        if AttValFollow & lex.WS:
            AfterWS = bitutil.ScanThru(AttValFollow, lex.WS)
        else:
            AfterWS = AttValFollow
        AttListEnd |= AfterWS & AttListDelim
        AttNameStart = AfterWS & ~AttListDelim

    # No more attribute values to process when AttNameStart == 0.
    # Mark any "/" characters found as the ends of empty element tags.
    tag_Callouts.EmptyTag_marks = bitutil.Advance(AttListEnd & lex.Slash)

    # Check for errors.
    ParseError |= tag_Callouts.AttVal_ends & tag_Callouts.AttName_starts  # No intervening WS.
    ParseError |= tag_Callouts.AttName_starts & tag_Callouts.AttName_ends  # Null AttName
    ParseError |= EqToCheck & ~lex.Equals  # = not found where expected.
    ParseError |= tag_Callouts.AttVal_starts & ~(lex.DQuote | lex.SQuote)
    ParseError |= AttValEnds & ~(lex.DQuote | lex.SQuote)
    ParseError |= tag_Callouts.EmptyTag_marks & ~lex.RAngle

    # End Tag Parsing

    EndTagEnds = bitutil.ScanThru(tag_Callouts.EndTag_marks, tag_Callouts.EndTag_marks | lex.NameScan)
    if EndTagEnds & lex.WS:
        EndTagEnds = bitutil.ScanThru(EndTagEnds, lex.WS)
    ParseError |= EndTagEnds & ~lex.RAngle
    if ParseError:
        error_tracker.NoteError("Tag parsing error found", ParseError)

    # Attribute value spans
    tag_Callouts.AttVal_spans = tag_Callouts.AttVal_ends - tag_Callouts.AttVal_starts
490
def Parse_refs(lex, scope1, ctCDPI_Callouts, ref_Callouts):
    # Parse general, decimal and hexadecimal references.  Fills the
    # start/end callouts in ref_Callouts, sets scope1.RefStart when any
    # reference is present, and reports errors through
    # error_tracker.NoteError.
    ref_Callouts.GenRef_starts = 0
    ref_Callouts.GenRef_ends = 0
    ref_Callouts.DecRef_starts = 0
    ref_Callouts.DecRef_ends = 0
    ref_Callouts.HexRef_starts = 0
    ref_Callouts.HexRef_ends = 0
    ref_error = 0

    # "&" characters outside comments, CDATA sections and PIs.
    amp_marks = lex.RefStart & ~ctCDPI_Callouts.CtCDPI_mask
    # All remaining "&" must be reference start characters; parse them.
    if amp_marks:
        scope1.RefStart = bitutil.Advance(amp_marks)
        # "&#" introduces a numeric reference; anything else is general.
        hash_marks = scope1.RefStart & lex.Hash
        ref_Callouts.GenRef_starts = scope1.RefStart & ~lex.Hash
        after_hash = bitutil.Advance(hash_marks)
        # "&#x" introduces a hexadecimal reference.
        x_marks = after_hash & lex.x
        ref_Callouts.DecRef_starts = after_hash & ~lex.x
        ref_Callouts.HexRef_starts = bitutil.Advance(x_marks)
        ref_Callouts.GenRef_ends = bitutil.ScanThru(ref_Callouts.GenRef_starts, lex.NameScan)
        ref_Callouts.DecRef_ends = bitutil.ScanThru(ref_Callouts.DecRef_starts, lex.Digit)
        ref_Callouts.HexRef_ends = bitutil.ScanThru(ref_Callouts.HexRef_starts, lex.Hex)
        # Error checks
        # At least one digit required for DecRef, one hex digit for HexRef.
        missing_digit = ref_Callouts.DecRef_starts & ~lex.Digit
        missing_hex = ref_Callouts.HexRef_starts & ~lex.Hex
        # Semicolon terminator required (also covers unterminated at EOF).
        all_ref_ends = ref_Callouts.GenRef_ends | ref_Callouts.DecRef_ends | ref_Callouts.HexRef_ends
        missing_semicolon = all_ref_ends & ~lex.Semicolon
        ref_error = missing_digit | missing_hex | missing_semicolon
        if ref_error:
            error_tracker.NoteError("Reference error found", ref_error)
523
524
525
def Validate_xml_names(ctCDPI_Callouts, ref_Callouts, tag_Callouts, lex, u8, xml_names, check_streams):
    # Check colon usage in names and set the streams used to post-validate
    # non-ASCII names.
    #
    # Reads the name start/end callouts from ctCDPI_Callouts, ref_Callouts
    # and tag_Callouts, plus lex and u8.suffix.  Writes
    # check_streams.non_ascii_name_starts and check_streams.non_ascii_names.
    # Errors are reported through error_tracker.NoteError; `xml_names` is
    # accepted but not written here.

    # Name spans, formed as (ends - starts) for each kind of name.
    PI_names = ctCDPI_Callouts.PI_name_ends - ctCDPI_Callouts.PI_name_starts
    GenRefs = ref_Callouts.GenRef_ends - ref_Callouts.GenRef_starts
    ElemNames = tag_Callouts.ElemName_ends - tag_Callouts.ElemName_starts
    AttNames = tag_Callouts.AttName_ends - tag_Callouts.AttName_starts
    # Element and attribute names may contain one colon; PI names and
    # general references may contain none.
    qname_stream = ElemNames | AttNames
    ncname_stream = PI_names | GenRefs
    name_stream = qname_stream | ncname_stream

    # First position of every name span.  The original computed this
    # identical expression twice (as name_start and name_cursor); it is
    # now computed once and used for both purposes.
    name_start = name_stream & ~bitutil.Advance(name_stream)

    # A name may not begin with a colon.
    void_prefix_err = name_start & lex.Colon
    # First colon in each name, if any.
    namespace_sep = bitutil.ScanThru(name_start, lex.NameScan & ~lex.Colon) & lex.Colon
    # The colon must be followed by a name character ...
    local_part_start = bitutil.Advance(namespace_sep)
    local_part_err = local_part_start & ~lex.NameScan
    # ... and the local part may not contain a second colon.
    colon2_err = bitutil.ScanThru(local_part_start, lex.NameScan & ~lex.Colon) & lex.Colon
    ncname_err = ncname_stream & lex.Colon
    namespace_error = void_prefix_err | local_part_err | colon2_err | ncname_err
    if namespace_error:
        error_tracker.NoteError("error found", namespace_error)

    check_streams.non_ascii_name_starts = name_start & ~lex.ASCII_name_start
    check_streams.non_ascii_names = (name_stream & ~name_start) & ~lex.ASCII_name_char & ~u8.suffix
549
def Do_check_streams(ctCDPI_Callouts, tag_Callouts, lex, u8, scope1, ref_Callouts, xml_names, check_streams):
    # Report any "]]>" found outside markup and attribute values, then set
    # check_streams.tag_marks, .name_follows and .att_refs.
    # lex, u8, ref_Callouts and xml_names are accepted but not used.

    # Ensure that no occurrence of ]]> occurs outside of markup.
    allowed_region = ctCDPI_Callouts.CtCDPI_mask | tag_Callouts.AttVal_spans
    CD_end_error = ctCDPI_Callouts.CD_end & ~allowed_region

    # Consolidate and check for errors
    if CD_end_error:
        error_tracker.NoteError("Error: ]]> in text", CD_end_error)

    tag_marks = tag_Callouts.EmptyTag_marks | tag_Callouts.ElemName_starts
    tag_marks = tag_marks | tag_Callouts.EndTag_marks | tag_Callouts.AttName_starts
    check_streams.tag_marks = tag_marks
    check_streams.name_follows = tag_Callouts.ElemName_ends | tag_Callouts.AttName_ends
    # Reference scope positions that fall inside attribute values.
    check_streams.att_refs = tag_Callouts.AttVal_spans & scope1.RefStart
562
def Form_Length_Group_Bitstreams(tag_Callouts):
    # Sort element names into length groups of two (1-2, 3-4, ... 15-16)
    # and write one stream per group into tag_Callouts.
    #
    # A cursor is moved two positions at a time from each name start.
    # Names whose end marker, or the position just after it, coincides
    # with the cursor belong to the current group; the rest carry on.
    name_starts = tag_Callouts.ElemName_starts
    name_ends = tag_Callouts.ElemName_ends

    # End marker plus the position after it, so that both lengths of a
    # pair are caught by one test.
    end_or_next = name_ends | bitutil.Advance(name_ends)

    # Group symbols of length 1 and 2
    at_2 = bitutil.Advance(bitutil.Advance(name_starts))
    tag_Callouts.ElemName_ends_1_to_2 = at_2 & end_or_next

    # Group symbols of length 3 and 4
    at_4 = bitutil.Advance(bitutil.Advance(at_2 & ~end_or_next))
    tag_Callouts.ElemName_ends_3_to_4 = at_4 & end_or_next

    # Group symbols of length 5 and 6
    at_6 = bitutil.Advance(bitutil.Advance(at_4 & ~end_or_next))
    tag_Callouts.ElemName_ends_5_to_6 = at_6 & end_or_next

    # Group symbols of length 7 and 8
    at_8 = bitutil.Advance(bitutil.Advance(at_6 & ~end_or_next))
    tag_Callouts.ElemName_ends_7_to_8 = at_8 & end_or_next

    # Group symbols of length 9 and 10
    at_10 = bitutil.Advance(bitutil.Advance(at_8 & ~end_or_next))
    tag_Callouts.ElemName_ends_9_to_10 = at_10 & end_or_next

    # Group symbols of length 11 and 12
    at_12 = bitutil.Advance(bitutil.Advance(at_10 & ~end_or_next))
    tag_Callouts.ElemName_ends_11_to_12 = at_12 & end_or_next

    # Group symbols of length 13 and 14
    at_14 = bitutil.Advance(bitutil.Advance(at_12 & ~end_or_next))
    tag_Callouts.ElemName_ends_13_to_14 = at_14 & end_or_next

    # Group symbols of length 15 and 16
    at_16 = bitutil.Advance(bitutil.Advance(at_14 & ~end_or_next))
    tag_Callouts.ElemName_ends_15_to_16 = at_16 & end_or_next

    # Group symbols of length 17 and longer.  The marker is the cursor
    # sixteen positions past the name start, not the actual name end.
    tag_Callouts.ElemName_remaining_ends = at_16 & ~tag_Callouts.ElemName_ends_15_to_16
604
605
def Compute_Hash_Value_Bitstream(hash_data, basis_bits):
    # Set hash_data.Hash_value to the XOR of basis bit streams 2, 4 and 6.
    # (The original kept a commented-out alternative using bits 3, 5, 7.)
    folded = basis_bits.bit_2 ^ basis_bits.bit_4
    hash_data.Hash_value = folded ^ basis_bits.bit_6
609
def Main(basis_bits, lex, u8, xml_char, scope1, ctCDPI_Callouts, ref_Callouts, tag_Callouts, masks, xml_names, check_streams, hash_data):
    # Run every bitstream computation stage in order.  Each stage reads
    # streams written by earlier stages, so the order must be kept.
    # xml_char and masks are accepted but not used here.

    # Stage 1: byte classification (UTF-8, whitespace, control characters,
    # XML lexical items) together with UTF-8 validation and scope streams.
    Classify_bytes_Validate_utf8(basis_bits, lex, u8)

    # Stage 2: one-position scope streams for "<", "-" and "?".
    Add_scope_streams(lex, scope1)

    # Stage 3: parse all comments, CDATA sections and processing
    # instructions.
    Parse_CtCDPI(ctCDPI_Callouts, lex, scope1, check_streams)

    # Stage 4: all remaining '<' must be tag start characters; parse tags.
    Parse_tags(lex, scope1, ctCDPI_Callouts, tag_Callouts)

    # Stage 5: all remaining '&' must be reference start characters;
    # parse them.
    Parse_refs(lex, scope1, ctCDPI_Callouts, ref_Callouts)

    # Stage 6: validate XML namespaces and generate bit streams to post
    # validate non-ascii range XML names.
    Validate_xml_names(ctCDPI_Callouts, ref_Callouts, tag_Callouts, lex, u8, xml_names, check_streams)

    # Stage 7: final "]]>" check and the remaining check streams.
    Do_check_streams(ctCDPI_Callouts, tag_Callouts, lex, u8, scope1, ref_Callouts, xml_names, check_streams)

    # Stages 8-9: these are needed for Parallel Bitstream Based Length
    # Sorting of element names.
    Form_Length_Group_Bitstreams(tag_Callouts)

    Compute_Hash_Value_Bitstream(hash_data, basis_bits)
Note: See TracBrowser for help on using the repository browser.