source: proto/Xerces/parabix_xerces.py @ 1217

Last change on this file since 1217 was 1217, checked in by cameron, 8 years ago

parabix_xerces.py Pablo code for XML Reader

File size: 11.3 KB
Line 
1#
2# Parallel XML Parsing with Bitstream Addition
3#
4# Version for Xerces XML Reader
5#   - assumes:  input transcoded to UTF-16,
6#               line break normalization applied,
7#               pseudo-ASCII bit streams
8#
9# Robert D. Cameron
10
class Lex:
    # Lexical input streams read by the parsing functions in this file.
    # Every field is declared with the value 0.

    # Marker for reference starts (Parse_refs treats these as "&").
    RefStart = 0
    Semicolon = 0
    Colon = 0

    # Markup delimiter streams.
    LAngle = 0
    RAngle = 0
    LBracket = 0
    RBracket = 0
    Exclam = 0
    QMark = 0
    Hyphen = 0
    Equals = 0
    SQuote = 0
    DQuote = 0
    Slash = 0
    Hash = 0

    # Used by Parse_refs to tell hexadecimal from decimal references.
    x = 0

    # Name-related streams; NameScan is the class scanned through for
    # element, attribute, PI and reference names.
    ASCII_name_start = 0
    ASCII_name_char = 0
    NameScan = 0

    # Digit classes and whitespace.
    Digit = 0
    Hex = 0
    WS = 0
34
class Control:
    # Control streams; every field is declared with the value 0.

    # delmask flags the bit stream positions that are to be treated as
    # deleted, i.e. positions at which no corresponding character data
    # should be produced.
    delmask = 0

    # Code units 0x0-0x1F other than HT, LF and CR, plus 0xFFFE and 0xFFFF.
    XML_error = 0

    # Bad UTF8 characters.
    ch_error = 0

    normalized_LF = 0
    HT = 0
    pseudo_bit0 = 0
45
class Scope1:
    # "Scope" streams: each one is written as bitutil.Advance of (a
    # combination of) the lexical stream with the same name, by
    # Add_scope_streams, Parse_CtCDPI or Parse_refs.
    # Every field is declared with the value 0.
    RefStart = 0
    LAngle = 0
    Hyphen = 0
    QMark = 0
    RBracket = 0
52
class CtCDPI_Callouts:
    # Output streams of Parse_CtCDPI (comments, CDATA sections and
    # processing instructions).  Every field is declared with the value 0.

    # Candidate CDATA terminator positions.
    CD_end = 0

    # Start / end positions per construct.
    Ct_starts = 0
    Ct_ends = 0
    CD_starts = 0
    CD_ends = 0
    PI_starts = 0
    PI_name_starts = 0
    PI_name_ends = 0
    PI_ends = 0

    # Mask over the comment / CDATA / PI markup, and the error stream.
    CtCDPI_mask = 0
    error = 0
65
class Ref_Callouts:
    # Output streams of Parse_refs: start / end positions for general,
    # decimal and hexadecimal references, plus an error stream.
    # Every field is declared with the value 0.
    GenRef_starts = 0
    GenRef_ends = 0

    DecRef_starts = 0
    DecRef_ends = 0

    HexRef_starts = 0
    HexRef_ends = 0

    error = 0
74
class Tag_Callouts:
    # Output streams of Parse_tags.  Every field is declared with the
    # value 0.

    # Element name boundaries.
    ElemName_starts = 0
    ElemName_ends = 0

    # Attribute name and value boundaries, and the value spans.
    AttName_starts = 0
    AttName_ends = 0
    AttVal_starts = 0
    AttVal_ends = 0
    AttVal_spans = 0

    # Tag-level marks.
    EmptyTag_marks = 0
    EndTag_marks = 0
    LAngleFollow = 0

    # Positions at which Parse_tags detected a syntax error.
    error = 0
87               
class Check_streams:
    # Streams for follow-up checking.  Do_check_streams writes tag_marks,
    # name_follows, att_refs and error_mask; the two non_ascii_* fields are
    # not assigned by any function in this file.
    # Every field is declared with the value 0.
    non_ascii_name_starts = 0
    non_ascii_names = 0

    tag_marks = 0
    name_follows = 0
    att_refs = 0

    error_mask = 0
95
class Xml_names:
    # Holds a single stream, declared with the value 0.
    # NOTE(review): no function in this file reads or writes it;
    # presumably it is filled in by code outside this file.
    namespace_error = 0
98
99       
def Add_scope_streams(lex, scope1):
    # Fill scope1.LAngle, scope1.Hyphen and scope1.QMark from the
    # corresponding lex streams.
    #
    # The direct formulation would be one bitutil.Advance per stream:
    #     scope1.LAngle = Advance(lex.LAngle)
    #     scope1.Hyphen = Advance(lex.Hyphen)
    #     scope1.QMark  = Advance(lex.QMark)
    # Here only two advances are issued, over overlapping unions, and the
    # three results are separated again with and/and-not.  This matches the
    # direct formulation as long as the three input streams never share a
    # position (assumed; they are distinct character classes).
    after_langle_or_hyphen = bitutil.Advance(lex.LAngle | lex.Hyphen)
    after_hyphen_or_qmark = bitutil.Advance(lex.Hyphen | lex.QMark)

    # Only in the first union -> came from LAngle.
    scope1.LAngle = after_langle_or_hyphen & ~after_hyphen_or_qmark
    # In both unions -> came from Hyphen.
    scope1.Hyphen = after_langle_or_hyphen & after_hyphen_or_qmark
    # Only in the second union -> came from QMark.
    scope1.QMark = after_hyphen_or_qmark & ~after_langle_or_hyphen
111
def Parse_CtCDPI(ctCDPI_Callouts, lex, scope1, check_streams):
        # Locate comments (Ct), CDATA sections (CD) and processing
        # instructions (PI), one construct per loop iteration, and record
        # their boundaries, a mask over them and an error stream.
        #
        # Parameters:
        #   ctCDPI_Callouts - receives every result; all of its fields are
        #                     reset to 0 first.
        #   lex             - lexical streams (read only).
        #   scope1          - LAngle, Hyphen and QMark are read; RBracket is
        #                     written when lex.RBracket is non-zero.
        #   check_streams   - not referenced in this function.
        #
        # NOTE(review): bitutil and EOF_mask are not defined in this file;
        # presumably they are supplied by the Pablo/Parabix toolchain.
        ctCDPI_Callouts.CD_end = 0
        ctCDPI_Callouts.Ct_starts = 0
        ctCDPI_Callouts.Ct_ends = 0
        ctCDPI_Callouts.CD_starts = 0
        ctCDPI_Callouts.CD_ends = 0
        ctCDPI_Callouts.PI_starts = 0
        ctCDPI_Callouts.PI_name_starts = 0
        ctCDPI_Callouts.PI_name_ends = 0
        ctCDPI_Callouts.PI_ends = 0
        ctCDPI_Callouts.CtCDPI_mask = 0
        ctCDPI_Callouts.error = 0
        # Local accumulators: every opener visited, and malformed "<!-" openers.
        CtCDPI_starts = 0
        Ct_errors = 0

        # Only look for "]]>" when the input contains a "]" at all.
        if lex.RBracket:
                scope1.RBracket = bitutil.Advance(lex.RBracket)
                # A "]" directly after a "]", advanced once more and required
                # to be ">" (assuming Advance moves markers forward by one).
                ctCDPI_Callouts.CD_end = bitutil.Advance(scope1.RBracket & lex.RBracket) & lex.RAngle
        # Openers are marked on the character that follows "<":
        # "?" for a PI, "!" for a comment or CDATA section.
        PI_start = scope1.LAngle & lex.QMark
        CtCD_start = scope1.LAngle & lex.Exclam
        CtCDPI_start = PI_start | CtCD_start

        # Terminator candidates: the second "-" of "--" and the ">" of "?>".
        DoubleHyphen = scope1.Hyphen & lex.Hyphen
        PI_end = scope1.QMark & lex.RAngle

        #
        # Initiate the scan
        CtCDPI_Cursor = bitutil.ScanToFirst(CtCDPI_start)
        while CtCDPI_Cursor:
                CtCDPI_starts |= CtCDPI_Cursor
                # Split the cursor by construct kind.
                PI_Cursor = CtCDPI_Cursor & PI_start
                # Non-PI cursors sit on "!"; step to the next character,
                # which selects CDATA ("[") or comment ("-").
                CD_Ct_Cursor = bitutil.Advance(CtCDPI_Cursor & ~PI_Cursor)
                CD_Cursor = CD_Ct_Cursor & lex.LBracket
                Ct_Cursor = CD_Ct_Cursor & lex.Hyphen
                ctCDPI_Callouts.PI_starts |= PI_Cursor
                ctCDPI_Callouts.CD_starts |= CD_Cursor
                ctCDPI_Callouts.Ct_starts |= Ct_Cursor
                # A comment opener needs a second "-".
                Ct_Cursor = bitutil.Advance(Ct_Cursor)
                Ct_errors |= Ct_Cursor & ~ lex.Hyphen
                # Advance twice past <!--, so that we don't treat <!---
                # as being a terminated comment.
                Ct_Cursor = bitutil.Advance(bitutil.Advance(Ct_Cursor))
                # PI: the name starts right after "?" and runs through NameScan.
                PI_Cursor = bitutil.Advance(PI_Cursor)
                ctCDPI_Callouts.PI_name_starts |= PI_Cursor
                PI_name_end = bitutil.ScanThru(PI_Cursor, lex.NameScan)
                ctCDPI_Callouts.PI_name_ends |= PI_name_end
                # Move each cursor to the terminator of its construct.  The
                # comment cursor is advanced one past "--"; that position is
                # checked to be ">" by the error computation below.
                PI_Cursor = bitutil.ScanTo(PI_name_end, PI_end)
                CD_Cursor = bitutil.ScanTo(CD_Cursor, ctCDPI_Callouts.CD_end)
                Ct_Cursor = bitutil.Advance(bitutil.ScanTo(Ct_Cursor, DoubleHyphen))
                ctCDPI_Callouts.PI_ends |= PI_Cursor
                ctCDPI_Callouts.CD_ends |= CD_Cursor
                ctCDPI_Callouts.Ct_ends |= Ct_Cursor
                # Resume from the terminator and look for the next opener.
                CtCDPI_Cursor = PI_Cursor | CD_Cursor | Ct_Cursor
                CtCDPI_Cursor = bitutil.ScanTo(CtCDPI_Cursor, CtCDPI_start)

                # NOTE: the remaining statements are part of the loop body.
                # They are recomputed on every iteration from the accumulated
                # streams and are skipped when no opener is found, in which
                # case the mask and error keep the zeros assigned above.
                #
                # Mask from each opener through its terminator, obtained by
                # subtraction (assumes earlier positions are lower-order
                # bits - TODO confirm).
                ctCDPI_Callouts.CtCDPI_mask = bitutil.Advance(ctCDPI_Callouts.CD_ends | ctCDPI_Callouts.Ct_ends | ctCDPI_Callouts.PI_ends) - CtCDPI_starts
                #ctCDPI_Callouts.error = Ct_ends & ~lex.RAngle | Ct_starts & ~ lex.Hyphen
                # Bad "<!-" opener, or "--" not immediately followed by ">".
                ctCDPI_Callouts.error = Ct_errors | ctCDPI_Callouts.Ct_ends & ~lex.RAngle
                # A PI name not followed by whitespace must be followed by "?>".
                ctCDPI_Callouts.error |= bitutil.Advance(ctCDPI_Callouts.PI_name_ends & ~ lex.WS) & ~ PI_end
                # Empty PI name: the name ends where it starts.
                ctCDPI_Callouts.error |= ctCDPI_Callouts.PI_name_starts & ctCDPI_Callouts.PI_name_ends
                # If any of the Comment, CDATA or PI markups are unterminated, it is an error.
                ctCDPI_Callouts.error |= ctCDPI_Callouts.CtCDPI_mask &~ EOF_mask
174               
175
def Parse_tags(lex, scope1, ctCDPI_Callouts, tag_Callouts):
    # Parse all start tags, empty-element tags and end tags in parallel.
    #
    # Parameters:
    #   lex             - lexical streams (read only).
    #   scope1          - only scope1.LAngle is read.
    #   ctCDPI_Callouts - only CtCDPI_mask is read, to ignore "<" inside
    #                     comments, CDATA sections and PIs.
    #   tag_Callouts    - receives every result: name / attribute boundary
    #                     streams, tag marks, AttVal_spans and error.
    #
    # NOTE(review): bitutil and simd_and are not defined in this file;
    # presumably they are supplied by the Pablo/Parabix toolchain.

    # Delimiters for scans.
    DQuoteDelim = lex.DQuote | lex.LAngle
    SQuoteDelim = lex.SQuote | lex.LAngle
    AttListDelim = lex.Slash | lex.RAngle

    # Start the parallel parsing by inspecting the character
    # after the opening "<" of a tag.
    tag_Callouts.LAngleFollow = scope1.LAngle & ~ctCDPI_Callouts.CtCDPI_mask
    tag_Callouts.ElemName_starts = tag_Callouts.LAngleFollow & ~lex.Slash
    tag_Callouts.EndTag_marks = tag_Callouts.LAngleFollow & lex.Slash

    # Start Tag/Empty Element Tag Parsing

    # Advance all cursors by scanning through the tag name.
    tag_Callouts.ElemName_ends = bitutil.ScanThru(tag_Callouts.ElemName_starts, lex.NameScan)
    # Must have at least one name character for a legal start tag.
    # Mark any occurrences of null names as errors.
    ParseError = tag_Callouts.ElemName_starts & tag_Callouts.ElemName_ends

    # Initialize the accumulators for attribute name and value positions.
    tag_Callouts.AttName_starts = 0
    tag_Callouts.AttName_ends = 0
    EqToCheck = 0
    tag_Callouts.AttVal_starts = 0
    AttValEnds = 0
    tag_Callouts.AttVal_ends = 0

    # After the element name, there may or may not be an attlist.
    AfterWS = bitutil.ScanThru(tag_Callouts.ElemName_ends, lex.WS)
    AttListEnd = AfterWS & AttListDelim
    AttNameStart = AfterWS & ~AttListDelim
    # At least one WS character is required between ElemNames and AttNames.
    ParseError |= tag_Callouts.ElemName_ends & AttNameStart

    #
    # The following loop iterates through attributes within a start tag.
    # Because all start tags are processed in parallel, the number of
    # iterations is the maximum number of attributes found in any one
    # start tag, plus one.
    while AttNameStart:
        tag_Callouts.AttName_starts |= AttNameStart
        AttNameFollow = bitutil.ScanThru(AttNameStart, lex.NameScan)
        tag_Callouts.AttName_ends |= AttNameFollow
        # Scan through WS to the expected '=' delimiter, but only pay for
        # the scan when some cursor actually sits on whitespace.
        if simd_and(AttNameFollow, lex.WS):
            EqExpected = bitutil.ScanThru(AttNameFollow, lex.WS)
        else:
            EqExpected = AttNameFollow
        EqToCheck |= EqExpected
        # Step over the '=' position and any whitespace after it.
        AttValPos = bitutil.ScanThru(EqExpected, EqExpected | lex.WS)
        tag_Callouts.AttVal_starts |= AttValPos
        DQuoteAttVal = AttValPos & lex.DQuote
        SQuoteAttVal = AttValPos & lex.SQuote
        # Scan to the matching quote; "<" also stops the scan so that it is
        # reported by the AttValEnds check below.
        DQuoteAttEnd = bitutil.ScanTo(DQuoteAttVal, DQuoteDelim & ~DQuoteAttVal)
        SQuoteAttEnd = bitutil.ScanTo(SQuoteAttVal, SQuoteDelim & ~SQuoteAttVal)
        AttValEnd = DQuoteAttEnd | SQuoteAttEnd
        AttValEnds |= AttValEnd
        # AttVal_ends records the position just past the closing quote.
        AttValFollow = bitutil.Advance(AttValEnd)
        tag_Callouts.AttVal_ends |= AttValFollow
        # Same whitespace shortcut as above.  The result is used as is: an
        # unconditional ScanThru after this test would only repeat the scan
        # and defeat the shortcut.
        if simd_and(AttValFollow, lex.WS):
            AfterWS = bitutil.ScanThru(AttValFollow, lex.WS)
        else:
            AfterWS = AttValFollow
        AttListEnd |= AfterWS & AttListDelim
        AttNameStart = AfterWS & ~AttListDelim

    # No more attribute values to process when AttNameStart == 0.
    # Mark any "/" characters found as the ends of empty element tags.
    tag_Callouts.EmptyTag_marks = bitutil.Advance(AttListEnd & lex.Slash)

    # Check for errors.
    ParseError |= tag_Callouts.AttVal_ends & tag_Callouts.AttName_starts  # No intervening WS.
    ParseError |= tag_Callouts.AttName_starts & tag_Callouts.AttName_ends  # Null AttName
    ParseError |= EqToCheck & ~lex.Equals  # = not found where expected.
    ParseError |= tag_Callouts.AttVal_starts & ~(lex.DQuote | lex.SQuote)
    ParseError |= AttValEnds & ~(lex.DQuote | lex.SQuote)
    ParseError |= tag_Callouts.EmptyTag_marks & ~lex.RAngle

    # End Tag Parsing

    # Scan over the "/" mark and the name that follows it.
    EndTagEnds = bitutil.ScanThru(tag_Callouts.EndTag_marks, tag_Callouts.EndTag_marks | lex.NameScan)
    if simd_and(EndTagEnds, lex.WS):
        EndTagEnds = bitutil.ScanThru(EndTagEnds, lex.WS)
    ParseError |= EndTagEnds & ~lex.RAngle
    tag_Callouts.error = ParseError

    # Attribute value spans
    tag_Callouts.AttVal_spans = tag_Callouts.AttVal_ends - tag_Callouts.AttVal_starts
270                       
def Parse_refs(lex, scope1, ctCDPI_Callouts, ref_Callouts):
    # Classify every reference outside comment / CDATA / PI markup as
    # general, decimal or hexadecimal, record where each one starts and
    # ends in ref_Callouts, and flag malformed references in
    # ref_Callouts.error.  scope1.RefStart is written only when at least
    # one reference start remains after masking.
    ref_Callouts.GenRef_starts = 0
    ref_Callouts.GenRef_ends = 0
    ref_Callouts.DecRef_starts = 0
    ref_Callouts.DecRef_ends = 0
    ref_Callouts.HexRef_starts = 0
    ref_Callouts.HexRef_ends = 0
    ref_Callouts.error = 0

    # All remaining "&" must be reference start characters; parse them.
    amp = lex.RefStart & ~ctCDPI_Callouts.CtCDPI_mask
    if amp:
        after_amp = bitutil.Advance(amp)
        scope1.RefStart = after_amp

        # "&#" introduces a numeric reference; anything else is general.
        at_hash = after_amp & lex.Hash
        ref_Callouts.GenRef_starts = after_amp & ~lex.Hash

        # "&#x" introduces a hexadecimal reference, else it is decimal.
        after_hash = bitutil.Advance(at_hash)
        at_x = after_hash & lex.x
        ref_Callouts.DecRef_starts = after_hash & ~lex.x
        ref_Callouts.HexRef_starts = bitutil.Advance(at_x)

        # Scan through the body of each kind of reference.
        ref_Callouts.GenRef_ends = bitutil.ScanThru(ref_Callouts.GenRef_starts, lex.NameScan)
        ref_Callouts.DecRef_ends = bitutil.ScanThru(ref_Callouts.DecRef_starts, lex.Digit)
        ref_Callouts.HexRef_ends = bitutil.ScanThru(ref_Callouts.HexRef_starts, lex.Hex)

        # Error checks.
        # A decimal reference needs at least one digit, a hexadecimal
        # reference at least one hex digit.
        no_digit = ref_Callouts.DecRef_starts & ~lex.Digit
        no_hex_digit = ref_Callouts.HexRef_starts & ~lex.Hex
        # Every reference must be closed by a semicolon (this also covers
        # references left unterminated at EOF).
        all_ends = ref_Callouts.GenRef_ends | ref_Callouts.DecRef_ends | ref_Callouts.HexRef_ends
        no_semicolon = all_ends & ~lex.Semicolon
        ref_Callouts.error = no_digit | no_hex_digit | no_semicolon
301   
def Do_check_streams(ctCDPI_Callouts, tag_Callouts, lex, u8, scope1, ref_Callouts, xml_names, check_streams):
    # Combine the per-stage error streams into check_streams.error_mask
    # and derive the tag_marks, name_follows and att_refs streams.
    #
    # u8 and xml_names are accepted but not referenced here.
    # NOTE(review): lex.error is read although the Lex class in this file
    # declares no "error" field, and EOF_mask is not defined in this file;
    # confirm where both come from.

    # Ensure that no occurrence of ]]> occurs outside of markup.
    inside_markup = ctCDPI_Callouts.CtCDPI_mask | tag_Callouts.AttVal_spans
    CD_end_error = ctCDPI_Callouts.CD_end & ~inside_markup

    # Consolidate and check for errors.
    lexical_errors = lex.error & EOF_mask
    check_streams.error_mask = (
        lexical_errors
        | ctCDPI_Callouts.error
        | tag_Callouts.error
        | CD_end_error
        | ref_Callouts.error
    )

    check_streams.tag_marks = (
        tag_Callouts.EmptyTag_marks
        | tag_Callouts.LAngleFollow
        | tag_Callouts.AttName_starts
    )
    check_streams.name_follows = tag_Callouts.ElemName_ends | tag_Callouts.AttName_ends
    # References that occur inside attribute values.
    check_streams.att_refs = tag_Callouts.AttVal_spans & scope1.RefStart
312   
313   
def Main(lex, scope1, ctCDPI_Callouts, ref_Callouts, tag_Callouts, masks, xml_names, check_streams):
    # Run the parsing stages in order; each stage reads streams written
    # by the stages before it.
    #
    # masks is accepted but not referenced here.

    Add_scope_streams(lex, scope1)

    # Parse all comments, CDATA sections and processing instructions.
    Parse_CtCDPI(ctCDPI_Callouts, lex, scope1, check_streams)

    # All remaining '<' must be tag start characters; parse tags.
    Parse_tags(lex, scope1, ctCDPI_Callouts, tag_Callouts)

    # All remaining '&' must be reference start characters; parse them.
    Parse_refs(lex, scope1, ctCDPI_Callouts, ref_Callouts)

    # Do_check_streams is declared with eight parameters:
    #   (ctCDPI_Callouts, tag_Callouts, lex, u8, scope1, ref_Callouts,
    #    xml_names, check_streams)
    # The call must supply all of them in that order, otherwise scope1,
    # ref_Callouts and check_streams bind to the wrong parameters.  There
    # is no u8 stream set in this function and Do_check_streams does not
    # read it, so None is passed as a placeholder.
    Do_check_streams(ctCDPI_Callouts, tag_Callouts, lex, None, scope1, ref_Callouts, xml_names, check_streams)
328
329
Note: See TracBrowser for help on using the repository browser.