source: proto/parabix2/parse_tags_prescan.py @ 2163

Last change on this file since 2163 was 2000, checked in by cameron, 7 years ago

Tag parsing with attribute value prescanning

File size: 3.2 KB
Line 
#
# parse_tags_prescan.py
#
# A different version of parse_tags, with additional parallelism.
# Only attribute value strings are parsed sequentially; all
# other aspects of tag parsing are performed in parallel.
#
#
#
# Robert D. Cameron
# April 7, 2012
#
13
def Parse_tags(lex, marker, tag_Callouts):
    """Parse XML start/end tags in parallel, prescanning attribute values.

    Quoted attribute values are located first by a sequential loop (one
    iteration per successive quoted string following a tag name); all
    remaining tag structure is then checked for every tag at once.

    Args:
        lex: lexical streams. This function reads ``Slash``, ``RAngle``,
            ``DQuote``, ``SQuote``, ``NameScan``, ``WS`` and ``Equals``.
        marker: provides ``LAngle_scope``, used here as the position of
            the character after the opening "<" of each tag.
        tag_Callouts: output object. This function assigns
            ``ElemName_starts``, ``ElemName_ends``, ``EndTag_marks``,
            ``AttVal_starts``, ``AttVal_ends``, ``AttVal_spans``,
            ``AttName_starts``, ``AttName_ends`` and ``EmptyTag_marks``.

    Returns:
        None. Results are written to ``tag_Callouts``; if any error
        position was marked, ``error_tracker.NoteError`` is called with
        the accumulated error stream.

    NOTE(review): streams are presumably bit vectors held as integers
    (``AttVal_starts`` is initialised to ``0`` and combined with
    ``&``, ``|``, ``~`` and ``-``) -- confirm against the pablo runtime.
    """
    AttListDelim = lex.Slash | lex.RAngle
    Quote = lex.DQuote | lex.SQuote
    # Characters at which the prescan stops: a quote opens an attribute
    # value, a "/" or ">" ends the attribute list.
    transition_scanclass = Quote | AttListDelim

    # Start the parallel parsing by inspecting the character
    # after the opening "<" of a tag.
    tag_Callouts.ElemName_starts = marker.LAngle_scope & ~lex.Slash
    tag_Callouts.EndTag_marks = marker.LAngle_scope & lex.Slash

    # Advance all cursors by scanning through the tag name.
    tag_Callouts.ElemName_ends = pablo.ScanThru(tag_Callouts.ElemName_starts, lex.NameScan)
    # Must have at least one name character for a legal start tag.
    # A position that is both a name start and a name end is a null
    # name; mark it as an error.
    ParseError = tag_Callouts.ElemName_starts & tag_Callouts.ElemName_ends

    #
    # Prescanning - look for quoted strings first
    #
    tag_Callouts.AttVal_starts = 0
    tag_Callouts.AttVal_ends = 0
    transition_marker = pablo.ScanTo(tag_Callouts.ElemName_ends, transition_scanclass)
    quote_marker = transition_marker & Quote
    # Loop until no cursor rests on an opening quote.
    while quote_marker:
        dq = quote_marker & lex.DQuote
        sq = quote_marker & lex.SQuote
        tag_Callouts.AttVal_starts |= quote_marker
        # Scan each cursor to a quote of its own kind; the opening
        # quote itself is removed from the target class.
        end_quote = pablo.ScanTo(dq, lex.DQuote & ~dq) | pablo.ScanTo(sq, lex.SQuote & ~sq)
        tag_Callouts.AttVal_ends |= end_quote
        # Continue from the closing quote to the next quote or
        # attribute-list delimiter, excluding the closing quote itself.
        transition_marker = pablo.ScanTo(end_quote, transition_scanclass & ~end_quote)
        quote_marker = transition_marker & Quote

    #
    # Now remaining work can be done in parallel for all tags
    #
    # Positions just after an element name or after an attribute value.
    beforeWS = tag_Callouts.ElemName_ends | pablo.Advance(tag_Callouts.AttVal_ends)
    # Unless the attribute list ends here, whitespace is required.
    requireWS = beforeWS & ~AttListDelim
    ParseError |= requireWS & ~lex.WS

    afterWS = pablo.ScanThru(beforeWS, lex.WS)
    # Unless the attribute list ends here, an attribute name is required.
    requireName = afterWS & ~AttListDelim
    ParseError |= requireName & ~lex.NameScan

    tag_Callouts.AttName_starts = afterWS & lex.NameScan
    tag_Callouts.AttName_ends = pablo.ScanThru(tag_Callouts.AttName_starts, lex.NameScan)
    # Scan through WS to the expected '=' delimiter. The `if` test is an
    # optimization: the scan is skipped when no attribute name is
    # followed by whitespace.
    if tag_Callouts.AttName_ends & lex.WS:
        EqExpected = pablo.ScanThru(tag_Callouts.AttName_ends, lex.WS)
    else:
        EqExpected = tag_Callouts.AttName_ends
    ParseError |= EqExpected & ~lex.Equals
    # Scan through the '=' position and any following whitespace; the
    # cursor must then rest on a quote found by the prescan.
    AttValPos = pablo.ScanThru(EqExpected, EqExpected | lex.WS)
    ParseError |= AttValPos & ~tag_Callouts.AttVal_starts

    AttListEnd = (beforeWS | afterWS) & AttListDelim
    # NOTE(review): STagEnds is not used again within this function.
    STagEnds = AttListEnd & lex.RAngle
    # Mark any "/" characters found as the ends of empty element tags.
    tag_Callouts.EmptyTag_marks = pablo.Advance(AttListEnd & lex.Slash)

    # The character after the "/" of an empty element tag must be ">".
    ParseError |= tag_Callouts.EmptyTag_marks & ~lex.RAngle

    # End Tag Parsing

    # Scan through the "/" mark and the end tag name.
    EndTagEnds = pablo.ScanThru(
        tag_Callouts.EndTag_marks,
        tag_Callouts.EndTag_marks | lex.NameScan,
    )
    # Skip trailing whitespace only when some end tag actually has it.
    if EndTagEnds & lex.WS:
        EndTagEnds = pablo.ScanThru(EndTagEnds, lex.WS)
    ParseError |= EndTagEnds & ~lex.RAngle
    if ParseError:
        error_tracker.NoteError("Tag parsing error found", ParseError)

    # Attribute value spans
    tag_Callouts.AttVal_spans = tag_Callouts.AttVal_ends - tag_Callouts.AttVal_starts
89
90
Note: See TracBrowser for help on using the repository browser.