source: proto/parallel_tag_parse.py @ 491

Last change on this file since 491 was 275, checked in by cameron, 10 years ago

Combined CallOut? streams

File size: 7.2 KB
Line 
1#
2# parallel_tag_parse.py
3#
4# Parallel XML Tag Parsing with Bitstream Addition
5# Conceptual Prototype
6# Robert D. Cameron
7# revised Jan. 25, 2009
8#
9# We use python's unlimited precision integers for unbounded bit streams.
10# This permits simple logical operations on the entire stream.
11# Assumption: bitstreams are little-endian (e.g., as on x86).
12#
13# XML Start Tag Syntax
14#    '<' Name (S Name S? '=' S? (('"' [^<"]* '"') | ("'" [^<']* "'")))* S? '>'
15# Note:  '<' cannot occur in legal AttValues. 
16#
17# This demo models the situation after the document prolog (XML
18# declaration, DTD) # and all Comments, PIs and CDATA sections
19# have been identified and eliminated from further processing.
20
21# Utility functions for demo purposes.
22
23# Given a string and a boolean function on characters, make a bit stream
24# as an unlimited precision integer.   This is slow, but simple.
25def mk_bitstream(str, fn):
26        stream = 0
27        bit = 1
28        for ch in str:
29                if fn(ch): stream += bit
30                bit += bit
31        return stream
32
33#
34def readfile(filename):
35        f = open(filename)
36        contents = f.read()
37        f.close()
38        return contents
39
40def bitstream2string(stream, lgth):
41        str = ""
42        for i in range(lgth):
43                if stream & 1 == 1: str += '1'
44                else: str += '_'
45                stream >>= 1
46        return str
47
48#
49#  ScanThru is the core operation for parallel scanning.
50#  Give a bitstream Cursors marking current cursor positions,
51#  advance each cursor through occurrences of a character
52#  class identified by ScanStream.
53
54def ScanThru(Cursors, ScanStream):
55        return (Cursors + ScanStream) & ~ScanStream
56
57#
58# Advance all cursors by one position.
59def Advance(stream):
60        return stream + stream
61
62
63# The main demo function.  Assume no prolog/PI/Ct/CDATA remain.
64
65def parse_tags(XML_doc):
66        # Add a sentinel to force errors at EOF.
67        x8data=XML_doc+'<'
68
69        # Define some needed character class bitstreams.
70        WS = mk_bitstream(x8data, lambda c: c in " \t\n\r")
71        LAngle = mk_bitstream(x8data, lambda c: c == '<')
72        RAngle = mk_bitstream(x8data, lambda c: c == '>')
73        Equal = mk_bitstream(x8data, lambda c: c == '=')
74        DQuote = mk_bitstream(x8data, lambda c: c == '"')
75        DQuoteDelim = DQuote | LAngle
76        SQuote = mk_bitstream(x8data, lambda c: c == '\'')
77        SQuoteDelim = SQuote | LAngle
78        Slash = mk_bitstream(x8data, lambda c: c == '/')
79        AttListDelim = Slash | RAngle
80        # The following is an incomplete definition for demo purposes only.
81        NameDelim = mk_bitstream(x8data, lambda c: c in "#=/><\"\'; \t\n\r&,|!?][()")
82       
83        # Start the parallel parsing by inspecting the character
84        # after the opening "<" of a tag.
85        LAngleFollow = Advance(LAngle)
86        ElemNamePositions = LAngleFollow & ~Slash
87        EndTagSeconds = LAngleFollow & Slash
88       
89        # Start Tag/Empty Element Tag Parsing
90
91        # Advance all cursors by scanning through the tag name.
92        ElemNameFollows = ScanThru(ElemNamePositions, ~NameDelim)
93        # Must have at least one name character for a legal start tag.
94        # Mark any occurrences of null names as errors.
95        ParseError = ElemNamePositions & ElemNameFollows
96        ElemNames = ElemNameFollows - ElemNamePositions
97       
98        # Initialize the accumulators for attribute name and value positions.
99        AttNameStarts = 0 
100        AttNameFollows = 0
101        EqToCheck = 0
102        AttValStarts = 0
103        AttValEnds = 0
104        AttValFollows = 0
105
106        # After the element name, there may or may not be an attlist.
107        AfterWS = ScanThru(ElemNameFollows, WS)
108        AttListEnd = AfterWS & AttListDelim
109        AttNameStart = AfterWS & ~AttListDelim
110        # At least one WS character is required between ElemNames and AttNames.
111        ParseError |= ElemNameFollows & AttNameStart
112
113        #
114        # The following loop iterates through attributes within a start tag.
115        # Because all start tags are processed in parallel, the number of
116        # iterations is the maximum number of attributes found in any one
117        # start tag, plus one.
118        while AttNameStart > 0:
119                AttNameStarts |= AttNameStart
120                AttNameFollow = ScanThru(AttNameStart, ~NameDelim)
121                AttNameFollows |= AttNameFollow
122                # Scan through WS to the expected '=' delimiter.
123                EqExpected = ScanThru(AttNameFollow, WS)
124                EqToCheck |= EqExpected
125                AttValPos = ScanThru(Advance(EqExpected), WS)
126                AttValStarts |= AttValPos
127                DQuoteAttVal = AttValPos & DQuote
128                SQuoteAttVal = AttValPos & SQuote
129                DQuoteAttEnd = ScanThru(Advance(DQuoteAttVal), ~DQuoteDelim)
130                SQuoteAttEnd = ScanThru(Advance(SQuoteAttVal), ~SQuoteDelim)
131                AttValEnd = DQuoteAttEnd | SQuoteAttEnd
132                AttValEnds |= AttValEnd
133                AttValFollow = Advance(AttValEnd)
134                AttValFollows |= AttValFollow
135                AfterWS = ScanThru(AttValFollow, WS)
136                AttListEnd |= AfterWS & AttListDelim
137                AttNameStart = AfterWS & ~AttListDelim
138
139        # No more attribute values to process when AttNameStart == 0.
140
141        AttNames = AttNameFollows - AttNameStarts
142        AttVals = AttValFollows - AttValStarts
143        STagEnds = AttListEnd & RAngle
144        # Mark any "/" characters found as the ends of empty element tags.
145        EmptyTagEnds = Advance(AttListEnd & Slash)
146        Tags = (STagEnds | EmptyTagEnds) - ElemNamePositions
147
148        # Check for errors.
149        ParseError |= AttValFollows & AttNameStarts # No intervening WS.
150        ParseError |= AttNameStarts & AttNameFollows # Null AttName
151        ParseError |= EqToCheck & ~Equal # = not found where expected.
152        ParseError |= AttValStarts & ~ (DQuote | SQuote)
153        ParseError |= AttValEnds & ~ (DQuote | SQuote)
154        ParseError |= EmptyTagEnds & ~RAngle
155
156        # End Tag Parsing
157        EndTagEnds = ScanThru(ScanThru(Advance(EndTagSeconds), ~NameDelim), WS)
158        ParseError |= EndTagEnds & ~RAngle
159        EndTags = EndTagEnds - EndTagSeconds
160
161        return {'ElemNames' : ElemNames,
162                'AttNames' : AttNames,
163                'AttVals' : AttVals,
164                'Tags' : Tags,
165                'EmptyTagMarks' : EmptyTagEnds,
166                'EndTags' : EndTags,
167                'ParseError' : ParseError}
168
169
170def show_results(xml_data, parse_results, outfile):
171        doc_lgth = len(xml_data)
172        outfile.write("Document:%s%s\n" % (" "*16, xml_data))
173#       for k in ['ElemNamePositions', 'ElemNameFollows', 'STagEnds',
174#                 'EmptyTagEnds', 'AttNameStarts', 'AttNameFollows', 'AttValStarts', 'AttValEnds',
175#                       'EndTagSeconds', 'EndTagEnds']:
176        for k in ['ElemNames', 'AttNames', 'AttVals', 'Tags', 'EmptyTagMarks', 'EndTags']:
177                outfile.write("%s:%s%s\n" % (k, ' '*(24-len(k)), bitstream2string(parse_results[k], doc_lgth)))
178        k = 'ParseError'
179        outfile.write("%s:%s%s\n" % (k, ' '*(24-len(k)), bitstream2string(parse_results[k], doc_lgth+1)))
180       
181               
182demos= ["<t a='1' b=\"478\" c='d'> stuff <a/> more <b zz3  =  '222'/>   and more </t>",
183        "<trash]]> <more trash='all'badatt='3'> < a=' '> ",
184        "<broken =   ' 37'/>  <good not ~= 'DDD'>  <font color = red>  <bad_empty / >",
185        "<illegal att='badguy<'  att2='good>' ><OK  att2='good>' >",
186        "<good/> <unterminated a='",
187        "<good/> <unterminated a=''",
188        "<good/> <unterminated a=''/",
189        "<good/> <unterminated a=''/>",
190        "<good/> <unterminated a='",
191        "<good/> <unterminated a=",
192        "<good/> <unterminated a",
193        "<good/> <unterminated ",
194        "<good/> <unterminated",
195        "<WSerror a = '345'b='123'>",
196        "<EqExpectedError  a * 'b'> <OK a = 'b'>  <err2 a = 'b' c * 'd'>",
197        "<Top x='1'><a><leaf>4</leaf>;<void/></a></top>"
198]
199
200import sys
201def main():
202        if len(sys.argv) < 2:
203                for d in demos:
204                        results = parse_tags(d)
205                        show_results(d, results, sys.stdout)
206        else: 
207                x8data = readfile(sys.argv[1])
208                results = parse_tags(x8data)
209                if len(sys.argv) == 3:
210                        outfile = open(sys.argv[2],"w")
211                        show_results(x8data, results, outfile)
212                        outfile.close()
213                else: show_results(x8data, results, sys.stdout)
214if __name__ == "__main__": main()
215
216
Note: See TracBrowser for help on using the repository browser.