source: proto/parallel_tag_parse.py @ 261

Last change on this file since 261 was 261, checked in by cameron, 10 years ago

Research prototypes directory.

File size: 7.0 KB
RevLine 
[261]1#
2# parallel_tag_parse.py
3#
4# Parallel XML Tag Parsing with Bitstream Addition
5# Conceptual Prototype
6# Robert D. Cameron
7# revised Jan. 25, 2009
8#
9# We use python's unlimited precision integers for unbounded bit streams.
10# This permits simple logical operations on the entire stream.
11# Assumption: bitstreams are little-endian (e.g., as on x86).
12#
13# XML Start Tag Syntax
14#    '<' Name (S Name S? '=' S? (('"' [^<"]* '"') | ("'" [^<']* "'")))* S? '>'
15# Note:  '<' cannot occur in legal AttValues. 
16#
17# This demo models the situation after the document prolog (XML
18# declaration, DTD) # and all Comments, PIs and CDATA sections
19# have been identified and eliminated from further processing.
20
21# Utility functions for demo purposes.
22
23# Given a string and a boolean function on characters, make a bit stream
24# as an unlimited precision integer.   This is slow, but simple.
25def mk_bitstream(str, fn):
26        stream = 0
27        bit = 1
28        for ch in str:
29                if fn(ch): stream += bit
30                bit += bit
31        return stream
32
33#
34def readfile(filename):
35        f = open(filename)
36        contents = f.read()
37        f.close()
38        return contents
39
40def bitstream2string(stream, lgth):
41        str = ""
42        for i in range(lgth):
43                if stream & 1 == 1: str += '1'
44                else: str += '_'
45                stream >>= 1
46        return str
47
48#
49#  ScanThru is the core operation for parallel scanning.
50#  Give a bitstream Cursors marking current cursor positions,
51#  advance each cursor through occurrences of a character
52#  class identified by ScanStream.
53
54def ScanThru(Cursors, ScanStream):
55        return (Cursors + ScanStream) & ~ScanStream
56
57#
58# Advance all cursors by one position.
59def Advance(stream):
60        return stream + stream
61
62
63# The main demo function.  Assume no prolog/PI/Ct/CDATA remain.
64
65def parse_tags(XML_doc):
66        # Add a sentinel to force errors at EOF.
67        x8data=XML_doc+'<'
68
69        # Define some needed character class bitstreams.
70        WS = mk_bitstream(x8data, lambda c: c in " \t\n\r")
71        LAngle = mk_bitstream(x8data, lambda c: c == '<')
72        RAngle = mk_bitstream(x8data, lambda c: c == '>')
73        Equal = mk_bitstream(x8data, lambda c: c == '=')
74        DQuote = mk_bitstream(x8data, lambda c: c == '"')
75        DQuoteDelim = DQuote | LAngle
76        SQuote = mk_bitstream(x8data, lambda c: c == '\'')
77        SQuoteDelim = SQuote | LAngle
78        Slash = mk_bitstream(x8data, lambda c: c == '/')
79        AttListDelim = Slash | RAngle
80        # The following is an incomplete definition for demo purposes only.
81        NameDelim = mk_bitstream(x8data, lambda c: c in "#=/><\"\'; \t\n\r&,|!?][()")
82       
83        # Start the parallel parsing by inspecting the character
84        # after the opening "<" of a tag.
85        LAngleFollow = Advance(LAngle)
86        ElemNamePositions = LAngleFollow & ~Slash
87        EndTagSeconds = LAngleFollow & Slash
88       
89        # Start Tag/Empty Element Tag Parsing
90
91        # Advance all cursors by scanning through the tag name.
92        ElemNameFollows = ScanThru(ElemNamePositions, ~NameDelim)
93        # Must have at least one name character for a legal start tag.
94        # Mark any occurrences of null names as errors.
95        ParseError = ElemNamePositions & ElemNameFollows
96       
97        # Initialize the accumulators for attribute name and value positions.
98        AttNameStarts = 0 
99        AttNameFollows = 0
100        EqToCheck = 0
101        AttValStarts = 0
102        AttValEnds = 0
103        AttValFollows = 0
104
105        # After the element name, there may or may not be an attlist.
106        AfterWS = ScanThru(ElemNameFollows, WS)
107        AttListEnd = AfterWS & AttListDelim
108        AttNameStart = AfterWS & ~AttListDelim
109        # At least one WS character is required between ElemNames and AttNames.
110        ParseError |= ElemNameFollows & AttNameStart
111
112        #
113        # The following loop iterates through attributes within a start tag.
114        # Because all start tags are processed in parallel, the number of
115        # iterations is the maximum number of attributes found in any one
116        # start tag, plus one.
117        while AttNameStart > 0:
118                AttNameStarts |= AttNameStart
119                AttNameFollow = ScanThru(AttNameStart, ~NameDelim)
120                AttNameFollows |= AttNameFollow
121                # Scan through WS to the expected '=' delimiter.
122                EqExpected = ScanThru(AttNameFollow, WS)
123                EqToCheck |= EqExpected
124                AttValPos = ScanThru(Advance(EqExpected), WS)
125                AttValStarts |= AttValPos
126                DQuoteAttVal = AttValPos & DQuote
127                SQuoteAttVal = AttValPos & SQuote
128                DQuoteAttEnd = ScanThru(Advance(DQuoteAttVal), ~DQuoteDelim)
129                SQuoteAttEnd = ScanThru(Advance(SQuoteAttVal), ~SQuoteDelim)
130                AttValEnd = DQuoteAttEnd | SQuoteAttEnd
131                AttValEnds |= AttValEnd
132                AttValFollow = Advance(AttValEnd)
133                AttValFollows |= AttValFollow
134                AfterWS = ScanThru(AttValFollow, WS)
135                AttListEnd |= AfterWS & AttListDelim
136                AttNameStart = AfterWS & ~AttListDelim
137
138        # No more attribute values to process when AttNameStart == 0.
139
140        STagEnds = AttListEnd & RAngle
141        # Mark any "/" characters found as the ends of empty element tags.
142        EmptyTagEnds = Advance(AttListEnd & Slash)
143
144        # Check for errors.
145        ParseError |= AttValFollows & AttNameStarts # No intervening WS.
146        ParseError |= AttNameStarts & AttNameFollows # Null AttName
147        ParseError |= EqToCheck & ~Equal # = not found where expected.
148        ParseError |= AttValStarts & ~ (DQuote | SQuote)
149        ParseError |= AttValEnds & ~ (DQuote | SQuote)
150        ParseError |= EmptyTagEnds & ~RAngle
151
152        # End Tag Parsing
153        EndTagEnds = ScanThru(ScanThru(Advance(EndTagSeconds), ~NameDelim), WS)
154        ParseError |= EndTagEnds & ~RAngle
155
156        return {'ElemNamePositions' : ElemNamePositions,
157                'ElemNameFollows' : ElemNameFollows,
158                'STagEnds' : STagEnds,
159                'EmptyTagEnds' : EmptyTagEnds,
160                'ParseError' : ParseError,
161                'AttNameStarts' : AttNameStarts,
162                'AttNameFollows' : AttNameFollows,
163                'AttValStarts' : AttValStarts,
164                'AttValEnds' : AttValEnds,
165                'EndTagSeconds' : EndTagSeconds,
166                'EndTagEnds' : EndTagEnds}
167
168
169def show_results(xml_data, parse_results, outfile):
170        doc_lgth = len(xml_data)
171        outfile.write("Document:%s%s\n" % (" "*16, xml_data))
172        for k in ['ElemNamePositions', 'ElemNameFollows', 'STagEnds',
173                  'EmptyTagEnds', 'AttNameStarts', 'AttNameFollows', 'AttValStarts', 'AttValEnds',
174                        'EndTagSeconds', 'EndTagEnds']:
175                outfile.write("%s:%s%s\n" % (k, ' '*(24-len(k)), bitstream2string(parse_results[k], doc_lgth)))
176        k = 'ParseError'
177        outfile.write("%s:%s%s\n" % (k, ' '*(24-len(k)), bitstream2string(parse_results[k], doc_lgth+1)))
178       
179               
180demos= ["<t a='1' b=\"478\" c='d'> stuff <a/> more <b zz3  =  '222'/>   and more </t>",
181        "<trash]]> <more trash='all'badatt='3'> < a=' '> ",
182        "<broken =   ' 37'/>  <good not ~= 'DDD'>  <font color = red>  <bad_empty / >",
183        "<illegal att='badguy<'  att2='good>' ><OK  att2='good>' >",
184        "<good/> <unterminated a='",
185        "<good/> <unterminated a=''",
186        "<good/> <unterminated a=''/",
187        "<good/> <unterminated a=''/>",
188        "<good/> <unterminated a='",
189        "<good/> <unterminated a=",
190        "<good/> <unterminated a",
191        "<good/> <unterminated ",
192        "<good/> <unterminated",
193        "<WSerror a = '345'b='123'>",
194        "<EqExpectedError  a * 'b'> <OK a = 'b'>  <err2 a = 'b' c * 'd'>" 
195]
196
197import sys
198def main():
199        if len(sys.argv) < 2:
200                for d in demos:
201                        results = parse_tags(d)
202                        show_results(d, results, sys.stdout)
203        else: 
204                x8data = readfile(sys.argv[1])
205                results = parse_tags(x8data)
206                if len(sys.argv) == 3:
207                        outfile = open(sys.argv[2],"w")
208                        show_results(x8data, results, outfile)
209                        outfile.close()
210                else: show_results(x8data, results, sys.stdout)
211if __name__ == "__main__": main()
212
213
Note: See TracBrowser for help on using the repository browser.