source: proto/parallel_block_parse.py @ 491

Last change on this file since 491 was 280, checked in by cameron, 10 years ago

Crediting Ehsan

File size: 11.4 KB
Line 
1#
2# parallel_block_parse.py
3#
4# Block by Block Parallel XML Tag Parsing with Bitstream Addition
5# Conceptual Prototype
6# adapted by Ehsan Amiri from original code by Robert D. Cameron
7# revised March. 14, 2009
8#
9# We use python's unlimited precision integers for unbounded bit streams.
10# This permits simple logical operations on the entire stream.
11# Assumption: bitstreams are little-endian (e.g., as on x86).
12#
13# XML Start Tag Syntax
14#    '<' Name (S Name S? '=' S? (('"' [^<"]* '"') | ("'" [^<']* "'")))* S? '>'
15# Note:  '<' cannot occur in legal AttValues. 
16#
17# This demo models the situation after the document prolog (XML
18# declaration, DTD) # and all Comments, PIs and CDATA sections
19# have been identified and eliminated from further processing.
20
21BlockSize = 16
22BlockDivisor = 1<<BlockSize
23Mask = BlockDivisor - 1
24
25# Utility functions for demo purposes.
26# Given a string and a boolean function on characters, make a bit stream
27# as an unlimited precision integer.   This is slow, but simple.
28def mk_bitstream(str, fn):
29        stream = 0
30        bit = 1
31        for ch in str:
32                if fn(ch): stream += bit
33                bit += bit
34        return stream
35
36#
37def readfile(filename):
38        f = open(filename)
39        contents = f.read()
40        f.close()
41        return contents
42
43def bitstream2string(stream, lgth):
44        str = ""
45        for i in range(lgth):
46                if stream & 1 == 1: str += '1'
47                else: str += '_'
48                stream >>= 1
49        return str
50
51#
52#  ScanThru is the core operation for parallel scanning.
53#  Give a bitstream Cursors marking current cursor positions,
54#  advance each cursor through occurrences of a character
55#  class identified by ScanStream.
56
57def ScanThru(Cursors, ScanStream):
58        ScanStream &= Mask
59        return (Cursors + ScanStream) & ~ScanStream
60
61#
62# Advance all cursors by one position.
63def Advance(stream):
64        return (stream + stream)&Mask
65        #return stream + stream
66
67
68# The main demo function.  Assume no prolog/PI/Ct/CDATA remain.
69
70def parse_tags(XML_doc):
71        # Add a sentinel to force errors at EOF.
72        x8data=XML_doc+'<'
73        #Each Advance variable keeps the last bit of the previous block that must be used in
74        #the current block when Advance() is applied.
75        Advance1 = 0
76        Advance2 = 0
77        Advance3 = 0
78        Advance4 = 0
79        Advance5 = 0
80        Advance6 = 0
81
82        #Each Carry variable keeps the carry produced by the ScanThru operation on the previous block.
83        Carry1 = 0
84        Carry2 = 0
85        Carry3 = 0
86        Carry4 = 0
87        Carry5 = 0
88        Carry6 = 0
89        Carry7 = 0
90        Carry8 = 0
91
92        #Advance and Carry variables for End Tag parsing.
93        EAdvance1 = 0
94        ECarry1 = 0
95        ECarry2 = 0
96
97        #Required for the inner loop. Indicates if any Carry or Advance variable is set
98        #during processing of the previous block.
99        AnyCarry = 0
100
101        #Size of input data processed so far
102        Counter = 0
103
104        #Final results i.e. return values of the function.
105        FinalElemNamePositions = 0
106        FinalElemNameFollows = 0
107        FinalSTagEnds = 0
108        FinalEmptyTagEnds = 0
109        FinalParseError = 0
110        FinalAttNameStarts = 0
111        FinalAttNameFollows = 0
112        FinalAttValStarts = 0
113        FinalAttValEnds = 0
114        FinalEndTagSeconds = 0
115        FinalEndTagEnds = 0
116
117        # This is a loop over blocks of the input text
118        while Counter < len(x8data):
119                Current = x8data[Counter: Counter+BlockSize]
120                # Define some needed character class bitstreams.
121                WS = mk_bitstream(Current, lambda c: c in " \t\n\r")
122                LAngle = mk_bitstream(Current, lambda c: c == '<')
123                RAngle = mk_bitstream(Current, lambda c: c == '>')
124                Equal = mk_bitstream(Current, lambda c: c == '=')
125                DQuote = mk_bitstream(Current, lambda c: c == '"')
126                DQuoteDelim = DQuote | LAngle
127                SQuote = mk_bitstream(Current, lambda c: c == '\'')
128                SQuoteDelim = SQuote | LAngle
129                Slash = mk_bitstream(Current, lambda c: c == '/')
130                AttListDelim = Slash | RAngle
131                # The following is an incomplete definition for demo purposes only.
132                NameDelim = mk_bitstream(Current, lambda c: c in "#=/><\"\'; \t\n\r&,|!?][()")
133
134                # Start the parallel parsing by inspecting the character
135                # after the opening "<" of a tag.
136                LAngleFollow = Advance1 | Advance(LAngle)
137                Advance1 = LAngle>>(BlockSize-1)       
138
139                ElemNamePositions = LAngleFollow & ~Slash
140                EndTagSeconds = LAngleFollow & Slash
141
142                # Start Tag/Empty Element Tag Parsing
143
144                # Advance all cursors by scanning through the tag name.
145                ElemNameFollows = ScanThru(Carry1 | ElemNamePositions, ~NameDelim)
146                #Calculating the carry of the last Scan Thru operation
147                Carry1 = (ElemNameFollows & BlockDivisor) >> BlockSize
148                #Potential carry bit of the stream is set to zero.
149                ElemNameFollows = ElemNameFollows & Mask
150               
151                # Must have at least one name character for a legal start tag.
152                # Mark any occurrences of null names as errors.
153                ParseError = ElemNamePositions & ElemNameFollows
154       
155                # Initialize the accumulators for attribute name and value positions.
156                AttNameStarts = 0 
157                AttNameFollows = 0
158                EqToCheck = 0
159                AttValStarts = 0
160                AttValEnds = 0
161                AttValFollows = 0
162
163                # After the element name, there may or may not be an attlist.
164                AfterWS = ScanThru(Carry2 | ElemNameFollows, WS)
165                Carry2 = (AfterWS & BlockDivisor) >> BlockSize
166                AfterWS = AfterWS & Mask
167                AttListEnd = AfterWS & AttListDelim
168                AttNameStart = AfterWS & ~AttListDelim
169                # At least one WS character is required between ElemNames and AttNames.
170                ParseError |= ElemNameFollows & AttNameStart
171
172                #
173                # If any Advance or Carry variable is set in the inner loop one of the variables below
174                # will be set. After the end of the inner loop, it will be moved to the original
175                # Carry or Advance variable.
176                Advance2i = 0
177                Advance3i = 0
178                Advance4i = 0
179                Advance5i = 0
180
181                Carry3i = 0
182                Carry4i = 0
183                Carry5i = 0
184                Carry6i = 0
185                Carry7i = 0
186                Carry8i = 0
187                #
188                # The following loop iterates through those attributes within a start tag that reside
189                # in the same block.Because all start tags of the same block are processed in parallel,
190                # the number of iterations is the maximum number of attributes found in any one start tag,
191                # plus one.
192                while AttNameStart > 0 or AnyCarry > 0:
193                        AnyCarry = 0
194                        AttNameStarts |= AttNameStart
195
196                        AttNameFollow = ScanThru(Carry3|AttNameStart, ~NameDelim)
197                        # If there is a carry from the previous block it must be considered only once.
198                        Carry3 = 0
199                        Carry3i |= (AttNameFollow & BlockDivisor) >> BlockSize
200                        AttNameFollow = AttNameFollow & Mask
201                        AttNameFollows |= AttNameFollow
202
203                        # Scan through WS to the expected '=' delimiter.
204                        EqExpected = ScanThru(Carry4 | AttNameFollow, WS)
205                        Carry4 = 0
206                        Carry4i |= (EqExpected & BlockDivisor) >> BlockSize
207                        EqExpected = EqExpected& Mask
208                        EqToCheck |= EqExpected
209
210                        AttValPos = ScanThru(Carry5 | Advance2 | Advance(EqExpected), WS)
211                        Carry5 = 0
212                        Carry5i |=  (AttValPos & BlockDivisor) >> BlockSize
213                        AttValPos = AttValPos & Mask
214                        Advance2 = 0
215                        Advance2i |= EqExpected >> (BlockSize-1)
216
217                        AttValStarts |= AttValPos
218                        DQuoteAttVal = AttValPos & DQuote
219                        SQuoteAttVal = AttValPos & SQuote
220
221                        DQuoteAttEnd = ScanThru(Carry6 | Advance3 | Advance(DQuoteAttVal), ~DQuoteDelim)
222                        Carry6 = 0
223                        Carry6i |= (DQuoteAttEnd & BlockDivisor) >> BlockSize
224                        DQuoteAttEnd = DQuoteAttEnd & Mask
225                        Advance3 = 0
226                        Advance3i |= DQuoteAttVal >> (BlockSize-1)
227
228                        SQuoteAttEnd = ScanThru(Carry7 | Advance4 | Advance(SQuoteAttVal), ~SQuoteDelim)
229                        Carry7 = 0
230                        Carry7i |= (SQuoteAttEnd & BlockDivisor) >> BlockSize
231                        SQuoteAttEnd = SQuoteAttEnd & Mask
232                        Advance4 = 0
233                        Advance4i |= SQuoteAttVal >> (BlockSize-1)
234
235                        AttValEnd = DQuoteAttEnd | SQuoteAttEnd
236                        AttValEnds |= AttValEnd
237                        AttValFollow = Advance5 | Advance(AttValEnd)
238                        Advance5 = 0
239                        Advance5i |= AttValEnd >> (BlockSize-1)
240                        AttValFollows |= AttValFollow
241
242                        AfterWS = ScanThru(Carry8 | AttValFollow, WS)
243                        Carry8 = 0
244                        Carry8i |= (AfterWS & BlockDivisor) >> BlockSize
245                        AfterWS = AfterWS & Mask
246                        AttListEnd |= AfterWS & AttListDelim
247                        AttNameStart = AfterWS & ~AttListDelim
248
249                # No more attribute values to process when AttNameStart == 0.
250
251                STagEnds = AttListEnd & RAngle
252                # Mark any "/" characters found as the ends of empty element tags.
253                EmptyTagEnds = Advance6 | Advance(AttListEnd & Slash)
254                Advance6 = (AttListEnd & Slash) >> (BlockSize-1)
255                # Check for errors.
256                ParseError |= AttValFollows & AttNameStarts # No intervening WS.
257                ParseError |= AttNameStarts & AttNameFollows # Null AttName
258                ParseError |= EqToCheck & ~Equal # = not found where expected.
259                ParseError |= AttValStarts & ~ (DQuote | SQuote)
260                ParseError |= AttValEnds & ~ (DQuote | SQuote)
261
262                FinalElemNamePositions |= ElemNamePositions << Counter
263                FinalElemNameFollows |= ElemNameFollows << Counter
264                FinalSTagEnds |= STagEnds << Counter
265                FinalEmptyTagEnds |= EmptyTagEnds << Counter
266                FinalAttNameStarts |= AttNameStarts << Counter
267                FinalAttNameFollows |= AttNameFollows << Counter
268                FinalAttValStarts |= AttValStarts << Counter
269                FinalAttValEnds |= AttValEnds << Counter
270
271                Advance2 = Advance2i
272                Advance3 = Advance3i
273                Advance4 = Advance4i
274                Advance5 = Advance5i
275
276                Carry3 = Carry3i
277                Carry4 = Carry4i
278                Carry5 = Carry5i
279                Carry6 = Carry6i
280                Carry7 = Carry7i
281                Carry8 = Carry8i
282
283                AnyCarry = Advance2 | Advance3 | Advance4 | Advance5 | Carry3 | Carry4 | Carry5 | Carry6 | Carry7 | Carry8
284
285                # End Tag Parsing
286                ETNameEnds = ScanThru(ECarry1 | EAdvance1 | Advance(EndTagSeconds), ~NameDelim)
287                ECarry1  = (ETNameEnds & BlockDivisor) >> BlockSize
288                ETNameEnds &= Mask
289                EAdvance1 = EndTagSeconds >> (BlockSize-1)
290
291                EndTagEnds = ScanThru(ECarry2 | ETNameEnds, WS)
292                ECarry2 = (EndTagEnds & BlockDivisor) >> BlockSize
293                EndTagEnds &= Mask
294                ParseError |= EndTagEnds & ~RAngle
295               
296                FinalEndTagSeconds |= EndTagSeconds << Counter
297                FinalEndTagEnds |= EndTagEnds << Counter
298
299                ParseError |= EmptyTagEnds & ~RAngle
300                FinalParseError |= ParseError << Counter
301
302                Counter += BlockSize
303
304        return {'ElemNamePositions' : FinalElemNamePositions,
305                'ElemNameFollows' : FinalElemNameFollows,
306                'STagEnds' : FinalSTagEnds,
307                'EmptyTagEnds' : FinalEmptyTagEnds,
308                'ParseError' : FinalParseError,
309                'AttNameStarts' : FinalAttNameStarts,
310                'AttNameFollows' : FinalAttNameFollows,
311                'AttValStarts' : FinalAttValStarts,
312                'AttValEnds' : FinalAttValEnds,
313                'EndTagSeconds' : FinalEndTagSeconds,
314                'EndTagEnds' : FinalEndTagEnds}
315
316def show_results(xml_data, parse_results, outfile):
317        doc_lgth = len(xml_data)
318        outfile.write("Document:%s%s\n" % (" "*16, xml_data))
319        for k in ['ElemNamePositions', 'ElemNameFollows', 'STagEnds',
320                  'EmptyTagEnds', 'AttNameStarts', 'AttNameFollows', 'AttValStarts', 'AttValEnds',
321                        'EndTagSeconds', 'EndTagEnds']:
322                outfile.write("%s:%s%s\n" % (k, ' '*(24-len(k)), bitstream2string(parse_results[k], doc_lgth)))
323        k = 'ParseError'
324        outfile.write("%s:%s%s\n" % (k, ' '*(24-len(k)), bitstream2string(parse_results[k], doc_lgth+1)))
325
326
327
328demos = ["<t a='1' b=\"478\" c='d'> stuff <a/> more <b zz3  =  '222'/>   and more </t>",
329        "<trash]]> <more trash='all'badatt='3'> < a=' '> ",
330        "<broken =   ' 37'/>  <good not ~= 'DDD'>  <font color = red>  <bad_empty / >",
331        "<illegal att='badguy<'  att2='good>' ><OK  att2='good>' >",
332        "<good/> <unterminated a='",
333        "<good/><a>test</a  > <unterminated a=''",
334        "<good/><a>test</a  />  <unterminated a=''/",
335        "<good/> <unterminated a=''/>",
336        "<good/> <unterminated a='",
337        "<good/> <unterminated a=",
338        "<good/> <unterminated a",
339        "<good/> <unterminated ",
340        "<good/> <unterminated",
341        "<WSerror a = '345'b='123'>",
342        "<EqExpectedError  a * 'b'> <OK a = 'b'>  <err2 a = 'b' c * 'd'>" 
343       ]
344
345import sys
346def main():
347        if len(sys.argv) < 2:
348                for d in demos:
349                        results = parse_tags(d)
350                        show_results(d, results, sys.stdout)
351        else: 
352                x8data = readfile(sys.argv[1])
353                results = parse_tags(x8data)
354                if len(sys.argv) == 3:
355                        outfile = open(sys.argv[2],"w")
356                        show_results(x8data, results, outfile)
357                        outfile.close()
358                else: show_results(x8data, results, sys.stdout)
359if __name__ == "__main__": main()
Note: See TracBrowser for help on using the repository browser.