source: proto/parabix2/asm.py @ 2592

Last change on this file since 2592 was 391, checked in by ksherdy, 9 years ago

Replaced SpanStreamStartsCursor? with SpanStreamsEndsCursor?.

  • Property svn:executable set to *
File size: 14.1 KB
Line 
1#
2# asm.py
3#
4# Array Set Model
5#
6# Kenneth S. Herdy
7# September 10, 2009
8#
9#----------------------------------------------------------------------------
10#
11# We use Parabix version 2 bit streams defined bit streams to prototype
12# the construction of the Saxon 6.5.5 TinyTree.
13#
14#----------------------------------------------------------------------------
15#
16# Processing Streams
17#
18# Processing streams represent a general class of bit streams used in bit stream programming. Member instances of processing streams
19# are used to construct byte space scanning and parsing operations in bit space. For example, a cursor stream, scan stream and
20# span stream may be used in combination to accumulate the lengths of each element name contained within an XML document in bit
21# space.
22#
23# Position Streams    - Marks the start position of each token with a 1bit.
24#                       Position Streams = Scan Stream & (~Advance(Scan Stream))
25#
26# Span Streams        - Marks the extents of each tokens within with a series of 1 bits.
27#                       Span Stream = End Position Stream - Start Position Streams
28#
29# Scan Stream         - Marks the position of each token with a 0 bit.
30#                       Scan Stream = ~Position Stream
31#
32# Cursor Stream       - Marks the one or more cursor positions with a 1 bit.
33#
34# Deletion Streams    - Marks bit positions for deletion.
35#
36# Mask Stream         - Conditionally marks all positions in the mask stream as all 1s or all 0s.   
37#                       Mask = ~Condition + 1; where Condition = 1 if true or Condition = 0 otherwise
38#
39#----------------------------------------------------------------------------
40# Core Processing Stream Operations
41#
42# 1. Get byte space position.
43# 2. Get byte space token length.
44# 3. Generate 3 bit codes.
45# 4. Filter bits / filter bytes.
46# 5. Accumulate population counts. For example, node counts.
47#
48
49import byteutil
50import byteclass
51import parabix2
52import bitutil
53import u8u16
54
55import sys
56
57#   REFERENCE ONLY (FOR TINYTREE IMPLEMENTATION)
58#   
59#       If we have streams for ElemNames that distinguish Start / End tag pairs versus Empty elements then
60#   this would enable element depth calculations based the above loop.       
61#
62def PosStreams2Depths(StartTagEnds, EmptyTagEnds, EndTagEnds, PIStarts, CtStarts, TextStarts, EOFMask):
63
64    Depth = [-1] * 10
65    NodeIndex = -1
66    CrtDepth = 0
67    Cursor = 1
68           
69    # Construct scan stream
70    ScanStream = ~(StartTagEnds | EmptyTagEnds | EndTagEnds | PIStarts | CtStarts | TextStarts) & EOFMask
71    Cursor = bitutil.ScanThru(Cursor, ScanStream)
72    Cursor &= EOFMask
73   
74    while Cursor > 0:
75        NotEndTagEndType = ((Cursor & (StartTagEnds | EmptyTagEnds | PIStarts | CtStarts | TextStarts)) >> bitutil.count_leading_zeroes(Cursor))
76       
77        TypeMask = ~NotEndTagEndType + 1
78       
79        # Increment or Decrement current depth conditionally
80        CrtDepth += ((Cursor & StartTagEnds) >> bitutil.count_leading_zeroes(Cursor))
81        CrtDepth -= ((Cursor & EmptyTagEnds) >> bitutil.count_leading_zeroes(Cursor))
82       
83        print str(NotEndTagEndType) + ',' + str( bitutil.extract_bit( (StartTagEnds | EmptyTagEnds | PIStarts | CtStarts | TextStarts), bitutil.count_leading_zeroes(Cursor) ) )
84       
85        NodeIndex += NotEndTagEndType 
86        Depth[NodeIndex] = (Depth[NodeIndex] & (~TypeMask)) | ((CrtDepth) & (TypeMask)) 
87       
88        Cursor = bitutil.Advance(Cursor)
89        Cursor = bitutil.ScanThru(Cursor,ScanStream)
90        Cursor &= EOFMask
91       
92    return Depth
93
94#   REFERENCE ONLY (FOR TINYTREE IMPLEMENTATION)
95def bitstream_2_child_attr_count(start_empty_tag_extents, attr_name_extents, EOF_mask):
96   
97    count = [0]*10
98    elem_pos = -1
99    cursor = 1
100    type = 0
101
102    start_empty_tag_pos = start_empty_tag_extents & (~Advance(start_empty_tag_extents))
103    attr_name_pos = attr_name_extents & (~Advance(attr_name_extents))   
104       
105    # construct scan stream
106    scan_stream = ~(start_empty_tag_pos | attr_name_pos) & EOF_mask
107    cursor = bitutil.ScanThru(cursor, scan_stream)
108    cursor &= EOF_mask
109   
110    while cursor > 0:
111        type = ((cursor & start_empty_tag_pos) >> bitutil.count_leading_zeroes(cursor))
112        elem_pos = elem_pos + type
113        count[elem_pos]= count[elem_pos] + ((~type) & 1)
114
115        cursor = Advance(cursor)
116        cursor = bitutil.ScanThru(cursor,scan_stream)
117        cursor &= EOF_mask
118
119    return count
120
121class NodeInfo:
122        # SAXON
123        # Type                  Integer         # Bit Stream            Binary Code
124        ELEMENT         =       1                       # ElemNamesStarts       0001
125        ATTRIBUTE       =       2 
126        TEXT            =       3                       # TextStarts            0011
127        PI          =   7                       # PIStarts                      0111
128        COMMENT     =   8                       # CtStarts                      1000
129        ROOT        =   9   
130        NAMESPACE   =   13
131
132        # ASM
133        END_TAGS        =       4                       # EndTagStarts          0100
134        CDATA           =       5                       # Undefined                     Undefined   
135
136
137#
138#----------------------------------------------------------------------------
139# Transforms a span stream into an array of span start positions.
140#----------------------------------------------------------------------------
141# Notes
142#
143def SpanStream2BytePosArray(SpanStream, EOFMask):
144    PosArray = []
145   
146    # Construct scan stream
147    SpanStarts = SpanStream & (~bitutil.Advance(SpanStream))
148    ScanStream = (~SpanStarts) & EOFMask
149   
150    # Set cursor
151    Cursor = 1
152    Cursor = bitutil.ScanThru(Cursor,ScanStream)
153    Cursor &= EOFMask
154   
155    while Cursor > 0:
156        # Accumulate position value
157        PosArray.append(bitutil.count_leading_zeroes(Cursor))
158
159        # Advance cursor
160        Cursor = bitutil.Advance(Cursor)
161        Cursor = bitutil.ScanThru(Cursor,ScanStream)
162        Cursor &= EOFMask
163    return PosArray
164   
165#
166#----------------------------------------------------------------------------
167# Transforms a span stream into an array of span lengths.
168#----------------------------------------------------------------------------
169# Notes
170#
171def SpanStream2LengthArray(SpanStream, EOFMask, LengthMultiplier=1):
172    LgthArray = []
173   
174    # Construct scan stream
175    SpanStreamStarts = SpanStream & (~bitutil.Advance(SpanStream))
176    SpanStreamEnds = SpanStream + SpanStreamStarts
177    ScanStream = ~(SpanStreamStarts | SpanStreamEnds) & EOFMask
178       
179    # Set start position cursor
180    SpanStreamStartsCursor = 1
181    SpanStreamStartsCursor = bitutil.ScanThru(SpanStreamStartsCursor,ScanStream)
182    SpanStreamStartsCursor &= EOFMask
183   
184    while SpanStreamStartsCursor > 0:
185        # Set end positions cursor
186        SpanStreamEndsCursor = bitutil.Advance(SpanStreamStartsCursor)
187        SpanStreamEndsCursor = bitutil.ScanThru(SpanStreamStartsCursor,ScanStream)
188        SpanStreamEndsCursor &= EOFMask
189       
190        # Accumulate span length
191        LgthArray.append((bitutil.count_leading_zeroes(SpanStreamEndsCursor) - bitutil.count_leading_zeroes(SpanStreamStartsCursor) + 1) * LengthMultiplier)
192       
193        # Advance cursor
194        SpanStreamStartsCursor = bitutil.Advance(SpanStreamEndsCursor)
195        SpanStreamStartsCursor = bitutil.ScanThru(SpanStreamStartsCursor,ScanStream)
196        SpanStreamStartsCursor &= EOFMask
197    return LgthArray
198
199#
200#----------------------------------------------------------------------------
201# Transforms a set of span stream into a TinyTree array list.
202#----------------------------------------------------------------------------
203# Notes
204#
205# 1. Add support for the Namespace node type
206# 2. A potential exists to use this method as an interleaving method, post TEXT, ATTR, ... processing
207# 2. Look into XPDM string value of a node
208
209def BuildTinyTreeArrays(ElemNames, PISpans, CtSpans, CDSpans, TextSpans, EmptyTagMarks, EndTagSpans, EOFMask, U16TextLengths):
210        MaxArrayLength = 100
211       
212        # NodeType
213        NodeIndex = 0
214        NodeCount = 0
215        NodeType = [-1] * MaxArrayLength
216       
217        # Offset / Length
218        CrtU16TextPos = 0
219        U16TextLengthIndex = 0
220        Offset = [-1] * MaxArrayLength
221        Length = [-1] * MaxArrayLength
222       
223        # Depth
224        CrtDepth = 0
225        Depth = [-1] * MaxArrayLength       
226       
227        # Construct starts
228        ElemNamesStarts = ElemNames & ~bitutil.Advance(ElemNames)
229        PIStarts = PISpans & ~bitutil.Advance(PISpans)
230        CtStarts = CtSpans & ~bitutil.Advance(CtSpans)
231        CDStarts = CDSpans & ~bitutil.Advance(CDSpans)
232        TextStarts = TextSpans & ~bitutil.Advance(TextSpans)
233        EndTagStarts = EndTagSpans & ~bitutil.Advance(EndTagSpans)             
234       
235        # Construct depth streams
236        IncDepth = ElemNamesStarts       
237        DecDepth = EmptyTagMarks | EndTagStarts
238       
239        # Construct node count increment stream
240        NodeCountInc = ElemNamesStarts | PIStarts | CtStarts | TextStarts
241       
242        # Mark all node start positions
243        AllNodeStarts = NodeCountInc | EndTagStarts
244        AllNodesScanStream = ~AllNodeStarts & EOFMask
245       
246        # Construct Bit Codes streams
247        CodeBit0 = ElemNamesStarts | TextStarts | PIStarts
248        CodeBit1 = TextStarts | PIStarts
249        CodeBit2 = PIStarts | EndTagStarts
250        CodeBit3 = CtStarts
251       
252                # Set cursor
253        Cursor = 1
254        Cursor = bitutil.ScanThru(Cursor, AllNodesScanStream)
255        Cursor &= EOFMask
256               
257        while Cursor > 0:
258                   
259                Pos = bitutil.count_leading_zeroes(Cursor)
260                               
261                                # Accumulate node type codes
262                Code = bitutil.extract_bit(CodeBit3, Pos) << 3 | bitutil.extract_bit(CodeBit2,Pos) << 2 | bitutil.extract_bit(CodeBit1,Pos) << 1 | bitutil.extract_bit(CodeBit0,Pos)
263                NodeType[NodeIndex] = Code
264                               
265                # Depths
266                Depth[NodeIndex] = CrtDepth
267                CrtDepth = CrtDepth + bitutil.extract_bit(IncDepth, Pos) - bitutil.extract_bit(DecDepth, Pos)
268
269                #Text Lengths
270                if Code == 3:
271                    Offset[NodeIndex] = CrtU16TextPos
272                    Length[NodeIndex] = U16TextLengths[U16TextLengthIndex]
273                    CrtU16TextPos += Length[NodeIndex] 
274                    U16TextLengthIndex += 1
275
276                                # Increment node counter
277                NodeIndex += bitutil.extract_bit(Cursor & NodeCountInc,Pos)
278                         
279                # Advance cursor         
280                Cursor = bitutil.Advance(Cursor)
281                Cursor = bitutil.ScanThru(Cursor, AllNodesScanStream)
282                Cursor &= EOFMask
283        NodeCount = NodeIndex
284                       
285        return (NodeCount, NodeType, Depth, Offset, Length)
286
287def DemoTinyTree(U8Data):
288    lgth = len(U8Data)
289
290    # Parse
291    (CtCDPIMarkup, Tags, Refs, U16Hi, U16Lo, DelMask, Error, Lex, U16DelMask, EOF_mask) = parabix2.parabix_parse(U8Data)
292   
293    # Text
294   
295    # 1. Construct UTF-8 indexed Text Spans
296    U8TextSpans = ~(Lex.LAngle | Tags.Tags | bitutil.Advance(Tags.Tags) | Tags.EndTags | bitutil.Advance(Tags.EndTags) | CtCDPIMarkup.CtCDPI_mask) & EOF_mask
297
298    # 2. Extract UTF-8 Text into U8TextBuffer
299    U8TextBuffer = bitutil.filter_bytes(U8Data, (~U8TextSpans & EOF_mask))
300   
301    U8TextLengths = SpanStream2LengthArray(U8TextSpans, EOF_mask)
302             
303    # 3. Transcode UTF-8 text bufer to UTF-16
304    (bit, U8TextBuffer_EOF_mask) = bitutil.transpose_streams(U8TextBuffer)     
305   
306    # Classify bytes for UTF-8 processing, whitespace and control
307    # processing and XML lexical analysis.
308    (u8, control, lex) = byteclass.classify_bytes(bit)
309
310    # Validate UTF-8 multibyte sequences and determine the UTF-8 scope streams.
311    u8 = u8u16.validate_utf8(u8)   
312   
313    # Convert to UTF-16 bit streams.
314    (u16TextBufferHi, u16TextBufferLo, u16TextBufferDelMask) = u8u16.u8u16(u8, bit)
315   
316    # Inverse transpose_u8_byte_streams
317    U8TextBufferLength = 0
318    for U8TextLength in U8TextLengths:
319        U8TextBufferLength = U8TextBufferLength + U8TextLength
320   
321    U16TextBufferHi = bitutil.filter_bytes(bitutil.inverse_transpose(u16TextBufferHi, U8TextBufferLength), u16TextBufferDelMask)
322    U16TextBufferLo = bitutil.filter_bytes(bitutil.inverse_transpose(u16TextBufferLo, U8TextBufferLength), u16TextBufferDelMask)
323   
324    # Construct UTF-16 data buffer
325    U16TextBuffer = bitutil.merge_bytes(U16TextBufferLo, U16TextBufferHi)   
326   
327    # 4. Apply UTF-16 deletion mask to UTF-8 text spans to extract UTF-16 text spans.
328    U16TextSpans = bitutil.filter_bits(U8TextSpans, U16DelMask)
329   
330    U16TextLengths = SpanStream2LengthArray(U16TextSpans, EOF_mask, 2)
331   
332    print U8TextLengths
333    print U16TextLengths
334
335#    DEBUG
336#    start = 0
337#    end = 0
338#    for i in range(len(U16TextLengths)):
339#        end = start + U16TextLengths[i]
340#        print str(start) + str(end)
341#        print U16TextBuffer[start:end].decode('utf16').encode('utf-8')
342#        start = end
343
344         
345    print U16TextBuffer.decode('utf16').encode('utf-8')
346
347   
348    bitutil.print_aligned_u8_byte_streams([ ('input data', U8Data),
349                                    ('U8TextSpans', bitutil.bitstream2string(U8TextSpans, lgth)),
350                                    ('EOF_mask',bitutil.bitstream2string(EOF_mask,lgth)),
351                                    ('U16DelMask', bitutil.bitstream2string(U16DelMask, lgth)),
352                                    ('U16TextSpans', bitutil.bitstream2string(U16TextSpans, lgth)),
353                                    ('U8TextBuffer', U8TextBuffer)
354                                    ])
355                                           
356    # Declare TinyTree arrays
357    NodeType = []
358    Offset = []
359    Length = []
360    Depth = []
361   
362    (NodeCount, NodeType, Depth, Offset, Length) = BuildTinyTreeArrays(Tags.ElemNames, CtCDPIMarkup.PI_span, CtCDPIMarkup.Ct_span, CtCDPIMarkup.CD_span, U8TextSpans, Tags.EmptyTagMarks, Tags.EndTags, EOF_mask, U16TextLengths)   
363   
364    Info = NodeInfo()
365    CrtOffset = 0
366    CrtLength = 0
367    TextIndex = 0
368       
369    byteutil.print_arrays([ ('Node Count', NodeCount),
370                            ('NodeType', NodeType),
371                            ('Depth', Depth),
372                            ('Offset', Offset),
373                            ('Length', Length)
374                            ])   
375   
376    return
377
378if __name__ == "__main__":
379    #import doctest
380    #doctest.testmod()
381    if len(sys.argv) > 1:
382        # Read file as UTF-8 and convert to UTF-8 bytes
383        U8Data = bitutil.readfile(sys.argv[1]) 
384        DemoTinyTree(U8Data)
385        #parabix2.demo_parabix(U8Data)
386    else:
387        print("Usage: python parabix2.py <file>") 
388   
Note: See TracBrowser for help on using the repository browser.