Ignore:
Timestamp:
Feb 2, 2011, 11:41:20 PM (8 years ago)
Author:
ksherdy
Message:

Update template and compilable for separate compilation.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • proto/parabix2/parabix2_pablo.py

    r866 r881  
    44#
    55# Parallel XML Parsing with Bitstream Addition
     6#
    67# - Complete prototype for all bitstream computations in Parabix2
    7 # - optimized for compilation
    8 #
     8# - Optimized for compilation
     9# - Separate compilation
     10
    911# Robert D. Cameron
    1012# July 29, 2010
     
    4345  xF4_scope = 0
    4446  xEF_scope = 0
     47 
     48  FFFE_FFFF = 0
    4549  error = 0
    4650
    47 class Lex (BitStreamSet):
     51class Lex ():
    4852        x00_x1F = 0
    4953        CR = 0
     
    8286        CtCDPI_start = 0
    8387
    84 class Scope1 (BitStreamSet):
     88class Scope1 ():
    8589        RefStart = 0
    8690        LAngle = 0
     
    8993        RBracket = 0
    9094
    91 class CtCDPI_Callouts(BitStreamSet):
     95class CtCDPI_Callouts():
    9296        Ct_starts = 0
    9397        Ct_ends = 0
     
    101105        error = 0
    102106
    103 class Ref_Callouts(BitStreamSet):
     107class Ref_Callouts():
    104108        GenRef_starts = 0
    105109        GenRef_ends = 0
     
    110114        error = 0
    111115
    112 class Tag_Callouts(BitStreamSet):
     116class Tag_Callouts():
    113117        ElemName_starts = 0
    114118        ElemName_ends = 0
     
    117121        AttVal_starts = 0
    118122        AttVal_ends = 0
     123        AttVal_spans = 0
    119124        EmptyTag_marks = 0
    120125        EndTag_marks = 0
     126        LAngleFollow = 0
    121127        error = 0
    122 
    123 def main(u8data):
    124         # Classify bytes for UTF-8 processing, whitespace and control
    125         # processing and XML lexical analysis.
    126 
    127         temp1 = (bit[0] | bit[1]);
    128         temp2 = (bit[2] &~ bit[3]);
     128       
     129class Xml_char():
     130        error = 0       
     131       
     132class Basis_bits():     
     133        bit_0 = 0
     134        bit_1 = 0
     135        bit_2 = 0
     136        bit_3 = 0
     137        bit_4 = 0
     138        bit_5 = 0
     139        bit_6 = 0
     140        bit_7 = 0
     141       
     142class Masks():
     143        EOF_mask = 0   
     144
     145class Post_process():
     146        misc_mask = 0
     147        non_ascii_name_starts = 0
     148        non_ascii_names = 0
     149        tag_marks = 0
     150        name_follows = 0
     151        att_refs = 0
     152        error_mask = 0
     153
     154class Xml_names():
     155        namespace_error = 0
     156
     157def Classify_bytes_Validate_utf8(basis_bits, lex, u8): 
     158        temp1 = (basis_bits.bit_0 | basis_bits.bit_1);
     159        temp2 = (basis_bits.bit_2 &~ basis_bits.bit_3);
    129160        temp3 = (temp2 &~ temp1);
    130         temp4 = (bit[5] &~ bit[4]);
    131         temp5 = (bit[6] &~ bit[7]);
     161        temp4 = (basis_bits.bit_5 &~ basis_bits.bit_4);
     162        temp5 = (basis_bits.bit_6 &~ basis_bits.bit_7);
    132163        temp6 = (temp4 & temp5);
    133164        lex.RefStart = (temp3 & temp6);
    134         temp7 = (bit[2] & bit[3]);
     165        temp7 = (basis_bits.bit_2 & basis_bits.bit_3);
    135166        temp8 = (temp7 &~ temp1);
    136         temp9 = (bit[4] &~ bit[5]);
    137         temp10 = (bit[6] & bit[7]);
     167        temp9 = (basis_bits.bit_4 &~ basis_bits.bit_5);
     168        temp10 = (basis_bits.bit_6 & basis_bits.bit_7);
    138169        temp11 = (temp9 & temp10);
    139170        lex.Semicolon = (temp8 & temp11);
    140         temp12 = (bit[4] & bit[5]);
    141         temp13 = (bit[6] | bit[7]);
     171        temp12 = (basis_bits.bit_4 & basis_bits.bit_5);
     172        temp13 = (basis_bits.bit_6 | basis_bits.bit_7);
    142173        temp14 = (temp12 &~ temp13);
    143174        lex.LAngle = (temp8 & temp14);
    144175        temp15 = (temp12 & temp5);
    145176        lex.RAngle = (temp8 & temp15);
    146         temp16 = (bit[1] &~ bit[0]);
    147         temp17 = (bit[3] &~ bit[2]);
     177        temp16 = (basis_bits.bit_1 &~ basis_bits.bit_0);
     178        temp17 = (basis_bits.bit_3 &~ basis_bits.bit_2);
    148179        temp18 = (temp16 & temp17);
    149180        lex.LBracket = (temp18 & temp11);
    150         temp19 = (bit[7] &~ bit[6]);
     181        temp19 = (basis_bits.bit_7 &~ basis_bits.bit_6);
    151182        temp20 = (temp12 & temp19);
    152183        lex.RBracket = (temp18 & temp20);
    153         temp21 = (bit[4] | bit[5]);
     184        temp21 = (basis_bits.bit_4 | basis_bits.bit_5);
    154185        temp22 = (temp19 &~ temp21);
    155186        lex.Exclam = (temp3 & temp22);
     
    172203        temp30 = (temp18 & temp23);
    173204        temp31 = (temp30 | lex.Colon);
    174         temp32 = (temp16 &~ bit[2]);
    175         temp33 = (bit[5] | temp10);
    176         temp34 = (bit[4] & temp33);
     205        temp32 = (temp16 &~ basis_bits.bit_2);
     206        temp33 = (basis_bits.bit_5 | temp10);
     207        temp34 = (basis_bits.bit_4 & temp33);
    177208        temp35 = (~temp34);
    178209        temp36 = (temp21 | temp13);
    179         temp37 = ((bit[3] & temp35)|(~(bit[3]) & temp36));
     210        temp37 = ((basis_bits.bit_3 & temp35)|(~(basis_bits.bit_3) & temp36));
    180211        temp38 = (temp32 & temp37);
    181212        temp39 = (temp31 | temp38);
    182         temp40 = (temp16 & bit[2]);
     213        temp40 = (temp16 & basis_bits.bit_2);
    183214        temp41 = (temp40 & temp37);
    184215        lex.ASCII_name_start = (temp39 | temp41);
     
    190221        temp47 = (temp46 | temp38);
    191222        lex.ASCII_name_char = (temp47 | temp41);
    192         lex.NameScan = (lex.ASCII_name_char | bit[0]);
    193         temp48 = (temp1 | bit[2]);
     223        lex.NameScan = (lex.ASCII_name_char | basis_bits.bit_0);
     224        temp48 = (temp1 | basis_bits.bit_2);
    194225        lex.x00_x1F = (~temp48);
    195         temp49 = (bit[2] | bit[3]);
     226        temp49 = (basis_bits.bit_2 | basis_bits.bit_3);
    196227        temp50 = (temp1 | temp49);
    197228        lex.CR = (temp20 &~ temp50);
     
    204235        temp54 = (temp53 &~ temp50);
    205236        lex.WS = (temp54 | lex.SP);
    206         temp55 = (bit[5] | bit[6]);
    207         temp56 = (bit[4] & temp55);
     237        temp55 = (basis_bits.bit_5 | basis_bits.bit_6);
     238        temp56 = (basis_bits.bit_4 & temp55);
    208239        lex.Digit = (temp8 &~ temp56);
    209240        temp57 = (temp16 &~ temp49);
    210         temp58 = (temp57 &~ bit[4]);
     241        temp58 = (temp57 &~ basis_bits.bit_4);
    211242        temp59 = (~temp10);
    212         temp60 = ((bit[5] & temp59)|(~(bit[5]) & temp13));
     243        temp60 = ((basis_bits.bit_5 & temp59)|(~(basis_bits.bit_5) & temp13));
    213244        temp61 = (temp58 & temp60);
    214245        temp62 = (lex.Digit | temp61);
    215246        temp63 = (temp16 & temp2);
    216         temp64 = (temp63 &~ bit[4]);
     247        temp64 = (temp63 &~ basis_bits.bit_4);
    217248        temp65 = (temp64 & temp60);
    218249        lex.Hex = (temp62 | temp65);
    219 
    220 
    221         # Check for illegal control characters
    222         xmlchar_error = (lex.x00_x1F &~ lex.WS & EOF_mask)
    223 
    224 
    225         u8.unibyte = (~bit[0]);
     250       
     251        ### Validate_utf8(basis_bits, u8):
     252        u8.unibyte = (~basis_bits.bit_0);
    226253        u8.suffix = 0
    227254        u8.error = 0
    228         FFFE_FFFF = 0
    229         u8anyscope = 0
    230         if bit[0]:
    231                 u8.prefix = (bit[0] & bit[1]);
    232                 u8.prefix2 = (u8.prefix &~ bit[2]);
     255        u8.FFFE_FFFF = 0
     256        u8anyscope = 0 #local
     257        if basis_bits.bit_0:
     258                u8.prefix = (basis_bits.bit_0 & basis_bits.bit_1);
     259                u8.prefix2 = (u8.prefix &~ basis_bits.bit_2);
    233260                u8.prefix3 = (u8.prefix & temp2);
    234261                u8.prefix4 = (u8.prefix & temp7);
    235                 u8.suffix = (bit[0] &~ bit[1]);
     262                u8.suffix = (basis_bits.bit_0 &~ basis_bits.bit_1);
    236263                temp66 = (u8.prefix &~ temp49);
    237                 temp67 = (temp21 | bit[6]);
     264                temp67 = (temp21 | basis_bits.bit_6);
    238265                temp68 = (temp66 &~ temp67);
    239                 temp69 = (bit[5] & temp13);
    240                 temp70 = (bit[4] | temp69);
     266                temp69 = (basis_bits.bit_5 & temp13);
     267                temp70 = (basis_bits.bit_4 | temp69);
    241268                temp71 = (u8.prefix4 & temp70);
    242269                u8.badprefix = (temp68 | temp71);
     
    244271                u8.scope22 = bitutil.Advance(u8.prefix2)
    245272                u8anyscope = u8.scope22
    246                 if u8.prefix3 | u8.prefix4:
     273                if u8.prefix3 | u8.prefix4:
    247274                        xE0 = (u8.prefix3 &~ temp36);
    248275                        xED = (u8.prefix3 & temp20);
     
    250277                        temp72 = (temp4 &~ temp13);
    251278                        xF4 = (u8.prefix4 & temp72);
    252                         u8.xA0_xBF = (u8.suffix & bit[2]);
    253                         u8.x80_x9F = (u8.suffix &~ bit[2]);
     279                        u8.xA0_xBF = (u8.suffix & basis_bits.bit_2);
     280                        u8.x80_x9F = (u8.suffix &~ basis_bits.bit_2);
    254281                        u8.x90_xBF = (u8.suffix & temp49);
    255282                        u8.x80_x8F = (u8.suffix &~ temp49);
     
    281308                        EF_BF_pending = bitutil.Advance(u8.xEF_scope & u8.xBF)
    282309
    283                         FFFE_FFFF = (EF_BF_pending & (u8.xBE | u8.xBF))
     310                        u8.FFFE_FFFF = (EF_BF_pending & (u8.xBE | u8.xBF))
    284311        u8mismatch = u8anyscope ^ u8.suffix
    285312        u8.error |= u8mismatch
    286 
    287 
    288         xmlchar_error |= FFFE_FFFF
    289 
     313       
     314def Add_multiliterals(scope1, lex):
    290315        scope1.LAngle = bitutil.Advance(lex.LAngle)
    291316        scope1.Hyphen = bitutil.Advance(lex.Hyphen)
    292317        scope1.QMark = bitutil.Advance(lex.QMark)
    293 
    294318        lex.CD_end = 0
    295319        if lex.RBracket:
    296320                scope1.RBracket = bitutil.Advance(lex.RBracket)
    297321                lex.CD_end = bitutil.Advance(scope1.RBracket & lex.RBracket) & lex.RAngle
    298 
    299         # Compute XML multiliterals such as <?, </, --, ]]>.
    300         #lex = add_multiliterals(lex)
    301322        lex.PI_start = scope1.LAngle & lex.QMark
    302323        lex.CtCD_start = scope1.LAngle & lex.Exclam
     
    308329        lex.PI_end = scope1.QMark & lex.RAngle
    309330
     331def Parse_CtCDPI(ctCDPI_Callouts, lex, masks, post_process):
    310332        ctCDPI_Callouts.Ct_starts = 0
    311333        ctCDPI_Callouts.Ct_ends = 0
     
    318340        ctCDPI_Callouts.CtCDPI_mask = 0
    319341        ctCDPI_Callouts.error = 0
    320 
    321342        CtCDPI_starts = 0
    322343
    323344        # Scanning streams
    324         CtCDPI_scan = ~lex.CtCDPI_start & EOF_mask
    325         Ct_end_scan = ~lex.DoubleHyphen & EOF_mask
    326         CD_end_scan = ~lex.CD_end & EOF_mask
    327         PI_end_scan = ~lex.PI_end & EOF_mask
     345        CtCDPI_scan = ~lex.CtCDPI_start & masks.EOF_mask
     346        Ct_end_scan = ~lex.DoubleHyphen & masks.EOF_mask
     347        CD_end_scan = ~lex.CD_end & masks.EOF_mask
     348        PI_end_scan = ~lex.PI_end & masks.EOF_mask
    328349        #
    329350        # Initiate the scan
    330351        CtCDPI_Cursor = 1
    331352        CtCDPI_Cursor = bitutil.ScanThru(CtCDPI_Cursor, CtCDPI_scan)
    332         CtCDPI_Cursor &= EOF_mask
     353        CtCDPI_Cursor &= masks.EOF_mask
    333354        while CtCDPI_Cursor:
    334355                CtCDPI_starts |= CtCDPI_Cursor
     
    357378                CtCDPI_Cursor = PI_Cursor | CD_Cursor | Ct_Cursor
    358379                CtCDPI_Cursor = bitutil.ScanThru(CtCDPI_Cursor, CtCDPI_scan)
    359                 CtCDPI_Cursor &= EOF_mask
    360        
    361                 ctCDPI_Callouts.CtCDPI_mask = bitutil.Advance(ctCDPI_Callouts.CD_ends | ctCDPI_Callouts.Ct_ends | ctCDPI_Callouts.PI_ends) - CtCDPI_starts
     380                CtCDPI_Cursor &= masks.EOF_mask
     381       
     382                ctCDPI_Callouts.CtCDPI_mask = bitutil.Advance(ctCDPI_Callouts.CD_ends | ctCDPI_Callouts.Ct_ends | ctCDPI_Callouts.PI_ends) - CtCDPI_starts             
    362383                #ctCDPI_Callouts.error = Ct_ends & ~lex.RAngle | Ct_starts & ~ lex.Hyphen
    363384                ctCDPI_Callouts.error = Ct_errors | ctCDPI_Callouts.Ct_ends & ~lex.RAngle
     
    365386                ctCDPI_Callouts.error |= ctCDPI_Callouts.PI_name_starts & ctCDPI_Callouts.PI_name_ends
    366387                # If any of the Comment, CDATA or PI markups are unterminated, it is an error.
    367                 ctCDPI_Callouts.error |= ctCDPI_Callouts.CtCDPI_mask &~ EOF_mask
     388                ctCDPI_Callouts.error |= ctCDPI_Callouts.CtCDPI_mask &~ masks.EOF_mask
    368389               
    369         Misc_mask = (lex.WS | lex.LAngle | (bitutil.Advance(ctCDPI_Callouts.Ct_ends | ctCDPI_Callouts.PI_ends) - (ctCDPI_Callouts.Ct_starts | ctCDPI_Callouts.PI_starts)) | CtCDPI_starts) & EOF_mask
    370 
     390        post_process.misc_mask = (lex.WS | lex.LAngle | (bitutil.Advance(ctCDPI_Callouts.Ct_ends | ctCDPI_Callouts.PI_ends) - (ctCDPI_Callouts.Ct_starts | ctCDPI_Callouts.PI_starts)) | CtCDPI_starts) & masks.EOF_mask
     391
     392def Parse_tags(lex, scope1, ctCDPI_Callouts, tag_Callouts, masks):
    371393       
    372394        # Delimiters for scans.
    373         DQuoteScan = ~(lex.DQuote | lex.LAngle) & EOF_mask
    374         SQuoteScan = ~(lex.SQuote | lex.LAngle) & EOF_mask
     395        DQuoteScan = ~(lex.DQuote | lex.LAngle) & masks.EOF_mask
     396        SQuoteScan = ~(lex.SQuote | lex.LAngle) & masks.EOF_mask
    375397        AttListDelim = lex.Slash | lex.RAngle
    376398       
    377399        # Start the parallel parsing by inspecting the character
    378400        # after the opening "<" of a tag.
    379         LAngleFollow = scope1.LAngle &~ ctCDPI_Callouts.CtCDPI_mask
    380         tag_Callouts.ElemName_starts = LAngleFollow & ~lex.Slash
    381         tag_Callouts.EndTag_marks = LAngleFollow & lex.Slash
     401        tag_Callouts.LAngleFollow = scope1.LAngle &~ ctCDPI_Callouts.CtCDPI_mask
     402        tag_Callouts.ElemName_starts = tag_Callouts.LAngleFollow & ~lex.Slash
     403        tag_Callouts.EndTag_marks = tag_Callouts.LAngleFollow & lex.Slash
    382404       
    383405        # Start Tag/Empty Element Tag Parsing
     
    410432        # start tag, plus one.
    411433        while AttNameStart:
    412                 tag_Callouts.AttName_starts |= AttNameStart
     434                tag_Callouts.AttName_starts |= AttNameStart
    413435                AttNameFollow = bitutil.ScanThru(AttNameStart, lex.NameScan)
    414436                tag_Callouts.AttName_ends |= AttNameFollow
     
    447469        ParseError |= EndTagEnds & ~lex.RAngle
    448470        tag_Callouts.error = ParseError
    449 
    450 
    451 #
    452 #  Reference parsing
    453 #
    454 #def parse_refs(lex, scope1, ctCDPI_Callouts, ref_Callouts):
     471               
     472        # Attribute value spans
     473        tag_Callouts.AttVal_spans = tag_Callouts.AttVal_ends - tag_Callouts.AttVal_starts
     474                       
     475def Parse_refs(lex, scope1, ctCDPI_Callouts, ref_Callouts):
    455476        ref_Callouts.GenRef_starts = 0
    456477        ref_Callouts.GenRef_ends = 0
     
    479500                ref_error2 = ref_Callouts.HexRef_starts &~ lex.Hex
    480501                # Semicolon terminator required (also covers unterminated at EOF).
    481                 ref_ends = ref_Callouts.GenRef_ends | ref_Callouts.DecRef_ends | ref_Callouts.HexRef_ends
     502                ref_ends = ref_Callouts.GenRef_ends | ref_Callouts.DecRef_ends | ref_Callouts.HexRef_ends
    482503                ref_error3 = ref_ends &~ lex.Semicolon
    483504                ref_Callouts.error = ref_error1 | ref_error2 | ref_error3
    484505
    485 
    486         AttVals = tag_Callouts.AttVal_ends - tag_Callouts.AttVal_starts
    487         # Ensure that no occurrence of ]]> occurs outside of markup.
    488         CD_end_error = lex.CD_end & ~(ctCDPI_Callouts.CtCDPI_mask | AttVals)
    489        
    490 
    491        
    492         #name and name start checking streams
     506def Validate_xml_names(ctCDPI_Callouts, ref_Callouts, tag_Callouts, lex, u8, xml_names, post_process):
    493507        PI_names = ctCDPI_Callouts.PI_name_ends - ctCDPI_Callouts.PI_name_starts
    494         GenRefs = ref_Callouts.GenRef_ends - ref_Callouts.GenRef_starts
     508        GenRefs = ref_Callouts.GenRef_ends - ref_Callouts.GenRef_starts
    495509        ElemNames = tag_Callouts.ElemName_ends - tag_Callouts.ElemName_starts
    496510        AttNames = tag_Callouts.AttName_ends - tag_Callouts.AttName_starts
     
    499513        name_stream = qname_stream | ncname_stream
    500514        name_start = name_stream &~ bitutil.Advance(name_stream)
    501         name_start_check = name_start & ~lex.ASCII_name_start
    502         name_check = (name_stream &~ name_start) & ~lex.ASCII_name_char & ~u8.suffix
    503        
    504         #namespace validation
    505515        name_cursor = name_stream & ~bitutil.Advance(name_stream)
    506516        void_prefix_err = name_cursor & lex.Colon
     
    510520        colon2_err = bitutil.ScanThru(local_part_start, lex.NameScan &~ lex.Colon) & lex.Colon
    511521        ncname_err = ncname_stream & lex.Colon
    512         namespace_error = void_prefix_err | local_part_err | colon2_err | ncname_err
    513        
    514        
     522        xml_names.namespace_error = void_prefix_err | local_part_err | colon2_err | ncname_err
     523                       
     524        post_process.non_ascii_name_starts = name_start &~lex.ASCII_name_start
     525        post_process.non_ascii_names = (name_stream &~ name_start) & ~lex.ASCII_name_char & ~u8.suffix
     526
     527#def main(basis_bits, lex, u8, scope1, ctCDPI_Callouts, masks, post_process, tag_Callouts, ref_Callouts, xml_names):
     528def Main(basis_bits, lex, u8, xml_char, scope1, ctCDPI_Callouts, ref_Callouts, tag_Callouts, masks, xml_names, post_process):   
     529       
     530        # Classify bytes for UTF-8 processing, whitespace and control
     531        # processing and XML lexical analysis.
     532        # Classify_bytes(basis_bits, lex)
     533
     534        # Validate UTF-8 multibyte sequences and determine the UTF-8 scope streams
     535        # Validate_utf8(basis_bits, u8)
     536                               
     537        Classify_bytes_Validate_utf8(basis_bits, lex, u8)
     538
     539        # Check for illegal control characters
     540        xml_char.error = (lex.x00_x1F &~ lex.WS & masks.EOF_mask)
     541        xml_char.error |= u8.FFFE_FFFF
     542
     543        ### Compute XML multiliterals such as <?, </, --, ]]>
     544        Add_multiliterals(scope1, lex)
     545   
     546    # Parse all comments, CDATA sections and processing instructions.
     547        Parse_CtCDPI(ctCDPI_Callouts, lex, masks, post_process)
     548               
     549        # All remaining '<' must be tag start characters; parse tags.
     550        Parse_tags(lex, scope1, ctCDPI_Callouts, tag_Callouts, masks)
     551
     552        # All remaining '&' must be reference start characters; parse them.
     553        Parse_refs(lex, scope1, ctCDPI_Callouts, ref_Callouts)
     554       
     555        # Ensure that no occurrence of ]]> occurs outside of markup.
     556        CD_end_error = lex.CD_end & ~(ctCDPI_Callouts.CtCDPI_mask | tag_Callouts.AttVal_spans)
     557       
     558        # Validate XML namespaces and generate bit streams to post-validate non-ASCII-range XML names
     559        Validate_xml_names(ctCDPI_Callouts, ref_Callouts, tag_Callouts, lex, u8, xml_names, post_process)
     560               
    515561        # Consolidate and check for errors
    516         error_mask = u8.error | xmlchar_error | ctCDPI_Callouts.error | tag_Callouts.error | CD_end_error | ref_Callouts.error | namespace_error
    517 
    518         tag_marks = tag_Callouts.EmptyTag_marks | LAngleFollow | tag_Callouts.AttName_starts
    519         NameFollows = tag_Callouts.ElemName_ends | tag_Callouts.AttName_ends
    520         AttRef = AttVals & scope1.RefStart
    521        
     562        post_process.error_mask = u8.error | xml_char.error | ctCDPI_Callouts.error | tag_Callouts.error | CD_end_error | ref_Callouts.error | xml_names.namespace_error
     563
     564        post_process.tag_marks = tag_Callouts.EmptyTag_marks | tag_Callouts.LAngleFollow | tag_Callouts.AttName_starts
     565        post_process.name_follows = tag_Callouts.ElemName_ends | tag_Callouts.AttName_ends
     566        post_process.att_refs = tag_Callouts.AttVal_spans & scope1.RefStart             
Note: See TracChangeset for help on using the changeset viewer.