Changeset 73


Ignore:
Timestamp:
Mar 27, 2008, 9:28:33 AM (11 years ago)
Author:
cameron
Message:

Refactored Parsing Engine

Location:
trunk/src
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • trunk/src/engine.c

    r66 r73  
    77
    88#include "engine.h"
    9 #include "xmlbuffer.h"
     9#include "byteplex.h"
     10#include "xmldecl.h"
    1011#include "bytelex.h"
    1112#include "bitlex.h"
     
    1819
    1920Parser_Interface * Parser_Interface::ParserFactory(char * filename) {
    20         XML_Buffer_Interface * b = XML_Buffer_Interface::BufferFactory(filename);
    21         b->DoByteplex();
    22         b->PreparePseudoASCII_Stream();
    23 #ifdef DEBUG
    24         printf("PseudoASCII stream complete.\n");
    25 #endif
    26         b->ReadXMLInfo();
    27 #ifdef DEBUG
    28         printf("XML Info read; content start position = %i.\n", b->ContentStartUnit());
    29 #endif
    30         if (b->code_unit_base == ASCII) {
    31                 return new ParsingEngine<ASCII>(b);
    32         }
    33         else /* if (b->code_unit_base == EBCDIC) */ {
    34                 return new ParsingEngine<EBCDIC>(b);
    35         }
     21       
     22        int chars_read;
     23        unsigned char signature[4];
     24        FILE * infile;
     25        infile = fopen(filename, "rb");
     26        if (!infile) {
     27                fprintf(stderr, "Error: cannot open %s for input.\n", filename);
     28                exit(-1);
     29        }
     30        fread(signature,1,4,infile);
     31        Model_Info * m = new Model_Info;
     32        m->AnalyzeSignature(signature);
     33        Byteplex * b = Byteplex::ByteplexFactory(m, infile);
     34        b->InitializeBuffer(signature,4);
     35
     36        if (m->code_unit_base == ASCII) {
     37                return new ParsingEngine<ASCII>(m, b);
     38        }
     39        else /* if (m->code_unit_base == EBCDIC) */ {
     40                return new ParsingEngine<EBCDIC>(m, b);
     41        }       
    3642}
    3743
    3844Parser_Interface::~Parser_Interface() {
    39   xml_buf->~XML_Buffer_Interface();
    4045}
    4146
    4247
    4348bool Parser_Interface::has_ByteOrderMark() {
    44         return xml_buf->BOM_units > 0;
     49        return model_info->BOM_units > 0;
    4550}
    4651
    4752XML_version Parser_Interface::get_version() {
    48         return xml_buf->version;
     53        return model_info->version;
    4954}
    5055
    5156XML_standalone Parser_Interface::standalone_status() {
    52         return xml_buf->standalone;
     57        return model_info->standalone;
    5358}
    5459
    5560bool Parser_Interface::has_EncodingDecl() {
    56         return xml_buf->has_encoding_decl;
    57 }
    58 
    59 int Parser_Interface::get_Encoding_pos() {
    60         return xml_buf->encoding_start_pos;
    61 }
    62 
    63 int Parser_Interface::get_Encoding_lgth() {
    64         return xml_buf->encoding_lgth;
     61        return model_info->has_encoding_decl;
     62}
     63
     64unsigned char * Parser_Interface::get_Encoding() {
     65        return model_info->encoding;
    6566}
    6667
    6768unsigned char * Parser_Interface::GetCodeUnitPtr(int pos) {
    68         return &((unsigned char *) (xml_buf->ByteBuffer))[pos * (int) xml_buf->code_unit_size];
    69 }
    70 
    71 
    72 
    73 
    74 template <CodeUnit_Base C>
    75 ParsingEngine<C>::ParsingEngine(XML_Buffer_Interface * b) : Parser_Interface () {
    76 #ifdef BUFFER_PROFILING
    77         bitstream_timer = init_BOM_timer(BUFFER_BLOCKS * BLOCKSIZE);
    78         lextranspose_timer = init_BOM_timer(BUFFER_BLOCKS * BLOCKSIZE);
    79         scanner_timer = init_BOM_timer(BUFFER_BLOCKS * BLOCKSIZE);
    80 #endif
     69        int rel_pos = pos - buffer_base_pos;
     70        return &((unsigned char *) (byteplex->src_buffer))[rel_pos * (int) model_info->code_unit_size];
     71}
     72
     73
     74
     75
     76template <CodeUnit_Base C>
     77ParsingEngine<C>::ParsingEngine(Model_Info * m, Byteplex * b) : Parser_Interface () {
     78
     79        model_info = m;
     80        byteplex = b;
     81
     82        byteplex->DoByteplex();
     83        byteplex->PreparePseudoASCII_Stream();
     84        decl_parser = new XML_Decl_Parser<C>(byteplex);
     85        int content_start = decl_parser->ReadXMLInfo(model_info);
     86
     87        bitplex = new Bitplex;
    8188        buf = (LexicalStreamSet *) simd_new(sizeof(LexicalStreamSet)/PACKSIZE);
    8289
     
    9198                buf->item_stream[j][BUFFER_BLOCKS] = sentinel_value;
    9299        }
    93 #ifdef DEBUG
    94         printf("Bitspace sentinels established.\n");
    95 #endif
    96         xml_buf = b;
     100
    97101        buffer_base_pos = 0;
    98         buffer_rel_pos = b->ContentStartUnit();
    99         lexer = Lexer<C>::LexerFactory(b, buf);
    100 #ifdef DEBUG
    101         printf("Lexer created.\n");
    102 #endif
    103         lexer->AdvanceBuffer(buffer_base_pos, buffer_rel_pos, buffer_limit_pos);
    104         x8data = &xml_buf->x8data[buffer_base_pos/PACKSIZE];
    105 #ifdef DEBUG
    106         printf("Initial lexical buffer formed.\n");
    107 #endif
    108 }
    109 
     102        buffer_rel_pos = content_start;
     103        buffer_limit_pos = min(BUFFER_SIZE, byteplex->units_in_buffer);
     104        int blocks_in_buffer = (buffer_limit_pos + BLOCKSIZE - 1)/BLOCKSIZE;
     105        x8data = byteplex->x8data;
     106        lexer = Lexer<C>::LexerFactory(m, buf);
     107        bitplex->TransposeToBitStreams(byteplex->x8data, blocks_in_buffer);
     108        lexer->AnalyzeBuffer(bitplex->x8basis, buffer_limit_pos);
     109}
    110110
    111111template <CodeUnit_Base C>
    112112ParsingEngine<C>::~ParsingEngine() {
     113  model_info->~Model_Info();
     114  byteplex->~Byteplex();
     115  decl_parser->~XML_Decl_Parser<C>();
     116  bitplex->~Bitplex();
    113117  simd_delete((SIMD_type *) buf);
    114118  lexer->~Lexer_Interface();
     
    116120
    117121template <CodeUnit_Base C>
     122inline void ParsingEngine<C>::AdvanceBuffers(int preserve_pos){
     123        int advance_amt = min(preserve_pos, text_or_markup_start) - buffer_base_pos;
     124        advance_amt &= -PACKSIZE; // maintain alignment
     125        byteplex->AdvanceInputBuffer(advance_amt);
     126        buffer_base_pos += advance_amt;
     127        buffer_rel_pos -= advance_amt;
     128        buffer_limit_pos = min(BUFFER_SIZE, byteplex->units_in_buffer);
     129        int blocks_in_buffer = (buffer_limit_pos + BLOCKSIZE - 1)/BLOCKSIZE;
     130        byteplex->DoByteplex();
     131        byteplex->PreparePseudoASCII_Stream();
     132        bitplex->TransposeToBitStreams(byteplex->x8data, blocks_in_buffer);
     133        lexer->AnalyzeBuffer(bitplex->x8basis, buffer_limit_pos);
     134}
     135
     136template <CodeUnit_Base C>
    118137inline unsigned char * ParsingEngine<C>::cur() const {
    119138  return &((unsigned char *) x8data)[buffer_rel_pos];
     
    135154inline bool ParsingEngine<C>::at_EOF() const {
    136155  return (buffer_rel_pos >= buffer_limit_pos) &&
    137          (buffer_limit_pos < BUFFER_BLOCKS * BLOCKSIZE + LOOKAHEAD_POSITIONS);
     156         (buffer_limit_pos < BUFFER_SIZE);
    138157}
    139158
    140159template <CodeUnit_Base C>
    141160inline void ParsingEngine<C>::Advance(int n) {
    142   buffer_rel_pos += n;
     161        int preserve_pos;
     162        buffer_rel_pos += n;
    143163#ifndef OMIT_BITBUFFER_LIMIT_TEST_IN_ADVANCE
    144   if (buffer_rel_pos >= buffer_limit_pos) {
    145     FinalizeBuffer_action();
    146 #ifdef BUFFER_PROFILING
    147     end_BOM_interval(scanner_timer);
    148 #endif
    149     lexer->AdvanceBuffer(buffer_base_pos, buffer_rel_pos, buffer_limit_pos);
    150     x8data = &xml_buf->x8data[buffer_base_pos/PACKSIZE];
     164  if (buffer_rel_pos >= BUFFER_SIZE) {
     165        FinalizeBuffer_action(preserve_pos);
     166        AdvanceBuffers(preserve_pos);
    151167  }
    152168#endif
     
    157173template <CodeUnit_Base C>
    158174inline void ParsingEngine<C>::ScanTo(int item) {
     175        int preserve_pos;
    159176  buffer_rel_pos = bitstream_scan(buf->item_stream[item],
    160177                                      buffer_rel_pos);
    161178  while (buffer_rel_pos >= BUFFER_SIZE) {
    162     FinalizeBuffer_action();
    163 #ifdef BUFFER_PROFILING
    164     end_BOM_interval(scanner_timer);
    165 #endif
    166     lexer->AdvanceBuffer(buffer_base_pos, buffer_rel_pos, buffer_limit_pos);
    167 #ifdef DEBUG
    168 printf("lexer->AdvanceBuffer complete; base_pos = %i, rel_pos = %i, limit_pos = %i\n",
    169        buffer_base_pos, buffer_rel_pos, buffer_limit_pos);
    170 #endif
    171     x8data = &xml_buf->x8data[buffer_base_pos/PACKSIZE];
    172     buffer_rel_pos = bitstream_scan(buf->item_stream[item], buffer_rel_pos);
     179        FinalizeBuffer_action(preserve_pos);
     180        AdvanceBuffers(preserve_pos);
     181        buffer_rel_pos = bitstream_scan(buf->item_stream[item], buffer_rel_pos);
    173182  }
    174183}
     
    189198    while (buffer_rel_pos >= BUFFER_BLOCKS * BLOCKSIZE) {
    190199      buffer_rel_pos = BUFFER_BLOCKS * BLOCKSIZE;
    191       FinalizeBuffer_action();
    192 #ifdef BUFFER_PROFILING
    193       end_BOM_interval(scanner_timer);
    194 #endif
    195       lexer->AdvanceBuffer(buffer_base_pos, buffer_rel_pos, buffer_limit_pos);
    196       x8data = &xml_buf->x8data[buffer_base_pos/PACKSIZE];
     200      FinalizeBuffer_action(preserve_pos);
     201          AdvanceBuffers(preserve_pos);
    197202      buffer_rel_pos = bitstream_scan(buf->item_stream[item], buffer_rel_pos);
    198203    }
     
    250255template <CodeUnit_Base C>
    251256inline void ParsingEngine<C>::Parse_EndTag() {
    252 
    253 #ifndef OMIT_ADVANCE_PRIOR_TO_EXCLUSIVE_SCAN
    254257        Advance(2); /* Skip "</". */
    255 #endif
    256258        ScanTo(NameFollow);
    257259        if (AtChar<C,'>'>(cur())) {
     
    272274template <CodeUnit_Base C>
    273275inline void ParsingEngine<C>::Parse_CDATA() {
    274 
    275276        Advance(8); /* Skip "<![CDATA". */
    276277        if (!AtChar<C,'['>(cur())) {
     
    290291template <CodeUnit_Base C>
    291292inline void ParsingEngine<C>::Parse_Reference() {
    292 
    293         /* Advance(1);  // skip "&" */
     293        Advance(1);  // skip "&"
    294294        ScanTo(NameFollow);  /* Name delimiter */
    295295        if (!AtChar<C,';'>(cur())) {
     
    304304template <CodeUnit_Base C>
    305305inline void ParsingEngine<C>::Parse_PI (){
    306 
    307306        Advance(2); /* Skip "<?". */
    308307        int target_start = AbsPos();
     
    327326template <CodeUnit_Base C>
    328327inline void ParsingEngine<C>::Parse_StartTag (){
    329 
    330328        int att_name_start;
    331329        int att_val_start;
    332330        int att_name_end, att_val_end;
    333331        unsigned char quoteCh;
     332        Advance(1);
    334333        ScanTo(NameFollow);  /* Name delimiter: WS, "/" or ">" */
    335334        ElementName_action(text_or_markup_start+1, AbsPos());
     
    443442template <CodeUnit_Base C>
    444443inline void ParsingEngine<C>::ParseContent() {
    445 
    446         text_or_markup_start = AbsPos();
    447444        DocumentStart_action();
    448445        do {
     446                text_or_markup_start = AbsPos();
    449447                ScanTo(MarkupStart); /* '<', '&', or ']' for ']]>' test */
    450448/*              if (AtChar<C,'<'>(cur())) {
     
    489487                        continue;
    490488                }
    491                 text_or_markup_start = AbsPos();
    492489        } while (1);
    493490        DocumentEnd_action();   
    494 #ifdef BUFFER_PROFILING
    495         printf("Bit stream computation.\n");
    496         dump_BOM_table(bitstream_timer);
    497         printf("Lexical stream transposition.\n");
    498         dump_BOM_table(lextranspose_timer);
    499         printf("Scanning.\n");
    500         dump_BOM_table(scanner_timer);
    501 #endif
    502 }
    503 
     491}
     492
  • trunk/src/engine.h

    r66 r73  
    11/*  engine.h - parabix parsing engine
    2     Copyright (c) 2007, Robert D. Cameron
     2    Copyright (c) 2007, 2008 Robert D. Cameron
    33    Licensed to the public under the Open Software License 3.0.
    44    Licensed to International Characters, Inc., under the Academic
     
    99#define ENGINE_H
    1010
    11 #include "xmlparam.h"
    12 #include "xmlbuffer.h"
     11#include "xmlmodel.h"
     12#include "xmldecl.h"
     13#include "byteplex.h"
    1314#include "bitlex.h"
    1415
    15 
     16#define min(x,y) ((x) <(y) ?(x) :(y) )
    1617/* A ParsingEngine is the principal class for parsing XML
    1718data.  */
     
    1920class Parser_Interface {
    2021public:
    21     ~Parser_Interface();
     22        ~Parser_Interface();
    2223        static Parser_Interface * ParserFactory(char * filename);
    2324        virtual void ParseContent() = 0;
     
    2728        XML_standalone standalone_status();
    2829        bool has_EncodingDecl();
    29         int get_Encoding_pos();
    30         int get_Encoding_lgth();
     30        unsigned char * get_Encoding();
    3131protected:
    3232        /* Co-classes */
     33        Model_Info * model_info;
     34        Byteplex * byteplex;   
     35        Bitplex * bitplex;
    3336        Lexer_Interface * lexer;
    34         XML_Buffer_Interface * xml_buf;
    3537        /* Parallel data streams for current buffer full of XML data. */
    3638        BytePack * x8data;
     
    4547class ParsingEngine : public Parser_Interface {
    4648public:
    47         ParsingEngine(XML_Buffer_Interface * b);
     49        ParsingEngine(Model_Info * m, Byteplex * b);
    4850        ~ParsingEngine();
    4951        void ParseContent();
    5052protected:
    5153
     54        XML_Decl_Parser<C> * decl_parser;
     55       
    5256        int text_or_markup_start;
    5357        /* Getters for current point/position information. */
     
    6367        void ScanTo(int lex_item);
    6468
    65         void AdvanceBuffer();
     69        void AdvanceBuffers(int preserve_pos);
    6670        /* Parsing routines. */
    6771
     
    125129                                int URI_start, int URI_end);
    126130       
    127         void FinalizeBuffer_action();
     131        /*Action routine for end of buffer events.
     132         The preserve_pos should be set to indicate the position
     133         of data that must be copied into the new buffer.*/
     134        void FinalizeBuffer_action(int& preserve_pos);
    128135
    129136};
Note: See TracChangeset for help on using the changeset viewer.