Changeset 36 for trunk/src/bitlex.h


Ignore:
Timestamp:
Feb 10, 2008, 6:12:06 AM (11 years ago)
Author:
cameron
Message:

Charset Architecture: Lexer Factory/bit streams

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/src/bitlex.h

    r15 r36  
    1 /*  bitlex.h - parabix lexical analysis
    2     Copyright (c) 2007, Robert D. Cameron.
     1/*  bitlex.h - parabix lexical analysis (bit streams)
     2    Copyright (c) 2007, 2008, Robert D. Cameron.
    33    Licensed to the public under the Open Software License 3.0.
    44    Licensed to International Characters, Inc., under the Academic
     
    99#define BITLEX_H
    1010
    11 #include "../lib/sse_simd.h"
    12 
    13 #if BYTE_ORDER == BIG_ENDIAN
    14 #define sisd_sfl(blk, n) sisd_srl(blk, n)
    15 #define sisd_sbl(blk, n) sisd_sll(blk, n)
    16 #define sisd_sfli(blk, n) sisd_srli(blk, n)
    17 #define sisd_sbli(blk, n) sisd_slli(blk, n)
    18 #define simd_sbli_64(blk, n) simd_slli_64(blk, n)
    19 #endif
    20 #if BYTE_ORDER == LITTLE_ENDIAN
    21 #define sisd_sfl(blk, n) sisd_sll(blk, n)
    22 #define sisd_sbl(blk, n) sisd_srl(blk, n)
    23 #define sisd_sfli(blk, n) sisd_slli(blk, n)
    24 #define sisd_sbli(blk, n) sisd_srli(blk, n)
    25 #define simd_sbli_64(blk, n) simd_srli_64(blk, n)
    26 #endif
    27 
    28 
    29 /* The BytePack and the BitBlock are the two fundamental
    30    types used by the parabix program for data held in
    31    SIMD registers, representing, respectively, the byte-oriented
    32    and bit-oriented views of character data.*/
    33 
    34 typedef SIMD_type BytePack;
    35 typedef SIMD_type BitBlock;
    36 const int PACKSIZE = sizeof(SIMD_type);
    37 const int BLOCKSIZE = sizeof(SIMD_type) * 8;
    38 
    39 /* Define the size of buffer used for lexical analysis/parsing. */
    40 const int BUFFER_BLOCKS = 16;
    41 const int BUFFER_PACKS = BUFFER_BLOCKS * (BLOCKSIZE/PACKSIZE);
    42 
     11#include "xmlparam.h"
     12#include "xmlbuffer.h"
    4313
    4414/* Lexical items are particular characters, character classes
     
    4616
    4717enum lexical_item {
    48   minLexicalItem = 0,
    49   MarkupStart = minLexicalItem, CD_End_check, Hyphen, QMark,
    50   DQuote, SQuote, NonWS, NameFollow,
    51   maxLexicalItem = NameFollow};
     18        minLexicalItem = 0,
     19        NonWS = minLexicalItem,
     20        MarkupStart, CD_End_check, Hyphen, QMark,
     21        Quote, NameFollow,
     22        maxLexicalItem = NameFollow};
    5223
    5324const int LexicalItemCount = maxLexicalItem + 1;
     25
     26
     27
    5428
    5529/* The principal role of the lexical analyzer is to prepare
     
    6539   items.
    6640
    67    The XML byte stream is sized to hold an additional number of bytes
    68    of lookahead data beyond the nominal buffer size.  The lexical
    69    item streams each comprise the number of BitBlocks required
    70    to hold one bit for each character code unit position in the
    71    buffer, plus an additional sentinel BitBlock at the end.
    72    
    73 */
    74 const int LOOKAHEAD_POSITIONS = 12;
    75 const int LOOKAHEAD_PACKS = (LOOKAHEAD_POSITIONS+PACKSIZE-1)/PACKSIZE;
    76 const int SENTINEL_BLOCKS = 1;
     41   A BitBlockBasis is a set of 8 parallel bit blocks
     42   that represent a block of 8-bit code units in bit-parallel
     43   form. */
    7744
    78 struct ParallelStreamSet {
    79         BytePack x8data[BUFFER_PACKS+LOOKAHEAD_PACKS];
    80         BitBlock item_stream[LexicalItemCount][BUFFER_BLOCKS+SENTINEL_BLOCKS];
     45struct BitBlockBasis {
     46        BitBlock bit[8];
    8147};
    8248
    83 struct BitBlockGroup {
    84   BitBlock bit[8];
     49/* A BitStreamBuffer is a bit stream of BUFFER_BLOCKS consecutive
     50   blocks, followed by a sentinel block to terminate bit scans. */
     51
     52const int SENTINEL_BLOCKS = 1;
     53typedef BitBlock BitStreamBuffer[BUFFER_BLOCKS+SENTINEL_BLOCKS];
     54
     55struct LexicalStreamSet {
     56        BitStreamBuffer item_stream[LexicalItemCount];
    8557};
    8658
    87 #include "xmlbuffer.h"
    88 class Lexer {
     59
     60class Lexer_Interface {
    8961public:
    90    Lexer(XML_Buffer *b, ParallelStreamSet *p);
    91 
    92    /* Advance buffer and return number of code units available. */
    93    virtual int AdvanceBuffer(int new_code_unit_position) = 0;
    94    /* Detect a Byte Order Mark at the given position.  Return the
    95       number of code unit positions (0 => no BOM, 1 => UTF-16/32 families,
    96       3 => UTF-8). */
    97    virtual int BOM_size(int rel_pos) = 0;
     62        Lexer_Interface(XML_Buffer_Interface *b, LexicalStreamSet *l);
     63        void AdvanceBuffer(int& base_pos, int& rel_pos, int& limit_pos);
    9864
    9965protected:
    100    XML_Buffer *xml_buf;
    101    BitBlockGroup *(bit_group[BUFFER_BLOCKS+1]);
    102    ParallelStreamSet *parsing_engine_data;
    103    void ComputeLexicalItemStreams(int newblocks);
    104    void EstablishSentinels(int code_units);
     66        XML_Buffer_Interface *xml_buf;
     67        void TransposeToBitStreams();
     68        virtual void Do_XML_10_WS_Control() = 0;
     69        virtual void Do_MarkupStreams() = 0;
     70        virtual void Do_XML_11_WS_Control() = 0;
     71        virtual void Do_CharsetValidation() = 0;
     72        int lexer_base_pos;
     73        BitBlockBasis * x8basis;
     74        LexicalStreamSet * parsing_engine_data;
     75        int code_units;
     76        int data_blocks;
    10577};
     78
     79template <CodeUnit_Base C>
     80class Lexer : public Lexer_Interface {
     81public:
     82        static Lexer_Interface * LexerFactory(XML_Buffer_Interface *b, LexicalStreamSet *l);
     83
     84protected:
     85        Lexer(XML_Buffer_Interface *b, LexicalStreamSet *l);
     86        void Do_XML_10_WS_Control();
     87        void Do_MarkupStreams();
     88        virtual void Do_XML_11_WS_Control() = 0;
     89        virtual void Do_CharsetValidation() = 0;
     90};
     91
     92class UTF_8_Lexer : public Lexer<ASCII> {
     93public:
     94        UTF_8_Lexer(XML_Buffer_Interface *b, LexicalStreamSet *l);
     95        void Do_XML_11_WS_Control();
     96        void Do_CharsetValidation();
     97};
     98
     99class ASCII_7_Lexer : public Lexer<ASCII> {
     100public:
     101        ASCII_7_Lexer(XML_Buffer_Interface *b, LexicalStreamSet *l);
     102        void Do_XML_11_WS_Control();
     103        void Do_CharsetValidation();
     104};
     105
     106class EASCII_8_Lexer : public Lexer<ASCII> {
     107public:
     108        EASCII_8_Lexer(XML_Buffer_Interface *b, LexicalStreamSet *l);
     109        void Do_XML_11_WS_Control();
     110        void Do_CharsetValidation();
     111};
     112
     113/* 16-bit ASCII-based character sets: UTF-16 and UCS-2 families.
     114   Whitespace and control processing is common to these families,
     115   but character set validation differs for codepoints D800-DFFF,
     116   used for surrogate pairs in UTF-16 and prohibited in UCS-2. */
     117class U16_Lexer : public Lexer<ASCII> {
     118public:
     119        U16_Lexer(XML_Buffer_Interface *b, LexicalStreamSet *l);
     120        void Do_XML_11_WS_Control();
     121        virtual void Do_CharsetValidation() = 0;
     122};
     123
     124class UTF_16_Lexer : public U16_Lexer {
     125public:
     126        UTF_16_Lexer(XML_Buffer_Interface *b, LexicalStreamSet *l);
     127        void Do_CharsetValidation();
     128};
     129
     130class UCS_2_Lexer : public U16_Lexer {
     131public:
     132        UCS_2_Lexer(XML_Buffer_Interface *b, LexicalStreamSet *l);
     133        void Do_CharsetValidation();
     134};
     135
     136class UTF_32_Lexer : public Lexer<ASCII> {
     137public:
     138        UTF_32_Lexer(XML_Buffer_Interface *b, LexicalStreamSet *l);
     139        void Do_XML_11_WS_Control();
     140        void Do_CharsetValidation();
     141};
     142
     143class EBCDIC_Lexer: public Lexer<EBCDIC> {
     144public:
     145        EBCDIC_Lexer(XML_Buffer_Interface *b, LexicalStreamSet *l);
     146        void Do_XML_11_WS_Control();
     147        void Do_CharsetValidation();
     148};
     149
    106150
    107151#ifdef BUFFER_PROFILING
Note: See TracChangeset for help on using the changeset viewer.