source: trunk/src/bitlex.h @ 14

Last change on this file since 14 was 14, checked in by cameron, 11 years ago

Optimized ]]> testing; end-tag scan

File size: 3.5 KB
Line 
1/*  bitlex.h - parabix lexical analysis
2    Copyright (c) 2007, Robert D. Cameron.
3    Licensed to the public under the Open Software License 3.0.
4    Licensed to International Characters, Inc., under the Academic
5    Free License 3.0.
6
7*/
8#ifndef BITLEX_H
9#define BITLEX_H
10
11#include "../lib/sse_simd.h"
12
/* Endian-neutral shift aliases.

   Each alias expands to one of the underlying sisd_/simd_ left or right
   shifts (presumably supplied by ../lib/sse_simd.h -- confirm), chosen by
   the byte order of the target:

     big-endian:     sisd_sfl/sfli -> sisd_srl/srli
                     sisd_sbl/sbli -> sisd_sll/slli
                     simd_sbli_64  -> simd_slli_64
     little-endian:  sisd_sfl/sfli -> sisd_sll/slli
                     sisd_sbl/sbli -> sisd_srl/srli
                     simd_sbli_64  -> simd_srli_64

   A single #if/#elif chain guarded by defined() is used so that each alias
   is defined at most once.  Undefined identifiers evaluate to 0 inside #if,
   so two independent tests "BYTE_ORDER == BIG_ENDIAN" and
   "BYTE_ORDER == LITTLE_ENDIAN" would both succeed when none of the three
   macros is defined, defining every alias twice with conflicting bodies.
   When the byte order is not reported, the little-endian set is used (the
   definition that took effect last in that situation).  If BYTE_ORDER is
   defined but matches neither constant, no alias is defined. */
#if defined(BYTE_ORDER) && defined(BIG_ENDIAN) && (BYTE_ORDER == BIG_ENDIAN)
#define sisd_sfl(blk, n) sisd_srl(blk, n)
#define sisd_sbl(blk, n) sisd_sll(blk, n)
#define sisd_sfli(blk, n) sisd_srli(blk, n)
#define sisd_sbli(blk, n) sisd_slli(blk, n)
#define simd_sbli_64(blk, n) simd_slli_64(blk, n)
#elif !defined(BYTE_ORDER) || !defined(LITTLE_ENDIAN) || (BYTE_ORDER == LITTLE_ENDIAN)
#define sisd_sfl(blk, n) sisd_sll(blk, n)
#define sisd_sbl(blk, n) sisd_srl(blk, n)
#define sisd_sfli(blk, n) sisd_slli(blk, n)
#define sisd_sbli(blk, n) sisd_srli(blk, n)
#define simd_sbli_64(blk, n) simd_srli_64(blk, n)
#endif
27
28
29/* The BytePack and the BitBlock are the two fundamental
30   types used by the parabix program for data held in
31   SIMD registers, representing, respectively, the byte-oriented
32   and bit-oriented views of character data.*/
33
34typedef SIMD_type BytePack;
35typedef SIMD_type BitBlock;
36const int PACKSIZE = sizeof(SIMD_type);
37const int BLOCKSIZE = sizeof(SIMD_type) * 8;
38
39/* Define the size of buffer used for lexical analysis/parsing. */
40const int BUFFER_BLOCKS = 16;
41const int BUFFER_PACKS = BUFFER_BLOCKS * (BLOCKSIZE/PACKSIZE);
42
43
/* Lexical items: the particular characters, character classes or
   character sequences that are significant for XML parsing.
   Enumerators are numbered consecutively starting at 0. */
enum lexical_item {
  MarkupStart = 0,
  CD_End_check,
  Hyphen,
  QMark,
  DQuote,
  SQuote,
  NonWS,
  NameFollow   /* last item; LexicalItemCount is derived from it */
};

/* Total number of lexical items: one more than the last enumerator. */
static const int LexicalItemCount = 1 + NameFollow;
52
53/* The principal role of the lexical analyzer is to prepare
54   a set of parallel data streams for the parsing engine:
55   (a) an XML byte stream and (b) a set of parallel lexical
56   item streams.
57   The XML byte stream consists of one byte for each character
58   code unit in the input stream (typically the input bytes
59   themselves for most 8-bit character sets, or a pseudo-ASCII
60   byte for 16-bit or 32-bit sets such as UTF-16, or UTF-32).
61   The lexical item streams are bit streams that mark with a
62   1 bit the positions of occurrences of each of the lexical
63   items.
64
65   The XML byte stream is sized to hold an additional number of bytes
66   of lookahead data beyond the nominal buffer size.  The lexical
67   item streams each comprise the number of BitBlocks required
68   to hold one bit for each character code unit position in the
69   buffer, plus an additional sentinel BitBlock at the end.
70   
71*/
72const int LOOKAHEAD_POSITIONS = 8;
73const int LOOKAHEAD_PACKS = (LOOKAHEAD_POSITIONS+PACKSIZE-1)/PACKSIZE;
74const int SENTINEL_BLOCKS = 1;
75
76struct ParallelStreamSet {
77        BytePack x8data[BUFFER_PACKS+LOOKAHEAD_PACKS];
78        BitBlock item_stream[LexicalItemCount][BUFFER_BLOCKS+SENTINEL_BLOCKS];
79};
80
81#include "xmlbuffer.h"
82class Lexer {
83public:
84   Lexer(XML_Buffer *b, ParallelStreamSet *p);
85
86   /* Advance buffer and return number of code units available. */
87   virtual int AdvanceBuffer(int new_code_unit_position) = 0;
88   /* Detect a Byte Order Mark at the given position.  Return the
89      number of code unit positions (0 => no BOM, 1 => UTF-16/32 families,
90      3 => UTF-8. */
91   virtual int BOM_size(int rel_pos) = 0;
92
93protected:
94   XML_Buffer *xml_buf;
95   ParallelStreamSet *parsing_engine_data;
96   void ComputeLexicalItemStreams(int newblocks);
97   void EstablishSentinels(int code_units);
98};
99
#ifdef BUFFER_PROFILING
/* Profiling support, compiled in only when BUFFER_PROFILING is defined.
   NOTE(review): the profiler source file is included directly and the
   timer pointers below are defined (not merely declared) in this header,
   so including the header from more than one translation unit would
   produce duplicate definitions -- confirm it is only included once. */
#include "../Profiling/BOM_Profiler.c"
BOM_Table *bitstream_timer;
BOM_Table *lextranspose_timer;
BOM_Table *scanner_timer;
#endif
106
107
108#endif
Note: See TracBrowser for help on using the repository browser.