source: trunk/src/bitlex.h @ 4

Last change on this file since 4 was 4, checked in by cameron, 11 years ago

Initial import of parabix-0.36

File size: 3.0 KB
Line 
1/*  bitlex.h - parabix lexical analysis
2    Copyright (c) 2007, Robert D. Cameron.
3    Licensed to the public under the Open Software License 3.0.
4    Licensed to International Characters, Inc., under the Academic
5    Free License 3.0.
6
7*/
8#ifndef BITLEX_H
9#define BITLEX_H
10
11#include "../lib/sse_simd.h"
12
13/* The BytePack and the BitBlock are the two fundamental
14   types used by the parabix program for data held in
15   SIMD registers, representing, respectively, the byte-oriented
16   and bit-oriented views of character data.*/
17
18typedef SIMD_type BytePack;
19typedef SIMD_type BitBlock;
20const int PACKSIZE = sizeof(SIMD_type);
21const int BLOCKSIZE = sizeof(SIMD_type) * 8;
22
23/* Define the size of buffer used for lexical analysis/parsing. */
24const int BUFFER_BLOCKS = 16;
25const int BUFFER_PACKS = BUFFER_BLOCKS * (BLOCKSIZE/PACKSIZE);
26
27
/* A lexical item is a particular character, character class or
   character sequence that is significant for XML parsing.
   Enumerators take consecutive values starting at 0.  */
enum lexical_item {
  LAngle,
  RAngle,
  RBracket,
  Hyphen,
  QMark,
  DQuote,
  SQuote,
  NonWS,
  NameFollow  /* must stay last: LexicalItemCount is derived from it */
};

/* Total number of lexical items, computed from the final enumerator. */
const int LexicalItemCount = 1 + NameFollow;
36
/* The principal role of the lexical analyzer is to prepare
   a set of parallel data streams for the parsing engine:
   (a) an XML byte stream and (b) a set of parallel lexical
   item streams.
   The XML byte stream consists of one byte for each character
   code unit in the input stream (typically the input bytes
   themselves for most 8-bit character sets, or a pseudo-ASCII
   byte for 16-bit or 32-bit sets such as UTF-16 or UTF-32).
   The lexical item streams are bit streams that mark with a
   1 bit the positions of occurrences of each of the lexical
   items.

   The XML byte stream is sized to hold an additional number of bytes
   of lookahead data beyond the nominal buffer size.  The lexical
   item streams each comprise the number of BitBlocks required
   to hold one bit for each character code unit position in the
   buffer, plus an additional sentinel BitBlock at the end.
*/
56const int LOOKAHEAD_POSITIONS = 8;
57const int LOOKAHEAD_PACKS = (LOOKAHEAD_POSITIONS+PACKSIZE-1)/PACKSIZE;
58const int SENTINEL_BLOCKS = 1;
59
60struct ParallelStreamSet {
61        BytePack x8data[BUFFER_PACKS+LOOKAHEAD_PACKS];
62        BitBlock item_stream[LexicalItemCount][BUFFER_BLOCKS+SENTINEL_BLOCKS];
63};
64
65#include "xmlbuffer.h"
66class Lexer {
67public:
68   Lexer(XML_Buffer *b, ParallelStreamSet *p);
69
70   /* Advance buffer and return number of code units available. */
71   virtual int AdvanceBuffer(int new_code_unit_position) = 0;
72   /* Detect a Byte Order Mark at the given position.  Return the
73      number of code unit positions (0 => no BOM, 1 => UTF-16/32 families,
74      3 => UTF-8. */
75   virtual int BOM_size(int rel_pos) = 0;
76
77protected:
78   XML_Buffer *xml_buf;
79   ParallelStreamSet *parsing_engine_data;
80   void ComputeLexicalItemStreams(int newblocks);
81};
82
/* Optional profiling support, compiled in only when BUFFER_PROFILING
   is defined.  Pulls in the BOM profiler source and declares three
   BOM_Table pointers (presumably one timer per processing stage, going
   by their names).
   NOTE(review): these are non-extern definitions in a header, and the
   profiler is included as a .c file; including this header from more
   than one translation unit with BUFFER_PROFILING defined may produce
   duplicate-definition link errors - confirm how it is used.  */
#if defined(BUFFER_PROFILING)
#include "../Profiling/BOM_Profiler.c"
BOM_Table *bitstream_timer, *lextranspose_timer, *scanner_timer;
#endif
89
90
91#endif
Note: See TracBrowser for help on using the repository browser.