source: trunk/src/bitlex.h @ 35

Last change on this file since 35 was 15, checked in by cameron, 11 years ago

Bytespace scanning in XML declarations; various updates

File size: 3.7 KB
Line 
1/*  bitlex.h - parabix lexical analysis
2    Copyright (c) 2007, Robert D. Cameron.
3    Licensed to the public under the Open Software License 3.0.
4    Licensed to International Characters, Inc., under the Academic
5    Free License 3.0.
6
7*/
8#ifndef BITLEX_H
9#define BITLEX_H
10
11#include "../lib/sse_simd.h"
12
/* Byte-order-dependent shift aliases.

   The sfl/sfli and sbl/sbli names are mapped onto the underlying
   right- and left-shift primitives according to BYTE_ORDER:

     BIG_ENDIAN:     sfl -> srl,  sbl -> sll
     LITTLE_ENDIAN:  sfl -> sll,  sbl -> srl

   NOTE(review): each test below is a plain `==` comparison.  If
   BYTE_ORDER, BIG_ENDIAN and LITTLE_ENDIAN were all undefined, the
   preprocessor would evaluate every one of them as 0 and BOTH sections
   would be compiled, the second redefining the first.  Presumably
   sse_simd.h (or a system header it includes) defines these macros --
   confirm. */
#if BYTE_ORDER == BIG_ENDIAN
#define sisd_sfl(blk, n)     sisd_srl(blk, n)
#define sisd_sfli(blk, n)    sisd_srli(blk, n)
#define sisd_sbl(blk, n)     sisd_sll(blk, n)
#define sisd_sbli(blk, n)    sisd_slli(blk, n)
#define simd_sbli_64(blk, n) simd_slli_64(blk, n)
#endif
#if BYTE_ORDER == LITTLE_ENDIAN
#define sisd_sfl(blk, n)     sisd_sll(blk, n)
#define sisd_sfli(blk, n)    sisd_slli(blk, n)
#define sisd_sbl(blk, n)     sisd_srl(blk, n)
#define sisd_sbli(blk, n)    sisd_srli(blk, n)
#define simd_sbli_64(blk, n) simd_srli_64(blk, n)
#endif
27
28
29/* The BytePack and the BitBlock are the two fundamental
30   types used by the parabix program for data held in
31   SIMD registers, representing, respectively, the byte-oriented
32   and bit-oriented views of character data.*/
33
34typedef SIMD_type BytePack;
35typedef SIMD_type BitBlock;
36const int PACKSIZE = sizeof(SIMD_type);
37const int BLOCKSIZE = sizeof(SIMD_type) * 8;
38
39/* Define the size of buffer used for lexical analysis/parsing. */
40const int BUFFER_BLOCKS = 16;
41const int BUFFER_PACKS = BUFFER_BLOCKS * (BLOCKSIZE/PACKSIZE);
42
43
/* Lexical items are particular characters, character classes
   or character sequences significant for XML parsing.

   The enumerators are numbered consecutively starting at
   minLexicalItem (0).  minLexicalItem and maxLexicalItem are aliases
   for the first and last real item, so the whole range can be iterated
   without naming a specific item. */

enum lexical_item {
  minLexicalItem = 0,
  MarkupStart = minLexicalItem,
  CD_End_check,
  Hyphen,
  QMark,
  DQuote,
  SQuote,
  NonWS,
  NameFollow,
  maxLexicalItem = NameFollow
};

/* Total number of lexical items (size of any per-item array). */
const int LexicalItemCount = maxLexicalItem + 1;
54
55/* The principal role of the lexical analyzer is to prepare
56   a set of parallel data streams for the parsing engine:
57   (a) an XML byte stream and (b) a set of parallel lexical
58   item streams.
59   The XML byte stream consists of one byte for each character
60   code unit in the input stream (typically the input bytes
61   themselves for most 8-bit character sets, or a pseudo-ASCII
62   byte for 16-bit or 32-bit sets such as UTF-16, or UTF-32).
63   The lexical item streams are bit streams that mark with a
64   1 bit the positions of occurrences of each of the lexical
65   items.
66
67   The XML byte stream is sized to hold an additional number of bytes
68   of lookahead data beyond the nominal buffer size.  The lexical
69   item streams each comprise the number of BitBlocks required
70   to hold one bit for each character code unit position in the
71   buffer, plus an additional sentinel BitBlock at the end.
72   
73*/
74const int LOOKAHEAD_POSITIONS = 12;
75const int LOOKAHEAD_PACKS = (LOOKAHEAD_POSITIONS+PACKSIZE-1)/PACKSIZE;
76const int SENTINEL_BLOCKS = 1;
77
78struct ParallelStreamSet {
79        BytePack x8data[BUFFER_PACKS+LOOKAHEAD_PACKS];
80        BitBlock item_stream[LexicalItemCount][BUFFER_BLOCKS+SENTINEL_BLOCKS];
81};
82
83struct BitBlockGroup {
84  BitBlock bit[8];
85};
86
87#include "xmlbuffer.h"
88class Lexer {
89public:
90   Lexer(XML_Buffer *b, ParallelStreamSet *p);
91
92   /* Advance buffer and return number of code units available. */
93   virtual int AdvanceBuffer(int new_code_unit_position) = 0;
94   /* Detect a Byte Order Mark at the given position.  Return the
95      number of code unit positions (0 => no BOM, 1 => UTF-16/32 families,
96      3 => UTF-8. */
97   virtual int BOM_size(int rel_pos) = 0;
98
99protected:
100   XML_Buffer *xml_buf;
101   BitBlockGroup *(bit_group[BUFFER_BLOCKS+1]);
102   ParallelStreamSet *parsing_engine_data;
103   void ComputeLexicalItemStreams(int newblocks);
104   void EstablishSentinels(int code_units);
105};
106
/* Optional profiling support, compiled in only when BUFFER_PROFILING
   is defined.

   NOTE(review): the three timer pointers below are definitions, not
   `extern` declarations, and a .c file is included directly.  If this
   header were included from more than one translation unit with
   BUFFER_PROFILING set, the link would presumably fail on duplicate
   symbols -- confirm how the project builds before relying on it. */
#if defined(BUFFER_PROFILING)
#include "../Profiling/BOM_Profiler.c"
BOM_Table *bitstream_timer;
BOM_Table *lextranspose_timer;
BOM_Table *scanner_timer;
#endif
113
114
115#endif
Note: See TracBrowser for help on using the repository browser.