source: trunk/src/bitlex.c @ 14

Last change on this file since 14 was 14, checked in by cameron, 11 years ago

Optimized ]]> testing; end-tag scan

File size: 5.6 KB
Line 
1/*  bitlex - Parabix lexical analysis common routines.
2    Copyright (c) 2007, Robert D. Cameron.
3    Licensed to the public under the Open Software License 3.0.
4    Licensed to International Characters, Inc., under the Academic
5    Free License 3.0.
6
7    These are common routines for all ASCII-family character sets.
8    They are used by the character-set specific Lexer objects
9    found in the charsets directory.
10*/
11#include "bitlex.h"
12
13#include "transpose.h"
14
15
16
17Lexer::Lexer(XML_Buffer *b, ParallelStreamSet *p) {
18  xml_buf = b;
19  parsing_engine_data = p;
20};
21
22/* Given the bit[] array of one BitBlock each for the 8 bits of
23   an ASCII-family character representation, compute the parallel
24   lexical item streams needed for XML parsing.
25
26   WARNING: the following is generated code by charset_compiler.py.
27   Do not edit.
28
29*/
30static inline void ComputeLexicalItemBlocks(BitBlock bit[], BitBlock LexItem[]) {
  /* Purpose: from the eight bit streams bit[0..7] of one block, derive
     one stream per lexical class and store the ones the parser needs
     into LexItem[].

     Parameters:
       bit     - input, indexed 0..7; only read here.
       LexItem - output; the entries Hyphen, QMark, SQuote, DQuote,
                 CD_End_check, MarkupStart, NonWS and NameFollow are
                 written, no other entry is touched.

     Reading the notes below: a pattern such as "0010 0110" lists the
     required values of bit[0]..bit[7] in that order.  Taking bit[0] as
     the high-order bit, each pattern equals the ASCII code shown next
     to it.  The notes assume simd_andc(a, b) computes a AND NOT b and
     that simd_const_1(1) is a block of all ones -- TODO confirm against
     the SIMD library headers.

     The tempN chain is produced by charset_compiler.py, so every
     statement is left exactly as generated. */
31  BitBlock temp1 = simd_or(bit[0], bit[1]);  /* bit 0 or bit 1 set */
32  BitBlock temp2 = simd_andc(bit[2], bit[3]);  /* bits 2-3 = 10 */
33  BitBlock temp3 = simd_andc(temp2, temp1);  /* bits 0-3 = 0010 */
34  BitBlock temp4 = simd_andc(bit[5], bit[4]);  /* bits 4-5 = 01 */
35  BitBlock temp5 = simd_andc(bit[6], bit[7]);  /* bits 6-7 = 10 */
36  BitBlock temp6 = simd_and(temp4, temp5);  /* bits 4-7 = 0110 */
37  BitBlock RefStart = simd_and(temp3, temp6);  /* 0010 0110 = 0x26 '&' */
38  BitBlock temp7 = simd_and(bit[2], bit[3]);  /* bits 2-3 = 11 */
39  BitBlock temp8 = simd_andc(temp7, temp1);  /* bits 0-3 = 0011 */
40  BitBlock temp9 = simd_andc(bit[4], bit[5]);  /* bits 4-5 = 10 */
41  BitBlock temp10 = simd_and(bit[6], bit[7]);  /* bits 6-7 = 11 */
42  BitBlock temp11 = simd_and(temp9, temp10);  /* bits 4-7 = 1011 */
43  BitBlock Semicolon = simd_and(temp8, temp11);  /* 0011 1011 = 0x3B ';' */
44  BitBlock temp12 = simd_and(bit[4], bit[5]);  /* bits 4-5 = 11 */
45  BitBlock temp13 = simd_or(bit[6], bit[7]);  /* bit 6 or bit 7 set */
46  BitBlock temp14 = simd_andc(temp12, temp13);  /* bits 4-7 = 1100 */
47  BitBlock LAngle = simd_and(temp8, temp14);  /* 0011 1100 = 0x3C '<' */
48  BitBlock temp15 = simd_and(temp12, temp5);  /* bits 4-7 = 1110 */
49  BitBlock RAngle = simd_and(temp8, temp15);  /* 0011 1110 = 0x3E '>' */
50  BitBlock temp16 = simd_andc(bit[1], bit[0]);  /* bits 0-1 = 01 */
51  BitBlock temp17 = simd_andc(bit[3], bit[2]);  /* bits 2-3 = 01 */
52  BitBlock temp18 = simd_and(temp16, temp17);  /* bits 0-3 = 0101 */
53  BitBlock temp19 = simd_andc(bit[7], bit[6]);  /* bits 6-7 = 01 */
54  BitBlock temp20 = simd_and(temp12, temp19);  /* bits 4-7 = 1101 */
55  BitBlock RBracket = simd_and(temp18, temp20);  /* 0101 1101 = 0x5D ']' */
56  LexItem[Hyphen] = simd_and(temp3, temp20);  /* 0010 1101 = 0x2D '-' */
57  BitBlock temp21 = simd_and(temp12, temp10);  /* bits 4-7 = 1111 */
58  LexItem[QMark] = simd_and(temp8, temp21);  /* 0011 1111 = 0x3F '?' */
59  BitBlock Equals = simd_and(temp8, temp20);  /* 0011 1101 = 0x3D '=' */
60  BitBlock temp22 = simd_and(temp4, temp10);  /* bits 4-7 = 0111 */
61  LexItem[SQuote] = simd_and(temp3, temp22);  /* 0010 0111 = 0x27 '\'' */
62  BitBlock temp23 = simd_or(bit[4], bit[5]);  /* bit 4 or bit 5 set */
63  BitBlock temp24 = simd_andc(temp5, temp23);  /* bits 4-7 = 0010 */
64  LexItem[DQuote] = simd_and(temp3, temp24);  /* 0010 0010 = 0x22 '"' */
65  BitBlock temp25 = simd_or(temp1, bit[2]);  /* any of bits 0-2 set */
66  BitBlock Control = simd_andc(simd_const_1(1), temp25);  /* bits 0-2 all clear (codes below 0x20); not referenced again in this function */
67  BitBlock temp26 = simd_or(temp23, temp13);  /* any of bits 4-7 set */
68  BitBlock temp27 = simd_andc(temp3, temp26);  /* 0010 0000 = 0x20 space */
69  BitBlock temp28 = simd_or(bit[2], bit[3]);  /* bit 2 or bit 3 set */
70  BitBlock temp29 = simd_or(temp1, temp28);  /* any of bits 0-3 set */
71  BitBlock temp30 = simd_andc(temp20, temp29);  /* 0000 1101 = 0x0D CR */
72  BitBlock temp31 = simd_or(temp27, temp30);  /* space or CR */
73  BitBlock temp32 = simd_and(temp9, temp19);  /* bits 4-7 = 1001 */
74  BitBlock temp33 = simd_andc(temp32, temp29);  /* 0000 1001 = 0x09 TAB */
75  BitBlock temp34 = simd_or(temp31, temp33);  /* space, CR or TAB */
76  BitBlock temp35 = simd_and(temp9, temp5);  /* bits 4-7 = 1010 */
77  BitBlock temp36 = simd_andc(temp35, temp29);  /* 0000 1010 = 0x0A LF */
78  BitBlock WhiteSpace = simd_or(temp34, temp36);  /* space, CR, TAB or LF */
79  BitBlock Slash = simd_and(temp3, temp21);  /* 0010 1111 = 0x2F '/' */
80  BitBlock AttScan = simd_or(LAngle, RefStart);  /* '<' or '&' */
  /* Both quote streams additionally mark every '<' and '&' position. */
81  LexItem[SQuote] = simd_or(LexItem[SQuote], AttScan);
82  LexItem[DQuote] = simd_or(LexItem[DQuote], AttScan);
83
84  /* Mark potential occurrences of ']]>'  These are all actual
85     occurrences of ]]> as well as occurrences of ]] or ] at
86     the block end. Shifting the RBracket and RAngle streams in
87     negated forms ensures that a potential CD_End is not ruled
88     out at the block boundary. */
89  LexItem[CD_End_check] = simd_andc(RBracket, 
90                                simd_or(simd_sbli_64(simd_not(RBracket), 1),
91                                        simd_sbli_64(simd_not(RAngle), 2)));
  /* MarkupStart always marks '<' and '&'.  Unless
     OMIT_CD_End_check_In_Markup_Scan is defined it also includes the
     CD_End_check positions computed just above.  Exactly one of the two
     assignments below survives preprocessing. */
92#ifdef OMIT_CD_End_check_In_Markup_Scan
93LexItem[MarkupStart] = simd_or(LAngle, RefStart);
94#endif
95#ifndef OMIT_CD_End_check_In_Markup_Scan
96  LexItem[MarkupStart] = simd_or(simd_or(LAngle, RefStart), LexItem[CD_End_check]);
97#endif
  /* NonWS is simd_not of the whitespace stream. */
98  LexItem[NonWS] = simd_not(WhiteSpace);
  /* NameFollow is the union of whitespace, ';', '/', '>', '=' and '?'. */
99  LexItem[NameFollow] = simd_or(simd_or(simd_or(WhiteSpace, Semicolon), 
100                                        simd_or(Slash, RAngle)),
101                                simd_or(Equals, LexItem[QMark]));
102}
103
104
105/* A temporary structure for internal use in ComputeLexicalItemStreams. */
106typedef struct {
107  BitBlock bit[8];
108  BitBlock LexicalItems[LexicalItemCount];
109} LexicalItemBlock;
110
111
112void Lexer::ComputeLexicalItemStreams(int new_blocks) {
113  LexicalItemBlock lx_blk[BUFFER_BLOCKS];
114  for (int i = 0; i < new_blocks; i++) {
115    s2p_bytepack(&(parsing_engine_data->x8data[i * 8]), lx_blk[i].bit);
116    ComputeLexicalItemBlocks(lx_blk[i].bit, lx_blk[i].LexicalItems);
117  }
118#ifdef BUFFER_PROFILING
119  end_BOM_interval(bitstream_timer);
120  start_BOM_interval(lextranspose_timer);
121#endif
122  for (int j = MarkupStart; j < LexicalItemCount; j++) {
123    for (int i = 0; i < BUFFER_BLOCKS; i++) {
124      parsing_engine_data->item_stream[j][i] = lx_blk[i].LexicalItems[j];
125    }
126  }
127#ifdef BUFFER_PROFILING
128  end_BOM_interval(lextranspose_timer);
129  start_BOM_interval(scanner_timer);
130#endif
131}
132
133
134void Lexer::EstablishSentinels(int code_units) {
135  if (code_units < BUFFER_BLOCKS * BLOCKSIZE + LOOKAHEAD_POSITIONS) {
136    ((unsigned char *) parsing_engine_data->x8data)[code_units] = '\0';
137    if (code_units < BUFFER_BLOCKS * BLOCKSIZE) {
138      BitBlock bitstream_sentinel = 
139        sisd_sfl(simd_const_1(1), sisd_from_int(code_units % BLOCKSIZE));
140      int lastblk = code_units/BLOCKSIZE;
141      for (int j = MarkupStart; j < LexicalItemCount; j++) {
142        parsing_engine_data->item_stream[j][lastblk] =
143          simd_or(parsing_engine_data->item_stream[j][lastblk], bitstream_sentinel);
144      }
145    }
146  }
147}
Note: See TracBrowser for help on using the repository browser.