source: trunk/src/bitlex.c @ 15

Last change on this file since 15 was 15, checked in by cameron, 11 years ago

Bytespace scanning in XML declarations; various updates

File size: 5.9 KB
Line 
1/*  bitlex - Parabix lexical analysis common routines.
2    Copyright (c) 2007, Robert D. Cameron.
3    Licensed to the public under the Open Software License 3.0.
4    Licensed to International Characters, Inc., under the Academic
5    Free License 3.0.
6
7    These are common routines for all ASCII-family character sets.
8    They are used by the character-set specific Lexer objects
9    found in the charsets directory.
10*/
11#include "bitlex.h"
12
13#include "transpose.h"
14
15#include "stdlib.h"
16
17Lexer::Lexer(XML_Buffer *b, ParallelStreamSet *p) {
18  xml_buf = b;
19  if (posix_memalign((void **) &bit_group, sizeof(BitBlock), sizeof(BitBlockGroup)) != 0) {
20    printf("Allocation failure for local BitBlockGroup in Lexer\n");
21    exit(-1);
22  }
23  for (int i = 0; i < 8; i++) (*bit_group)[0].bit[i] = simd_const_1(0);
24  parsing_engine_data = p;
25};
26
/* Given the bit[] array of one BitBlock each for the 8 bits of
   an ASCII-family character representation, compute the parallel
   lexical item streams needed for XML parsing.

   Inputs:  bit[0..7] - one bit stream per bit of the code unit.
   Outputs: LexItem[] - the entries Hyphen, QMark, SQuote, DQuote,
                        CD_End_check, MarkupStart, NonWS and NameFollow
                        are written; no other entry is assigned here.

   The byte values quoted in the comments below assume that bit[0] is
   the most significant bit, that simd_andc(a, b) means a AND NOT b,
   and that simd_const_1(1) is an all-ones block.  None of these
   helpers is defined in this file; the reading is adopted because it
   makes every stream name agree with its ASCII code (for example
   LAngle = 0x3C).  Confirm against the SIMD library before relying
   on it.

   WARNING: the following is generated code by charset_compiler.py.
   Do not edit.

*/
static inline void ComputeLexicalItemBlocks(BitBlock bit[], BitBlock LexItem[]) {
  BitBlock temp1 = simd_or(bit[0], bit[1]);
  BitBlock temp2 = simd_andc(bit[2], bit[3]);
  /* temp3: high nibble 0x2 (bit[0..3] = 0010). */
  BitBlock temp3 = simd_andc(temp2, temp1);
  BitBlock temp4 = simd_andc(bit[5], bit[4]);
  BitBlock temp5 = simd_andc(bit[6], bit[7]);
  BitBlock temp6 = simd_and(temp4, temp5);
  /* RefStart: 0x26 '&'. */
  BitBlock RefStart = simd_and(temp3, temp6);
  BitBlock temp7 = simd_and(bit[2], bit[3]);
  /* temp8: high nibble 0x3 (bit[0..3] = 0011). */
  BitBlock temp8 = simd_andc(temp7, temp1);
  BitBlock temp9 = simd_andc(bit[4], bit[5]);
  BitBlock temp10 = simd_and(bit[6], bit[7]);
  BitBlock temp11 = simd_and(temp9, temp10);
  /* Semicolon: 0x3B ';'. */
  BitBlock Semicolon = simd_and(temp8, temp11);
  BitBlock temp12 = simd_and(bit[4], bit[5]);
  BitBlock temp13 = simd_or(bit[6], bit[7]);
  BitBlock temp14 = simd_andc(temp12, temp13);
  /* LAngle: 0x3C '<'. */
  BitBlock LAngle = simd_and(temp8, temp14);
  BitBlock temp15 = simd_and(temp12, temp5);
  /* RAngle: 0x3E '>'. */
  BitBlock RAngle = simd_and(temp8, temp15);
  BitBlock temp16 = simd_andc(bit[1], bit[0]);
  BitBlock temp17 = simd_andc(bit[3], bit[2]);
  /* temp18: high nibble 0x5 (bit[0..3] = 0101). */
  BitBlock temp18 = simd_and(temp16, temp17);
  BitBlock temp19 = simd_andc(bit[7], bit[6]);
  BitBlock temp20 = simd_and(temp12, temp19);
  /* RBracket: 0x5D ']'. */
  BitBlock RBracket = simd_and(temp18, temp20);
  /* Hyphen: 0x2D '-'. */
  LexItem[Hyphen] = simd_and(temp3, temp20);
  BitBlock temp21 = simd_and(temp12, temp10);
  /* QMark: 0x3F '?'. */
  LexItem[QMark] = simd_and(temp8, temp21);
  /* Equals: 0x3D '='. */
  BitBlock Equals = simd_and(temp8, temp20);
  BitBlock temp22 = simd_and(temp4, temp10);
  /* SQuote: 0x27, the apostrophe (widened with AttScan further down). */
  LexItem[SQuote] = simd_and(temp3, temp22);
  BitBlock temp23 = simd_or(bit[4], bit[5]);
  BitBlock temp24 = simd_andc(temp5, temp23);
  /* DQuote: 0x22, the double quote (widened with AttScan further down). */
  LexItem[DQuote] = simd_and(temp3, temp24);
  BitBlock temp25 = simd_or(temp1, bit[2]);
  /* Control: bit[0..2] all clear, i.e. 0x00-0x1F.  The value is computed
     but not read again anywhere in this function. */
  BitBlock Control = simd_andc(simd_const_1(1), temp25);
  /* WhiteSpace is the union of four code points:
     temp27 = 0x20 (space), temp30 = 0x0D (CR),
     temp33 = 0x09 (tab),   temp36 = 0x0A (LF). */
  BitBlock temp26 = simd_or(temp23, temp13);
  BitBlock temp27 = simd_andc(temp3, temp26);
  BitBlock temp28 = simd_or(bit[2], bit[3]);
  BitBlock temp29 = simd_or(temp1, temp28);
  BitBlock temp30 = simd_andc(temp20, temp29);
  BitBlock temp31 = simd_or(temp27, temp30);
  BitBlock temp32 = simd_and(temp9, temp19);
  BitBlock temp33 = simd_andc(temp32, temp29);
  BitBlock temp34 = simd_or(temp31, temp33);
  BitBlock temp35 = simd_and(temp9, temp5);
  BitBlock temp36 = simd_andc(temp35, temp29);
  BitBlock WhiteSpace = simd_or(temp34, temp36);
  /* Slash: 0x2F '/'. */
  BitBlock Slash = simd_and(temp3, temp21);
  /* Both quote streams are widened to also mark '<' and '&' positions. */
  BitBlock AttScan = simd_or(LAngle, RefStart);
  LexItem[SQuote] = simd_or(LexItem[SQuote], AttScan);
  LexItem[DQuote] = simd_or(LexItem[DQuote], AttScan);

  /* Mark potential occurrences of ']]>'  These are all actual
     occurrences of ]]> as well as occurrences of ]] or ] at
     the block end. Shifting the RBracket and RAngle streams in
     negated forms ensures that a potential CD_End is not ruled
     out at the block boundary. */
  LexItem[CD_End_check] = simd_andc(RBracket, 
                                simd_or(simd_sbli_64(simd_not(RBracket), 1),
                                        simd_sbli_64(simd_not(RAngle), 2)));
  /* Exactly one of the two MarkupStart assignments below is compiled:
     with OMIT_CD_End_check_In_Markup_Scan defined the stream marks only
     '<' and '&'; otherwise it also includes the CD_End_check positions. */
#ifdef OMIT_CD_End_check_In_Markup_Scan
LexItem[MarkupStart] = simd_or(LAngle, RefStart);
#endif
#ifndef OMIT_CD_End_check_In_Markup_Scan
  LexItem[MarkupStart] = simd_or(simd_or(LAngle, RefStart), LexItem[CD_End_check]);
#endif
  /* NonWS: every position that is not one of the four whitespace codes. */
  LexItem[NonWS] = simd_not(WhiteSpace);
  /* NameFollow: whitespace, ';', '/', '>', '=' or '?'. */
  LexItem[NameFollow] = simd_or(simd_or(simd_or(WhiteSpace, Semicolon), 
                                        simd_or(Slash, RAngle)),
                                simd_or(Equals, LexItem[QMark]));
}
108
109
/* A temporary structure for internal use in ComputeLexicalItemStreams.
   It holds one BitBlock per lexical item for a single block of input;
   its LexicalItems array is what ComputeLexicalItemStreams passes to
   ComputeLexicalItemBlocks as the LexItem[] output argument. */
typedef struct {
  BitBlock LexicalItems[LexicalItemCount];
} LexicalItemBlock;
114
115
116void Lexer::ComputeLexicalItemStreams(int new_blocks) {
117  LexicalItemBlock lx_blk[BUFFER_BLOCKS];
118  for (int i = 0; i < new_blocks; i++) {
119    s2p_bytepack(&(parsing_engine_data->x8data[i * 8]), (*bit_group)[i+1].bit);
120    ComputeLexicalItemBlocks((*bit_group)[i+1].bit, lx_blk[i].LexicalItems);
121  }
122#ifdef BUFFER_PROFILING
123  end_BOM_interval(bitstream_timer);
124  start_BOM_interval(lextranspose_timer);
125#endif
126  for (int j = MarkupStart; j < LexicalItemCount; j++) {
127    for (int i = 0; i < BUFFER_BLOCKS; i++) {
128      parsing_engine_data->item_stream[j][i] = lx_blk[i].LexicalItems[j];
129    }
130  }
131#ifdef BUFFER_PROFILING
132  end_BOM_interval(lextranspose_timer);
133  start_BOM_interval(scanner_timer);
134#endif
135}
136
137
138void Lexer::EstablishSentinels(int code_units) {
139  if (code_units < BUFFER_BLOCKS * BLOCKSIZE + LOOKAHEAD_POSITIONS) {
140    ((unsigned char *) parsing_engine_data->x8data)[code_units] = '\0';
141    if (code_units < BUFFER_BLOCKS * BLOCKSIZE) {
142      BitBlock bitstream_sentinel = 
143        sisd_sfl(simd_const_1(1), sisd_from_int(code_units % BLOCKSIZE));
144      int lastblk = code_units/BLOCKSIZE;
145      for (int j = MarkupStart; j < LexicalItemCount; j++) {
146        parsing_engine_data->item_stream[j][lastblk] =
147          simd_or(parsing_engine_data->item_stream[j][lastblk], bitstream_sentinel);
148      }
149    }
150  }
151}
Note: See TracBrowser for help on using the repository browser.