1 | /* bitlex - Lexical Item Stream Module. |
---|
2 | Copyright (c) 2007, 2008, Robert D. Cameron. |
---|
3 | Licensed to the public under the Open Software License 3.0. |
---|
4 | Licensed to International Characters, Inc., under the Academic |
---|
5 | Free License 3.0. |
---|
6 | |
---|
7 | */ |
---|
8 | |
---|
9 | #include "bitlex.h" |
---|
10 | #include "../lib/lib_simd.h" |
---|
11 | #include "xml_error.h" |
---|
12 | |
---|
13 | #ifdef CODE_CLOCKING |
---|
14 | /* |
---|
15 | #include "../codeclocker/clocker/code_clocker_session.h" |
---|
16 | Code_Clocker * transpose_clocker; |
---|
17 | Code_Clocker * WS_Control_clocker; |
---|
18 | Code_Clocker * MarkupStreams_clocker; |
---|
19 | Code_Clocker * char_validation_clocker; |
---|
20 | */ |
---|
21 | #endif |
---|
22 | |
---|
23 | |
---|
24 | Lexer_Interface::Lexer_Interface(Entity_Info * e, LexicalStreamSet *l) { |
---|
25 | entity_Info = e; |
---|
26 | parsing_engine_data = l; |
---|
27 | |
---|
28 | x8basis = (BitBlockBasis *) simd_new(BUFFER_SIZE/PACKSIZE); |
---|
29 | validation_stream = (BitBlock *) simd_new(BUFFER_BLOCKS+SENTINEL_BLOCKS); |
---|
30 | #ifdef TEMPLATED_SIMD_LIB |
---|
31 | validation_stream[BUFFER_BLOCKS] = simd<1>::constant<1>(); |
---|
32 | #endif |
---|
33 | #ifndef TEMPLATED_SIMD_LIB |
---|
34 | validation_stream[BUFFER_BLOCKS] = simd_const_1(1); |
---|
35 | #endif |
---|
36 | }; |
---|
37 | |
---|
38 | |
---|
39 | Lexer_Interface::~Lexer_Interface() { |
---|
40 | simd_delete((SIMD_type *) validation_stream); |
---|
41 | }; |
---|
42 | |
---|
43 | |
---|
44 | template <> |
---|
45 | Lexer_Interface * Lexer<ASCII>::LexerFactory(Entity_Info * e, LexicalStreamSet *l) { |
---|
46 | #ifdef CODE_CLOCKING |
---|
47 | /* |
---|
48 | WS_Control_clocker = register_Code_Clocker("WS", "WhiteSpace/Control\n"); |
---|
49 | MarkupStreams_clocker = register_Code_Clocker("bitlex", "Markup streams\n"); |
---|
50 | char_validation_clocker = register_Code_Clocker("charcheck", "Character validation\n"); |
---|
51 | */ |
---|
52 | #endif |
---|
53 | if (!(e->has_encoding_decl)) { |
---|
54 | // Must be UTF-8 or UTF-16; UTF-16 requires a ByteOrderMark. |
---|
55 | if (e->code_unit_size == SingleByte) return new UTF_8_Lexer(e, l); |
---|
56 | else if ((e->code_unit_size == DoubleByte)) |
---|
57 | if (e->BOM_units == 1) return new UTF_16_Lexer(e, l); |
---|
58 | else NoEncodingError("UTF-16 implied but no byte order found."); |
---|
59 | else NoEncodingError("UTF-32 without an encoding declaration.\n"); |
---|
60 | } |
---|
61 | else { |
---|
62 | int lgth = strlen((const char *) e->encoding); |
---|
63 | CodeUnit_ByteOrder order = e->byte_order; |
---|
64 | switch (e->code_unit_size) { |
---|
65 | case SingleByte: |
---|
66 | if ((lgth == 5) && at_UTF_8(e->encoding)) |
---|
67 | return new UTF_8_Lexer(e, l); |
---|
68 | else if ((lgth == 5) && at_ASCII(e->encoding)) |
---|
69 | return new ASCII_7_Lexer(e, l); |
---|
70 | else if ((lgth == 6) && at_Latin1(e->encoding)) |
---|
71 | return new EASCII_8_Lexer(e, l); |
---|
72 | /* Really need a table-based lookup here */ |
---|
73 | else EncodingError("8-bit", e->encoding, lgth); |
---|
74 | case DoubleByte: |
---|
75 | if (e->BOM_units == 1) |
---|
76 | if ((lgth == 6) && at_UTF_16(e->encoding)) |
---|
77 | return new UTF_16_Lexer(e, l); |
---|
78 | else if ((lgth == 5) && at_UCS_2(e->encoding)) |
---|
79 | return new UCS_2_Lexer(e, l); |
---|
80 | else EncodingError("16-bit", e->encoding, lgth); |
---|
81 | else if (order == BigEndian) |
---|
82 | if ((lgth == 8) && at_UTF_16BE(e->encoding)) |
---|
83 | return new UTF_16_Lexer(e, l); |
---|
84 | else if ((lgth == 7) && at_UCS_2BE(e->encoding)) |
---|
85 | return new UCS_2_Lexer(e, l); |
---|
86 | else EncodingError("16BE", e->encoding, lgth); |
---|
87 | else /*if (order == LittleEndian)*/ |
---|
88 | if ((lgth == 8) && at_UTF_16LE(e->encoding)) |
---|
89 | return new UTF_16_Lexer(e, l); |
---|
90 | else if ((lgth == 7) && at_UCS_2LE(e->encoding)) |
---|
91 | return new UCS_2_Lexer(e, l); |
---|
92 | else EncodingError("16LE", e->encoding, lgth); |
---|
93 | case QuadByte: |
---|
94 | if (e->BOM_units == 1) |
---|
95 | if ((lgth == 6) && at_UTF_32(e->encoding)) |
---|
96 | return new UTF_32_Lexer(e, l); |
---|
97 | else if ((lgth == 5) && at_UCS_4(e->encoding)) |
---|
98 | return new UTF_32_Lexer(e, l); |
---|
99 | else EncodingError("32-bit", e->encoding, lgth); |
---|
100 | else if (order == BigEndian) |
---|
101 | if ((lgth == 8) && at_UTF_32BE(e->encoding)) |
---|
102 | return new UTF_32_Lexer(e, l); |
---|
103 | else if ((lgth == 7) && at_UCS_4BE(e->encoding)) |
---|
104 | return new UTF_32_Lexer(e, l); |
---|
105 | else EncodingError("32BE", e->encoding, lgth); |
---|
106 | else if (order == LittleEndian) |
---|
107 | if ((lgth == 8) && at_UTF_32LE(e->encoding)) |
---|
108 | return new UTF_32_Lexer(e, l); |
---|
109 | else if ((lgth == 7) && at_UCS_4LE(e->encoding)) |
---|
110 | return new UTF_32_Lexer(e, l); |
---|
111 | else EncodingError("32LE", e->encoding, lgth); |
---|
112 | else EncodingError("32-bit", e->encoding, lgth); |
---|
113 | } |
---|
114 | } |
---|
115 | } |
---|
116 | |
---|
117 | template <> |
---|
118 | Lexer_Interface * Lexer<EBCDIC>::LexerFactory(Entity_Info * e, LexicalStreamSet *l) { |
---|
119 | if (!(e->has_encoding_decl)) { |
---|
120 | // Must be UTF-8 or UTF-16; UTF-16 requires a ByteOrderMark. |
---|
121 | NoEncodingError("EBCDIC-family inferred, but no encoding declaration present.\n"); |
---|
122 | } |
---|
123 | else { |
---|
124 | int lgth = strlen((const char *) e->encoding); |
---|
125 | /* Really need a table-based lookup here */ |
---|
126 | if ((lgth == 6) && at_EBCDIC(e->encoding)) |
---|
127 | return new EBCDIC_Lexer(e, l); |
---|
128 | else EncodingError("EBCDIC family", e->encoding, lgth); |
---|
129 | } |
---|
130 | } |
---|
131 | |
---|
132 | template <CodeUnit_Base C> |
---|
133 | Lexer<C>::Lexer(Entity_Info * e, LexicalStreamSet *l) : Lexer_Interface::Lexer_Interface(e, l) { |
---|
134 | } |
---|
135 | |
---|
136 | UTF_8_Lexer::UTF_8_Lexer(Entity_Info * e, LexicalStreamSet *l) : Lexer<ASCII>::Lexer(e, l) { |
---|
137 | } |
---|
138 | |
---|
139 | ASCII_7_Lexer::ASCII_7_Lexer(Entity_Info * e, LexicalStreamSet *l) : Lexer<ASCII>::Lexer(e, l) { |
---|
140 | } |
---|
141 | |
---|
142 | EASCII_8_Lexer::EASCII_8_Lexer(Entity_Info * e, LexicalStreamSet *l) : Lexer<ASCII>::Lexer(e, l) { |
---|
143 | } |
---|
144 | |
---|
145 | U16_Lexer::U16_Lexer(Entity_Info * e, LexicalStreamSet *l) : Lexer<ASCII>::Lexer(e, l) { |
---|
146 | } |
---|
147 | |
---|
148 | UTF_16_Lexer::UTF_16_Lexer(Entity_Info * e, LexicalStreamSet *l) : U16_Lexer::U16_Lexer(e, l) { |
---|
149 | } |
---|
150 | |
---|
151 | UCS_2_Lexer::UCS_2_Lexer(Entity_Info * e, LexicalStreamSet *l) : U16_Lexer::U16_Lexer(e, l) { |
---|
152 | } |
---|
153 | |
---|
154 | UTF_32_Lexer::UTF_32_Lexer(Entity_Info * e, LexicalStreamSet *l) : Lexer<ASCII>::Lexer(e, l) { |
---|
155 | } |
---|
156 | |
---|
157 | EBCDIC_Lexer::EBCDIC_Lexer(Entity_Info * e, LexicalStreamSet *l) : Lexer<EBCDIC>::Lexer(e, l) { |
---|
158 | } |
---|
159 | |
---|
160 | template <CodeUnit_Base C> |
---|
161 | static inline void WS_Control_Blocks(BitBlock bit[], BitBlock& WS, BitBlock& Control); |
---|
162 | |
---|
163 | template <> |
---|
164 | inline void WS_Control_Blocks<ASCII>(BitBlock bit[], BitBlock& WS, BitBlock& Control) { |
---|
165 | BitBlock temp1 = simd_or(bit[0], bit[1]); |
---|
166 | BitBlock temp2 = simd_or(temp1, bit[2]); |
---|
167 | #ifdef TEMPLATED_SIMD_LIB |
---|
168 | Control = simd_andc(simd<1>::constant<1>(), temp2); |
---|
169 | #endif |
---|
170 | #ifndef TEMPLATED_SIMD_LIB |
---|
171 | Control = simd_andc(simd_const_1(1), temp2); |
---|
172 | #endif |
---|
173 | BitBlock temp3 = simd_or(bit[2], bit[3]); |
---|
174 | BitBlock temp4 = simd_or(temp1, temp3); |
---|
175 | BitBlock temp5 = simd_and(bit[4], bit[5]); |
---|
176 | BitBlock temp6 = simd_andc(bit[7], bit[6]); |
---|
177 | BitBlock temp7 = simd_and(temp5, temp6); |
---|
178 | BitBlock CR = simd_andc(temp7, temp4); |
---|
179 | BitBlock temp8 = simd_andc(bit[4], bit[5]); |
---|
180 | BitBlock temp9 = simd_andc(bit[6], bit[7]); |
---|
181 | BitBlock temp10 = simd_and(temp8, temp9); |
---|
182 | BitBlock LF = simd_andc(temp10, temp4); |
---|
183 | BitBlock temp11 = simd_and(temp8, temp6); |
---|
184 | BitBlock HT = simd_andc(temp11, temp4); |
---|
185 | BitBlock temp12 = simd_andc(bit[2], bit[3]); |
---|
186 | BitBlock temp13 = simd_andc(temp12, temp1); |
---|
187 | BitBlock temp14 = simd_or(bit[4], bit[5]); |
---|
188 | BitBlock temp15 = simd_or(bit[6], bit[7]); |
---|
189 | BitBlock temp16 = simd_or(temp14, temp15); |
---|
190 | BitBlock SP = simd_andc(temp13, temp16); |
---|
191 | WS = simd_or(simd_or(CR, LF), simd_or(HT, SP)); |
---|
192 | } |
---|
193 | |
---|
194 | template <> |
---|
195 | inline void WS_Control_Blocks<EBCDIC>(BitBlock bit[], BitBlock& WS, BitBlock& Control) { |
---|
196 | BitBlock temp1 = simd_or(bit[0], bit[1]); |
---|
197 | BitBlock temp2 = simd_or(bit[2], bit[3]); |
---|
198 | BitBlock temp3 = simd_or(temp1, temp2); |
---|
199 | BitBlock temp4 = simd_or(bit[4], bit[5]); |
---|
200 | BitBlock temp5 = simd_or(temp3, temp4); |
---|
201 | BitBlock temp6 = simd_and(bit[2], bit[3]); |
---|
202 | BitBlock temp7 = simd_andc(temp6, temp1); |
---|
203 | BitBlock temp8 = simd_andc(bit[5], bit[4]); |
---|
204 | BitBlock temp9 = simd_and(bit[6], bit[7]); |
---|
205 | BitBlock temp10 = simd_and(temp8, temp9); |
---|
206 | BitBlock temp11 = simd_and(temp7, temp10); |
---|
207 | BitBlock temp12 = simd_andc(temp5, temp11); |
---|
208 | BitBlock temp13 = simd_andc(bit[2], bit[3]); |
---|
209 | BitBlock temp14 = simd_andc(temp13, temp1); |
---|
210 | BitBlock temp15 = simd_and(bit[4], bit[5]); |
---|
211 | BitBlock temp16 = simd_and(temp14, temp15); |
---|
212 | BitBlock temp17 = simd_andc(bit[6], bit[7]); |
---|
213 | BitBlock temp18 = simd_andc(temp16, temp17); |
---|
214 | BitBlock temp19 = simd_andc(temp12, temp18); |
---|
215 | BitBlock temp20 = simd_andc(bit[3], bit[2]); |
---|
216 | BitBlock temp21 = simd_andc(temp20, temp1); |
---|
217 | BitBlock temp22 = simd_and(temp8, temp17); |
---|
218 | BitBlock temp23 = simd_and(temp21, temp22); |
---|
219 | BitBlock temp24 = simd_andc(temp19, temp23); |
---|
220 | BitBlock temp25 = simd_or(temp1, bit[2]); |
---|
221 | BitBlock temp26 = simd_or(bit[5], temp9); |
---|
222 | BitBlock temp27 = simd_and(bit[4], temp26); |
---|
223 | #ifdef TEMPLATED_SIMD_LIB |
---|
224 | BitBlock temp28 = simd_andc(simd<1>::constant<1>(), temp4); |
---|
225 | #endif |
---|
226 | #ifndef TEMPLATED_SIMD_LIB |
---|
227 | BitBlock temp28 = simd_andc(simd_const_1(1), temp4); |
---|
228 | #endif |
---|
229 | BitBlock temp29 = simd_if(bit[3], temp27, temp28); |
---|
230 | BitBlock temp30 = simd_andc(temp29, temp25); |
---|
231 | BitBlock temp31 = simd_andc(temp24, temp30); |
---|
232 | BitBlock temp32 = simd_andc(temp15, bit[6]); |
---|
233 | BitBlock temp33 = simd_and(temp7, temp32); |
---|
234 | BitBlock temp34 = simd_andc(temp31, temp33); |
---|
235 | BitBlock temp35 = simd_andc(temp17, temp4); |
---|
236 | BitBlock temp36 = simd_and(temp7, temp35); |
---|
237 | BitBlock temp37 = simd_andc(temp34, temp36); |
---|
238 | BitBlock temp38 = simd_and(temp8, bit[6]); |
---|
239 | BitBlock temp39 = simd_and(temp14, temp38); |
---|
240 | BitBlock temp40 = simd_andc(temp37, temp39); |
---|
241 | BitBlock temp41 = simd_andc(bit[4], bit[5]); |
---|
242 | BitBlock temp42 = simd_andc(temp41, bit[6]); |
---|
243 | BitBlock temp43 = simd_and(temp21, temp42); |
---|
244 | BitBlock temp44 = simd_andc(temp40, temp43); |
---|
245 | BitBlock temp45 = simd_and(temp15, temp9); |
---|
246 | BitBlock temp46 = simd_and(temp7, temp45); |
---|
247 | BitBlock temp47 = simd_andc(temp44, temp46); |
---|
248 | BitBlock temp48 = simd_and(temp21, temp15); |
---|
249 | BitBlock temp49 = simd_andc(temp47, temp48); |
---|
250 | #ifdef TEMPLATED_SIMD_LIB |
---|
251 | Control = simd_andc(simd<1>::constant<1>(), temp49); |
---|
252 | #endif |
---|
253 | #ifndef TEMPLATED_SIMD_LIB |
---|
254 | Control = simd_andc(simd_const_1(1), temp49); |
---|
255 | #endif |
---|
256 | BitBlock temp50 = simd_andc(bit[7], bit[6]); |
---|
257 | BitBlock temp51 = simd_and(temp15, temp50); |
---|
258 | BitBlock CR = simd_andc(temp51, temp3); |
---|
259 | BitBlock temp52 = simd_and(temp8, temp50); |
---|
260 | BitBlock LF = simd_and(temp14, temp52); |
---|
261 | BitBlock HT = simd_andc(temp52, temp3); |
---|
262 | BitBlock temp53 = simd_andc(bit[1], bit[0]); |
---|
263 | BitBlock temp54 = simd_andc(temp53, temp2); |
---|
264 | BitBlock temp55 = simd_or(bit[6], bit[7]); |
---|
265 | BitBlock temp56 = simd_or(temp4, temp55); |
---|
266 | BitBlock SP = simd_andc(temp54, temp56); |
---|
267 | WS = simd_or(simd_or(CR, LF), simd_or(HT, SP)); |
---|
268 | } |
---|
269 | |
---|
270 | |
---|
271 | |
---|
272 | template <CodeUnit_Base C> |
---|
273 | void Lexer<C>::Do_XML_10_WS_Control() { |
---|
274 | BitBlock Control, WS; |
---|
275 | for (int i = 0; i < buffer_blocks; i++) { |
---|
276 | WS_Control_Blocks<C>(x8basis[i].bit, |
---|
277 | WS, |
---|
278 | Control); |
---|
279 | parsing_engine_data->item_stream[NonWS][i] = simd_not(WS); |
---|
280 | validation_stream[i] = simd_andc(Control, WS); |
---|
281 | } |
---|
282 | }; |
---|
283 | |
---|
284 | |
---|
285 | |
---|
286 | template <CodeUnit_Base C> |
---|
287 | static inline void ComputeLexicalItemBlocks(BitBlock bit[], BitBlock LexItem[]); |
---|
288 | |
---|
289 | /* Given the bit[] array of one BitBlock each for the 8 bits of |
---|
290 | an ASCII-family character representation, compute the parallel |
---|
291 | lexical item streams needed for XML parsing. |
---|
292 | |
---|
293 | WARNING: the following is generated code by charset_compiler.py. |
---|
294 | Do not edit. |
---|
295 | |
---|
296 | */ |
---|
297 | |
---|
298 | template <> |
---|
299 | inline void ComputeLexicalItemBlocks<ASCII>(BitBlock bit[], BitBlock LexItem[]) { |
---|
300 | BitBlock temp1 = simd_or(bit[0], bit[1]); |
---|
301 | BitBlock temp2 = simd_and(bit[2], bit[3]); |
---|
302 | BitBlock temp3 = simd_andc(temp2, temp1); |
---|
303 | BitBlock temp4 = simd_and(bit[4], bit[5]); |
---|
304 | BitBlock temp5 = simd_or(bit[6], bit[7]); |
---|
305 | BitBlock temp6 = simd_andc(temp4, temp5); |
---|
306 | BitBlock temp7 = simd_and(temp3, temp6); |
---|
307 | BitBlock temp8 = simd_andc(bit[2], bit[3]); |
---|
308 | BitBlock temp9 = simd_andc(temp8, temp1); |
---|
309 | BitBlock temp10 = simd_andc(bit[5], bit[4]); |
---|
310 | BitBlock temp11 = simd_andc(bit[6], bit[7]); |
---|
311 | BitBlock temp12 = simd_and(temp10, temp11); |
---|
312 | BitBlock temp13 = simd_and(temp9, temp12); |
---|
313 | LexItem[MarkupStart] = simd_or(temp7, temp13); |
---|
314 | BitBlock temp14 = simd_and(temp4, temp11); |
---|
315 | BitBlock RAngle = simd_and(temp3, temp14); |
---|
316 | BitBlock temp15 = simd_andc(bit[1], bit[0]); |
---|
317 | BitBlock temp16 = simd_andc(bit[3], bit[2]); |
---|
318 | BitBlock temp17 = simd_and(temp15, temp16); |
---|
319 | BitBlock temp18 = simd_andc(bit[7], bit[6]); |
---|
320 | BitBlock temp19 = simd_and(temp4, temp18); |
---|
321 | BitBlock RBracket = simd_and(temp17, temp19); |
---|
322 | LexItem[Hyphen] = simd_and(temp9, temp19); |
---|
323 | BitBlock temp20 = simd_and(bit[6], bit[7]); |
---|
324 | BitBlock temp21 = simd_and(temp4, temp20); |
---|
325 | LexItem[QMark] = simd_and(temp3, temp21); |
---|
326 | BitBlock temp22 = simd_or(bit[4], bit[5]); |
---|
327 | BitBlock temp23 = simd_andc(temp11, temp22); |
---|
328 | BitBlock temp24 = simd_and(temp10, temp20); |
---|
329 | BitBlock temp25 = simd_or(temp23, temp24); |
---|
330 | BitBlock temp26 = simd_and(temp9, temp25); |
---|
331 | BitBlock temp27 = simd_or(temp26, temp7); |
---|
332 | LexItem[Quote] = simd_or(temp27, temp13); |
---|
333 | BitBlock temp28 = simd_and(bit[5], temp5); |
---|
334 | BitBlock temp29 = simd_and(bit[4], temp28); |
---|
335 | BitBlock temp30 = simd_andc(temp29, temp21); |
---|
336 | BitBlock temp31 = simd_andc(temp9, temp30); |
---|
337 | BitBlock temp32 = simd_and(temp3, bit[4]); |
---|
338 | BitBlock temp33 = simd_or(bit[5], temp20); |
---|
339 | BitBlock temp34 = simd_and(temp32, temp33); |
---|
340 | BitBlock temp35 = simd_or(temp31, temp34); |
---|
341 | BitBlock temp36 = simd_and(temp17, bit[4]); |
---|
342 | #ifdef TEMPLATED_SIMD_LIB |
---|
343 | BitBlock temp37 = simd_andc(simd<1>::constant<1>(), temp20); |
---|
344 | #endif |
---|
345 | #ifndef TEMPLATED_SIMD_LIB |
---|
346 | BitBlock temp37 = simd_andc(simd_const_1(1), temp20); |
---|
347 | #endif |
---|
348 | BitBlock temp38 = simd_if(bit[5], temp37, temp20); |
---|
349 | BitBlock temp39 = simd_and(temp36, temp38); |
---|
350 | BitBlock temp40 = simd_or(temp35, temp39); |
---|
351 | BitBlock temp41 = simd_and(temp15, temp2); |
---|
352 | BitBlock temp42 = simd_and(temp41, bit[4]); |
---|
353 | BitBlock temp43 = simd_and(temp42, temp38); |
---|
354 | LexItem[NameFollow] = simd_or(temp40, temp43); |
---|
355 | #ifdef DIGIT_AND_HEX_ITEMS |
---|
356 | BitBlock temp44 = simd_or(bit[5], bit[6]); |
---|
357 | BitBlock temp45 = simd_and(bit[4], temp44); |
---|
358 | BitBlock Digit = simd_andc(temp3, temp45); |
---|
359 | BitBlock temp46 = simd_or(bit[2], bit[3]); |
---|
360 | BitBlock temp47 = simd_andc(temp15, temp46); |
---|
361 | BitBlock temp48 = simd_andc(temp47, bit[4]); |
---|
362 | BitBlock temp49 = simd_if(bit[5], temp37, temp5); |
---|
363 | BitBlock temp50 = simd_and(temp48, temp49); |
---|
364 | BitBlock temp51 = simd_or(Digit, temp50); |
---|
365 | BitBlock temp52 = simd_and(temp15, temp8); |
---|
366 | BitBlock temp53 = simd_andc(temp52, bit[4]); |
---|
367 | BitBlock temp54 = simd_and(temp53, temp49); |
---|
368 | BitBlock Hex = simd_or(temp51, temp54); |
---|
369 | LexItem[NonDigit] = simd_not(Digit); |
---|
370 | LexItem[NonHex] = simd_not(Hex); |
---|
371 | #endif |
---|
372 | #ifdef MARKUP_SORTING |
---|
373 | BitBlock temp55 = simd_andc(temp20, temp22); |
---|
374 | BitBlock temp56 = simd_or(temp12, temp55); |
---|
375 | BitBlock temp57 = simd_or(temp56, temp21); |
---|
376 | LexItem[AmpHashSlash] = simd_and(temp9, temp57); |
---|
377 | #endif |
---|
378 | |
---|
379 | /* Mark potential occurrences of ']]>' These are all actual |
---|
380 | occurrences of ]]> as well as occurrences of ]] or ] at |
---|
381 | the block end. Shifting the RBracket and RAngle streams in |
---|
382 | negated forms ensures that a potential CD_End is not ruled |
---|
383 | out at the block boundary. */ |
---|
384 | LexItem[CD_End_check] = simd_andc(RBracket, |
---|
385 | simd_or(sisd_sbli(simd_not(RBracket), 1), |
---|
386 | sisd_sbli(simd_not(RAngle), 2))); |
---|
387 | #ifndef OMIT_CD_End_check_In_Markup_Scan |
---|
388 | LexItem[MarkupStart] = simd_or(LexItem[MarkupStart], LexItem[CD_End_check]); |
---|
389 | #endif |
---|
390 | } |
---|
391 | |
---|
392 | template <> |
---|
393 | inline void ComputeLexicalItemBlocks<EBCDIC>(BitBlock bit[], BitBlock LexItem[]) { |
---|
394 | BitBlock temp1 = simd_andc(bit[1], bit[0]); |
---|
395 | BitBlock temp2 = simd_or(bit[2], bit[3]); |
---|
396 | BitBlock temp3 = simd_andc(temp1, temp2); |
---|
397 | BitBlock temp4 = simd_and(bit[4], bit[5]); |
---|
398 | BitBlock temp5 = simd_or(bit[6], bit[7]); |
---|
399 | BitBlock temp6 = simd_andc(temp4, temp5); |
---|
400 | BitBlock temp7 = simd_and(temp3, temp6); |
---|
401 | BitBlock temp8 = simd_andc(bit[3], bit[2]); |
---|
402 | BitBlock temp9 = simd_and(temp1, temp8); |
---|
403 | BitBlock temp10 = simd_or(bit[4], bit[5]); |
---|
404 | BitBlock temp11 = simd_or(temp10, temp5); |
---|
405 | BitBlock temp12 = simd_andc(temp9, temp11); |
---|
406 | LexItem[MarkupStart] = simd_or(temp7, temp12); |
---|
407 | BitBlock temp13 = simd_andc(bit[2], bit[3]); |
---|
408 | BitBlock temp14 = simd_and(temp1, temp13); |
---|
409 | BitBlock temp15 = simd_andc(bit[6], bit[7]); |
---|
410 | BitBlock temp16 = simd_and(temp4, temp15); |
---|
411 | BitBlock RAngle = simd_and(temp14, temp16); |
---|
412 | BitBlock temp17 = simd_andc(bit[0], bit[1]); |
---|
413 | BitBlock temp18 = simd_and(bit[2], bit[3]); |
---|
414 | BitBlock temp19 = simd_and(temp17, temp18); |
---|
415 | BitBlock temp20 = simd_andc(bit[4], bit[5]); |
---|
416 | BitBlock temp21 = simd_and(bit[6], bit[7]); |
---|
417 | BitBlock temp22 = simd_and(temp20, temp21); |
---|
418 | BitBlock RBracket = simd_and(temp19, temp22); |
---|
419 | LexItem[Hyphen] = simd_andc(temp14, temp11); |
---|
420 | BitBlock temp23 = simd_and(temp4, temp21); |
---|
421 | BitBlock QMark = simd_and(temp14, temp23); |
---|
422 | BitBlock temp24 = simd_and(temp1, temp18); |
---|
423 | BitBlock temp25 = simd_and(temp4, bit[7]); |
---|
424 | BitBlock temp26 = simd_and(temp24, temp25); |
---|
425 | BitBlock temp27 = simd_or(temp26, temp7); |
---|
426 | LexItem[Quote] = simd_or(temp27, temp12); |
---|
427 | BitBlock temp28 = simd_andc(temp3, temp11); |
---|
428 | BitBlock temp29 = simd_and(temp20, temp15); |
---|
429 | BitBlock temp30 = simd_and(temp9, temp29); |
---|
430 | BitBlock temp31 = simd_or(temp28, temp30); |
---|
431 | BitBlock temp32 = simd_and(temp24, temp23); |
---|
432 | BitBlock temp33 = simd_or(temp31, temp32); |
---|
433 | BitBlock temp34 = simd_and(temp24, temp22); |
---|
434 | BitBlock temp35 = simd_or(temp33, temp34); |
---|
435 | BitBlock temp36 = simd_and(temp9, temp22); |
---|
436 | BitBlock temp37 = simd_or(temp35, temp36); |
---|
437 | BitBlock temp38 = simd_and(temp14, temp6); |
---|
438 | BitBlock temp39 = simd_or(temp37, temp38); |
---|
439 | BitBlock temp40 = simd_or(temp39, temp12); |
---|
440 | BitBlock temp41 = simd_andc(bit[7], bit[6]); |
---|
441 | BitBlock temp42 = simd_and(temp4, temp41); |
---|
442 | BitBlock temp43 = simd_and(temp24, temp42); |
---|
443 | BitBlock temp44 = simd_or(temp40, temp43); |
---|
444 | BitBlock temp45 = simd_and(temp3, temp42); |
---|
445 | BitBlock temp46 = simd_or(temp44, temp45); |
---|
446 | BitBlock temp47 = simd_and(temp9, temp42); |
---|
447 | BitBlock temp48 = simd_or(temp46, temp47); |
---|
448 | BitBlock temp49 = simd_and(temp9, temp6); |
---|
449 | BitBlock temp50 = simd_or(temp48, temp49); |
---|
450 | BitBlock temp51 = simd_and(temp3, temp16); |
---|
451 | BitBlock temp52 = simd_or(temp50, temp51); |
---|
452 | BitBlock temp53 = simd_and(temp14, temp22); |
---|
453 | BitBlock temp54 = simd_or(temp52, temp53); |
---|
454 | BitBlock temp55 = simd_andc(temp41, temp10); |
---|
455 | BitBlock temp56 = simd_and(temp14, temp55); |
---|
456 | BitBlock temp57 = simd_or(temp54, temp56); |
---|
457 | BitBlock temp58 = simd_and(temp9, temp16); |
---|
458 | BitBlock temp59 = simd_or(temp57, temp58); |
---|
459 | BitBlock temp60 = simd_or(temp59, temp7); |
---|
460 | BitBlock temp61 = simd_and(temp24, temp16); |
---|
461 | BitBlock temp62 = simd_or(temp60, temp61); |
---|
462 | BitBlock temp63 = simd_or(temp62, RAngle); |
---|
463 | BitBlock temp64 = simd_or(temp63, QMark); |
---|
464 | BitBlock temp65 = simd_and(temp19, temp29); |
---|
465 | BitBlock temp66 = simd_or(temp64, temp65); |
---|
466 | BitBlock temp67 = simd_and(bit[0], bit[1]); |
---|
467 | BitBlock temp68 = simd_and(temp67, temp13); |
---|
468 | BitBlock temp69 = simd_andc(temp68, temp11); |
---|
469 | BitBlock temp70 = simd_or(temp66, temp69); |
---|
470 | BitBlock temp71 = simd_or(temp70, RBracket); |
---|
471 | BitBlock temp72 = simd_andc(temp19, temp11); |
---|
472 | BitBlock temp73 = simd_or(temp71, temp72); |
---|
473 | BitBlock temp74 = simd_andc(temp67, temp2); |
---|
474 | BitBlock temp75 = simd_andc(temp74, temp11); |
---|
475 | BitBlock temp76 = simd_or(temp73, temp75); |
---|
476 | BitBlock temp77 = simd_and(temp3, temp23); |
---|
477 | BitBlock temp78 = simd_or(temp76, temp77); |
---|
478 | BitBlock temp79 = simd_and(temp67, temp8); |
---|
479 | BitBlock temp80 = simd_andc(temp79, temp11); |
---|
480 | BitBlock temp81 = simd_or(temp78, temp80); |
---|
481 | BitBlock temp82 = simd_and(temp17, temp13); |
---|
482 | BitBlock temp83 = simd_and(temp82, temp55); |
---|
483 | LexItem[NameFollow] = simd_or(temp81, temp83); |
---|
484 | #ifdef DIGIT_AND_HEX_ITEMS |
---|
485 | BitBlock temp84 = simd_and(temp67, temp18); |
---|
486 | BitBlock temp85 = simd_andc(temp11, temp55); |
---|
487 | BitBlock temp86 = simd_andc(temp15, temp10); |
---|
488 | BitBlock temp87 = simd_andc(temp85, temp86); |
---|
489 | BitBlock temp88 = simd_andc(temp21, temp10); |
---|
490 | BitBlock temp89 = simd_andc(temp87, temp88); |
---|
491 | BitBlock temp90 = simd_andc(bit[5], bit[4]); |
---|
492 | BitBlock temp91 = simd_andc(temp90, temp5); |
---|
493 | BitBlock temp92 = simd_andc(temp89, temp91); |
---|
494 | BitBlock temp93 = simd_and(temp90, temp41); |
---|
495 | BitBlock temp94 = simd_andc(temp92, temp93); |
---|
496 | BitBlock temp95 = simd_and(temp90, temp15); |
---|
497 | BitBlock temp96 = simd_andc(temp94, temp95); |
---|
498 | BitBlock temp97 = simd_and(temp90, temp21); |
---|
499 | BitBlock temp98 = simd_andc(temp96, temp97); |
---|
500 | BitBlock temp99 = simd_andc(temp20, temp5); |
---|
501 | BitBlock temp100 = simd_andc(temp98, temp99); |
---|
502 | BitBlock temp101 = simd_and(temp20, temp41); |
---|
503 | BitBlock temp102 = simd_andc(temp100, temp101); |
---|
504 | BitBlock Digit = simd_andc(temp84, temp102); |
---|
505 | BitBlock temp103 = simd_and(temp74, temp55); |
---|
506 | BitBlock temp104 = simd_or(Digit, temp103); |
---|
507 | BitBlock temp105 = simd_and(temp74, temp86); |
---|
508 | BitBlock temp106 = simd_or(temp104, temp105); |
---|
509 | BitBlock temp107 = simd_and(temp74, temp88); |
---|
510 | BitBlock temp108 = simd_or(temp106, temp107); |
---|
511 | BitBlock temp109 = simd_and(temp74, temp91); |
---|
512 | BitBlock temp110 = simd_or(temp108, temp109); |
---|
513 | BitBlock temp111 = simd_and(temp74, temp93); |
---|
514 | BitBlock temp112 = simd_or(temp110, temp111); |
---|
515 | BitBlock temp113 = simd_and(temp74, temp95); |
---|
516 | BitBlock temp114 = simd_or(temp112, temp113); |
---|
517 | BitBlock temp115 = simd_andc(temp17, temp2); |
---|
518 | BitBlock temp116 = simd_and(temp115, temp55); |
---|
519 | BitBlock temp117 = simd_or(temp114, temp116); |
---|
520 | BitBlock temp118 = simd_and(temp115, temp86); |
---|
521 | BitBlock temp119 = simd_or(temp117, temp118); |
---|
522 | BitBlock temp120 = simd_and(temp115, temp88); |
---|
523 | BitBlock temp121 = simd_or(temp119, temp120); |
---|
524 | BitBlock temp122 = simd_and(temp115, temp91); |
---|
525 | BitBlock temp123 = simd_or(temp121, temp122); |
---|
526 | BitBlock temp124 = simd_and(temp115, temp93); |
---|
527 | BitBlock temp125 = simd_or(temp123, temp124); |
---|
528 | BitBlock temp126 = simd_and(temp115, temp95); |
---|
529 | BitBlock Hex = simd_or(temp125, temp126); |
---|
530 | |
---|
531 | LexItem[NonDigit] = simd_not(Digit); |
---|
532 | LexItem[NonHex] = simd_not(Hex); |
---|
533 | |
---|
534 | #endif |
---|
535 | |
---|
536 | /* Mark potential occurrences of ']]>' These are all actual |
---|
537 | occurrences of ]]> as well as occurrences of ]] or ] at |
---|
538 | the block end. Shifting the RBracket and RAngle streams in |
---|
539 | negated forms ensures that a potential CD_End is not ruled |
---|
540 | out at the block boundary. */ |
---|
541 | LexItem[CD_End_check] = simd_andc(RBracket, |
---|
542 | simd_or(sisd_sbli(simd_not(RBracket), 1), |
---|
543 | sisd_sbli(simd_not(RAngle), 2))); |
---|
544 | #ifndef OMIT_CD_End_check_In_Markup_Scan |
---|
545 | LexItem[MarkupStart] = simd_or(LexItem[MarkupStart], LexItem[CD_End_check]); |
---|
546 | #endif |
---|
547 | } |
---|
548 | |
---|
549 | |
---|
550 | /* A temporary structure for internal use in ComputeLexicalItemStreams. */ |
---|
551 | typedef struct { |
---|
552 | BitBlock LexicalItems[LexicalItemCount]; |
---|
553 | } LexicalItemBlock; |
---|
554 | |
---|
555 | |
---|
556 | |
---|
557 | |
---|
558 | template <CodeUnit_Base C> |
---|
559 | void Lexer<C>::Do_MarkupStreams() { |
---|
560 | LexicalItemBlock lx_blk[BUFFER_BLOCKS]; |
---|
561 | for (int i = 0; i < buffer_blocks; i++) { |
---|
562 | ComputeLexicalItemBlocks<C>(x8basis[i].bit, lx_blk[i].LexicalItems); |
---|
563 | } |
---|
564 | /* NonWS stream already completed by WS_Control method. */ |
---|
565 | for (int j = MarkupStart; j < LexicalItemCount; j++) { |
---|
566 | for (int i = 0; i < buffer_blocks; i++) { |
---|
567 | parsing_engine_data->item_stream[j][i] = lx_blk[i].LexicalItems[j]; |
---|
568 | } |
---|
569 | } |
---|
570 | for (int i = 0; i < buffer_blocks; i++) { |
---|
571 | parsing_engine_data->item_stream[NameFollow][i] = |
---|
572 | simd_or(parsing_engine_data->item_stream[NameFollow][i], |
---|
573 | simd_not(parsing_engine_data->item_stream[NonWS][i])); |
---|
574 | } |
---|
575 | }; |
---|
576 | |
---|
577 | |
---|
578 | void UTF_8_Lexer::Do_CharsetValidation() { |
---|
579 | BitBlock u8prefix, u8suffix, u8prefix2, u8prefix3or4, u8prefix3, u8prefix4; |
---|
580 | BitBlock error_mask; |
---|
581 | /* UTF-8 sequences may cross block boundaries. If a |
---|
582 | prefix is found near the end of a block that requires |
---|
583 | one or more suffixes in the next block, then |
---|
584 | prefix_pending is set to mark the positions. |
---|
585 | However, at the beginning of the buffer, no suffixes |
---|
586 | are expected, so this value is initialized to zeroes. */ |
---|
587 | #ifdef TEMPLATED_SIMD_LIB |
---|
588 | BitBlock prefix_pending = simd<1>::constant<0>(); |
---|
589 | /* If a suffix is pending, then it may involve one of |
---|
590 | the special case prefixes E0, ED. F0, F4, or the |
---|
591 | EF prefix or EF_BF combination for FFFF/FFFE detection.*/ |
---|
592 | BitBlock E0ED_pending = simd<1>::constant<0>(); |
---|
593 | BitBlock F0F4_pending = simd<1>::constant<0>(); |
---|
594 | BitBlock bit5_pending = simd<1>::constant<0>(); |
---|
595 | BitBlock EF_pending = simd<1>::constant<0>(); |
---|
596 | BitBlock EF_BF_pending = simd<1>::constant<0>(); |
---|
597 | #endif |
---|
598 | #ifndef TEMPLATED_SIMD_LIB |
---|
599 | BitBlock prefix_pending = simd_const_1(0); |
---|
600 | /* If a suffix is pending, then it may involve one of |
---|
601 | the special case prefixes E0, ED. F0, F4, or the |
---|
602 | EF prefix or EF_BF combination for FFFF/FFFE detection.*/ |
---|
603 | BitBlock E0ED_pending = simd_const_1(0); |
---|
604 | BitBlock F0F4_pending = simd_const_1(0); |
---|
605 | BitBlock bit5_pending = simd_const_1(0); |
---|
606 | BitBlock EF_pending = simd_const_1(0); |
---|
607 | BitBlock EF_BF_pending = simd_const_1(0); |
---|
608 | #endif |
---|
609 | /* Temporary variables used within the block. */ |
---|
610 | BitBlock suffix_required_scope; |
---|
611 | BitBlock prefix_E0ED, E0ED_scope, bit5_scope, E0ED_constraint; |
---|
612 | BitBlock prefix_F5FF, prefix_F0F4, F0F4_scope, F0F4_constraint; |
---|
613 | BitBlock X111x, B111x, prefix_EF, BF, EF_BF, EF_scope, EF_BF_scope; |
---|
614 | |
---|
615 | for (int i = 0; i < buffer_blocks; i++) { |
---|
616 | #ifdef TEMPLATED_SIMD_LIB |
---|
617 | validation_stream[i] = simd<1>::constant<0>(); |
---|
618 | #endif |
---|
619 | #ifndef TEMPLATED_SIMD_LIB |
---|
620 | validation_stream[i] = simd_const_1(0); |
---|
621 | #endif |
---|
622 | /* If there is no pending suffix and no bit 0, then there |
---|
623 | are no possible validation issues for this block. */ |
---|
624 | if (!bitblock_has_bit(simd_or(prefix_pending, x8basis[i].bit[0]))) |
---|
625 | continue; |
---|
626 | /* Compute classifications of UTF-8 bytes. */ |
---|
627 | u8prefix = simd_and(x8basis[i].bit[0], x8basis[i].bit[1]); |
---|
628 | u8suffix = simd_andc(x8basis[i].bit[0], x8basis[i].bit[1]); |
---|
629 | u8prefix3or4 = simd_and(u8prefix, x8basis[i].bit[2]); |
---|
630 | u8prefix2 = simd_andc(u8prefix, x8basis[i].bit[2]); |
---|
631 | u8prefix3 = simd_andc(u8prefix3or4, x8basis[i].bit[3]); |
---|
632 | u8prefix4 = simd_and(u8prefix3or4, x8basis[i].bit[3]); |
---|
633 | |
---|
634 | /* Initiate validation for two-byte sequences. */ |
---|
635 | error_mask = simd_andc(u8prefix2, |
---|
636 | simd_or(simd_or(x8basis[i].bit[3], x8basis[i].bit[4]), |
---|
637 | simd_or(x8basis[i].bit[5], x8basis[i].bit[6]))); |
---|
638 | suffix_required_scope = simd_or(prefix_pending, sisd_sfli(u8prefix, 1)); |
---|
639 | |
---|
640 | prefix_pending = sisd_sbli(u8prefix, BLOCKSIZE - 1); |
---|
641 | E0ED_scope = E0ED_pending; |
---|
642 | F0F4_scope = F0F4_pending; |
---|
643 | bit5_scope = bit5_pending; |
---|
644 | EF_scope = EF_pending; |
---|
645 | EF_BF_scope = EF_BF_pending; |
---|
646 | |
---|
647 | /* Default values of pending variables for next iteration. */ |
---|
648 | #ifdef TEMPLATED_SIMD_LIB |
---|
649 | E0ED_pending = simd<1>::constant<0>(); |
---|
650 | F0F4_pending = simd<1>::constant<0>(); |
---|
651 | bit5_pending = simd<1>::constant<0>(); |
---|
652 | EF_pending = simd<1>::constant<0>(); |
---|
653 | EF_BF_pending = simd<1>::constant<0>(); |
---|
654 | #endif |
---|
655 | #ifndef TEMPLATED_SIMD_LIB |
---|
656 | E0ED_pending = simd_const_1(0); |
---|
657 | F0F4_pending = simd_const_1(0); |
---|
658 | bit5_pending = simd_const_1(0); |
---|
659 | EF_pending = simd_const_1(0); |
---|
660 | EF_BF_pending = simd_const_1(0); |
---|
661 | #endif |
---|
662 | |
---|
663 | X111x = simd_and(simd_and(x8basis[i].bit[4], x8basis[i].bit[5]), x8basis[i].bit[6]); |
---|
664 | B111x = simd_and(simd_and(u8suffix, simd_and(x8basis[i].bit[2], x8basis[i].bit[3])), |
---|
665 | X111x); |
---|
666 | BF = simd_and(B111x, x8basis[i].bit[7]); |
---|
667 | EF_BF = simd_and(EF_scope, BF); |
---|
668 | |
---|
669 | if (bitblock_has_bit(u8prefix3or4)) { |
---|
670 | /* Extend validation for errors in three-byte sequences. */ |
---|
671 | suffix_required_scope = simd_or(suffix_required_scope, |
---|
672 | sisd_sfli(u8prefix3or4, 2)); |
---|
673 | bit5_scope = simd_or(bit5_scope, sisd_sfli(x8basis[i].bit[5], 1)); |
---|
674 | prefix_E0ED = simd_andc(u8prefix3, |
---|
675 | simd_or(simd_or(x8basis[i].bit[6], |
---|
676 | simd_xor(x8basis[i].bit[4], x8basis[i].bit[7])), |
---|
677 | simd_xor(x8basis[i].bit[4], x8basis[i].bit[5]))); |
---|
678 | E0ED_scope = simd_or(E0ED_scope, sisd_sfli(prefix_E0ED, 1)); |
---|
679 | prefix_EF = simd_and(u8prefix3, simd_and(X111x, x8basis[i].bit[7])); |
---|
680 | EF_scope = simd_or(EF_scope, sisd_sfli(prefix_EF, 1)); |
---|
681 | EF_BF = simd_and(EF_scope, BF); |
---|
682 | |
---|
683 | /* Values for next iteration. */ |
---|
684 | prefix_pending = simd_or(prefix_pending, |
---|
685 | sisd_sbli(u8prefix3or4, BLOCKSIZE - 2)); |
---|
686 | bit5_pending = sisd_sbli(x8basis[i].bit[5], BLOCKSIZE - 1); |
---|
687 | E0ED_pending = sisd_sbli(prefix_E0ED, BLOCKSIZE - 1); |
---|
688 | EF_pending = sisd_sbli(prefix_EF, BLOCKSIZE - 1); |
---|
689 | EF_BF_pending = sisd_sbli(EF_BF, BLOCKSIZE - 2); |
---|
690 | if (bitblock_has_bit(u8prefix4)) { |
---|
691 | /* Extend validation for errors in four-byte sequences. */ |
---|
692 | suffix_required_scope = simd_or(suffix_required_scope, |
---|
693 | sisd_sfli(u8prefix4, 3)); |
---|
694 | prefix_pending = simd_or(prefix_pending, |
---|
695 | sisd_sbli(u8prefix4, BLOCKSIZE - 3)); |
---|
696 | prefix_F5FF = simd_and(u8prefix4, |
---|
697 | simd_or(x8basis[i].bit[4], |
---|
698 | simd_and(x8basis[i].bit[5], |
---|
699 | simd_or(x8basis[i].bit[6], x8basis[i].bit[7])))); |
---|
700 | error_mask = simd_or(error_mask, prefix_F5FF); |
---|
701 | prefix_F0F4 = simd_andc(u8prefix4, |
---|
702 | simd_or(x8basis[i].bit[4], |
---|
703 | simd_or(x8basis[i].bit[6], x8basis[i].bit[7]))); |
---|
704 | F0F4_scope = simd_or(F0F4_scope, sisd_sfli(prefix_F0F4, 1)); |
---|
705 | F0F4_pending = sisd_sbli(prefix_F0F4, BLOCKSIZE - 1); |
---|
706 | } |
---|
707 | } |
---|
708 | E0ED_constraint = simd_xor(bit5_scope, x8basis[i].bit[2]); |
---|
709 | error_mask = simd_or(error_mask, simd_andc(E0ED_scope, E0ED_constraint)); |
---|
710 | F0F4_constraint = simd_xor(bit5_scope, |
---|
711 | simd_or(x8basis[i].bit[2], x8basis[i].bit[3])); |
---|
712 | error_mask = simd_or(error_mask, simd_andc(F0F4_scope, F0F4_constraint)); |
---|
713 | /* Complete validation by checking for prefix-suffix mismatches. */ |
---|
714 | error_mask = simd_or(error_mask, simd_xor(suffix_required_scope, u8suffix)); |
---|
715 | |
---|
716 | EF_BF_scope = simd_or(EF_BF_scope, sisd_sfli(EF_BF, 1)); |
---|
717 | error_mask = simd_or(error_mask, simd_and(EF_BF_scope, B111x)); |
---|
718 | validation_stream[i] = error_mask; |
---|
719 | #ifdef DEBUG_UTF8_VALIDATION |
---|
720 | // if (bitblock_has_bit(error_mask)) { |
---|
721 | printf("-%i----------------------\n", i); |
---|
722 | print_bit_block("x8basis[i].bit[0]", x8basis[i].bit[0]); |
---|
723 | print_bit_block("x8basis[i].bit[1]", x8basis[i].bit[1]); |
---|
724 | print_bit_block("x8basis[i].bit[2]", x8basis[i].bit[2]); |
---|
725 | print_bit_block("x8basis[i].bit[3]", x8basis[i].bit[3]); |
---|
726 | print_bit_block("u8prefix2", u8prefix2); |
---|
727 | print_bit_block("u8prefix3", u8prefix3); |
---|
728 | print_bit_block("u8prefix4", u8prefix4); |
---|
729 | print_bit_block("suffix_required_scope", suffix_required_scope); |
---|
730 | print_bit_block("prefix_pending", prefix_pending); |
---|
731 | print_bit_block("E0ED_pending", E0ED_pending); |
---|
732 | print_bit_block("F0F4_pending", F0F4_pending); |
---|
733 | print_bit_block("bit5_pending", bit5_pending); |
---|
734 | print_bit_block("error_mask", error_mask); |
---|
735 | |
---|
736 | //} |
---|
737 | #endif |
---|
738 | } |
---|
739 | }; |
---|
740 | |
---|
741 | |
---|
742 | void ASCII_7_Lexer::Do_CharsetValidation() { |
---|
743 | for (int blk = 0; blk < buffer_blocks; blk++) { |
---|
744 | validation_stream[blk] = x8basis[blk].bit[0]; |
---|
745 | } |
---|
746 | }; |
---|
747 | |
---|
748 | |
---|
749 | void EASCII_8_Lexer::Do_CharsetValidation() { |
---|
750 | /* Nothing required for most charsets - but perhaps should have tables. */ |
---|
751 | for (int i = 0; i < buffer_blocks; i++) { |
---|
752 | #ifdef TEMPLATED_SIMD_LIB |
---|
753 | validation_stream[i] = simd<1>::constant<0>(); |
---|
754 | #endif |
---|
755 | #ifndef TEMPLATED_SIMD_LIB |
---|
756 | validation_stream[i] = simd_const_1(0); |
---|
757 | #endif |
---|
758 | } |
---|
759 | }; |
---|
760 | |
---|
761 | |
---|
762 | void UTF_16_Lexer::Do_CharsetValidation() { |
---|
763 | }; |
---|
764 | |
---|
765 | |
---|
766 | void UCS_2_Lexer::Do_CharsetValidation() { |
---|
767 | }; |
---|
768 | |
---|
769 | |
---|
770 | void UTF_32_Lexer::Do_CharsetValidation() { |
---|
771 | }; |
---|
772 | |
---|
773 | |
---|
774 | void EBCDIC_Lexer::Do_CharsetValidation() { |
---|
775 | /* Nothing required for most cases - but perhaps should have tables. */ |
---|
776 | for (int i = 0; i < buffer_blocks; i++) { |
---|
777 | #ifdef TEMPLATED_SIMD_LIB |
---|
778 | validation_stream[i] = simd<1>::constant<0>(); |
---|
779 | #endif |
---|
780 | #ifndef TEMPLATED_SIMD_LIB |
---|
781 | validation_stream[i] = simd_const_1(0); |
---|
782 | #endif |
---|
783 | } |
---|
784 | } |
---|
785 | |
---|
786 | |
---|
787 | |
---|
788 | /* Stub out XML 1.1 routines initially. */ |
---|
789 | |
---|
790 | void UTF_8_Lexer::Do_XML_11_WS_Control() { |
---|
791 | printf("UTF_8_Lexer::Do_XML_11_WS_Control not yet implemented; using XML 1.0 rules.\n"); |
---|
792 | Do_XML_10_WS_Control(); |
---|
793 | }; |
---|
794 | |
---|
795 | |
---|
796 | static inline void ASCII_7_WS_Control_Blocks_11(BitBlock bit[], BitBlock& WS, BitBlock& Control) { |
---|
797 | BitBlock temp1 = simd_or(bit[0], bit[1]); |
---|
798 | BitBlock temp2 = simd_or(temp1, bit[2]); |
---|
799 | BitBlock temp3 = simd_andc(bit[1], bit[0]); |
---|
800 | BitBlock temp4 = simd_and(bit[2], bit[3]); |
---|
801 | BitBlock temp5 = simd_and(temp3, temp4); |
---|
802 | BitBlock temp6 = simd_and(bit[4], bit[5]); |
---|
803 | BitBlock temp7 = simd_and(bit[6], bit[7]); |
---|
804 | BitBlock temp8 = simd_and(temp6, temp7); |
---|
805 | BitBlock temp9 = simd_and(temp5, temp8); |
---|
806 | BitBlock temp10 = simd_andc(temp2, temp9); |
---|
807 | BitBlock temp11 = simd_andc(temp10, bit[0]); |
---|
808 | #ifdef TEMPLATED_SIMD_LIB |
---|
809 | Control = simd_andc(simd<1>::constant<1>(), temp11); |
---|
810 | #endif |
---|
811 | #ifndef TEMPLATED_SIMD_LIB |
---|
812 | Control = simd_andc(simd_const_1(1), temp11); |
---|
813 | #endif |
---|
814 | BitBlock temp12 = simd_or(bit[2], bit[3]); |
---|
815 | BitBlock temp13 = simd_or(temp1, temp12); |
---|
816 | BitBlock temp14 = simd_andc(bit[7], bit[6]); |
---|
817 | BitBlock temp15 = simd_and(temp6, temp14); |
---|
818 | BitBlock CR = simd_andc(temp15, temp13); |
---|
819 | BitBlock temp16 = simd_andc(bit[4], bit[5]); |
---|
820 | BitBlock temp17 = simd_andc(bit[6], bit[7]); |
---|
821 | BitBlock temp18 = simd_and(temp16, temp17); |
---|
822 | BitBlock LF = simd_andc(temp18, temp13); |
---|
823 | BitBlock temp19 = simd_and(temp16, temp14); |
---|
824 | BitBlock HT = simd_andc(temp19, temp13); |
---|
825 | BitBlock temp20 = simd_andc(bit[2], bit[3]); |
---|
826 | BitBlock temp21 = simd_andc(temp20, temp1); |
---|
827 | BitBlock temp22 = simd_or(bit[4], bit[5]); |
---|
828 | BitBlock temp23 = simd_or(bit[6], bit[7]); |
---|
829 | BitBlock temp24 = simd_or(temp22, temp23); |
---|
830 | BitBlock SP = simd_andc(temp21, temp24); |
---|
831 | WS = simd_or(simd_or(CR, LF), simd_or(HT, SP)); |
---|
832 | } |
---|
833 | |
---|
834 | |
---|
835 | void ASCII_7_Lexer::Do_XML_11_WS_Control() { |
---|
836 | BitBlock WS, Control; |
---|
837 | for (int i = 0; i < buffer_blocks; i++) { |
---|
838 | ASCII_7_WS_Control_Blocks_11(x8basis[i].bit, WS, Control); |
---|
839 | parsing_engine_data->item_stream[NonWS][i] = simd_not(WS); |
---|
840 | validation_stream[i] = simd_andc(Control, WS); |
---|
841 | } |
---|
842 | }; |
---|
843 | |
---|
844 | static inline void EASCII_8_WS_Control_Blocks_11(BitBlock bit[], BitBlock& WS, BitBlock& Control) { |
---|
845 | BitBlock temp1 = simd_or(bit[0], bit[1]); |
---|
846 | BitBlock temp2 = simd_or(temp1, bit[2]); |
---|
847 | BitBlock temp3 = simd_andc(bit[1], bit[0]); |
---|
848 | BitBlock temp4 = simd_and(bit[2], bit[3]); |
---|
849 | BitBlock temp5 = simd_and(temp3, temp4); |
---|
850 | BitBlock temp6 = simd_and(bit[4], bit[5]); |
---|
851 | BitBlock temp7 = simd_and(bit[6], bit[7]); |
---|
852 | BitBlock temp8 = simd_and(temp6, temp7); |
---|
853 | BitBlock temp9 = simd_and(temp5, temp8); |
---|
854 | BitBlock temp10 = simd_andc(temp2, temp9); |
---|
855 | BitBlock temp11 = simd_andc(bit[0], bit[1]); |
---|
856 | BitBlock temp12 = simd_andc(temp11, bit[2]); |
---|
857 | BitBlock temp13 = simd_andc(temp10, temp12); |
---|
858 | #ifdef TEMPLATED_SIMD_LIB |
---|
859 | Control = simd_andc(simd<1>::constant<1>(), temp13); |
---|
860 | #endif |
---|
861 | #ifndef TEMPLATED_SIMD_LIB |
---|
862 | Control = simd_andc(simd_const_1(1), temp13); |
---|
863 | #endif |
---|
864 | BitBlock temp14 = simd_or(bit[2], bit[3]); |
---|
865 | BitBlock temp15 = simd_or(temp1, temp14); |
---|
866 | BitBlock temp16 = simd_andc(bit[7], bit[6]); |
---|
867 | BitBlock temp17 = simd_and(temp6, temp16); |
---|
868 | BitBlock CR = simd_andc(temp17, temp15); |
---|
869 | BitBlock temp18 = simd_andc(bit[4], bit[5]); |
---|
870 | BitBlock temp19 = simd_andc(bit[6], bit[7]); |
---|
871 | BitBlock temp20 = simd_and(temp18, temp19); |
---|
872 | BitBlock LF = simd_andc(temp20, temp15); |
---|
873 | BitBlock temp21 = simd_and(temp18, temp16); |
---|
874 | BitBlock HT = simd_andc(temp21, temp15); |
---|
875 | BitBlock temp22 = simd_andc(bit[2], bit[3]); |
---|
876 | BitBlock temp23 = simd_andc(temp22, temp1); |
---|
877 | BitBlock temp24 = simd_or(bit[4], bit[5]); |
---|
878 | BitBlock temp25 = simd_or(bit[6], bit[7]); |
---|
879 | BitBlock temp26 = simd_or(temp24, temp25); |
---|
880 | BitBlock SP = simd_andc(temp23, temp26); |
---|
881 | BitBlock temp27 = simd_andc(temp11, temp14); |
---|
882 | BitBlock temp28 = simd_andc(bit[5], bit[4]); |
---|
883 | BitBlock temp29 = simd_and(temp28, temp16); |
---|
884 | BitBlock NEL = simd_and(temp27, temp29); |
---|
885 | WS = simd_or(simd_or(simd_or(CR, LF), simd_or(HT, SP)), NEL); |
---|
886 | } |
---|
887 | |
---|
888 | void EASCII_8_Lexer::Do_XML_11_WS_Control() { |
---|
889 | BitBlock WS, Control; |
---|
890 | for (int i = 0; i < buffer_blocks; i++) { |
---|
891 | EASCII_8_WS_Control_Blocks_11(x8basis[i].bit, WS, Control); |
---|
892 | parsing_engine_data->item_stream[NonWS][i] = simd_not(WS); |
---|
893 | validation_stream[i] = simd_andc(Control, WS); |
---|
894 | } |
---|
895 | }; |
---|
896 | |
---|
897 | |
---|
898 | void U16_Lexer::Do_XML_11_WS_Control() { |
---|
899 | printf("U16_Lexer::Do_XML_11_WS_Control not yet implemented; using XML 1.0 rules.\n"); |
---|
900 | Do_XML_10_WS_Control(); |
---|
901 | }; |
---|
902 | |
---|
903 | |
---|
904 | void UTF_32_Lexer::Do_XML_11_WS_Control() { |
---|
905 | printf("UTF_32_Lexer::Do_XML_11_WS_Control not yet implemented; using XML 1.0 rules.\n"); |
---|
906 | Do_XML_10_WS_Control(); |
---|
907 | }; |
---|
908 | |
---|
909 | static inline void EBCDIC_WS_Control_Blocks_11(BitBlock bit[], BitBlock& WS, BitBlock& Control) { |
---|
910 | BitBlock temp1 = simd_or(bit[0], bit[1]); |
---|
911 | BitBlock temp2 = simd_and(bit[0], bit[1]); |
---|
912 | BitBlock temp3 = simd_and(bit[2], bit[3]); |
---|
913 | BitBlock temp4 = simd_and(temp2, temp3); |
---|
914 | BitBlock temp5 = simd_and(bit[4], bit[5]); |
---|
915 | BitBlock temp6 = simd_and(bit[6], bit[7]); |
---|
916 | BitBlock temp7 = simd_and(temp5, temp6); |
---|
917 | BitBlock temp8 = simd_and(temp4, temp7); |
---|
918 | BitBlock temp9 = simd_andc(temp1, temp8); |
---|
919 | #ifdef TEMPLATED_SIMD_LIB |
---|
920 | Control = simd_andc(simd<1>::constant<1>(), temp9); |
---|
921 | #endif |
---|
922 | #ifndef TEMPLATED_SIMD_LIB |
---|
923 | Control = simd_andc(simd_const_1(1), temp9); |
---|
924 | #endif |
---|
925 | BitBlock temp10 = simd_or(bit[2], bit[3]); |
---|
926 | BitBlock temp11 = simd_or(temp1, temp10); |
---|
927 | BitBlock temp12 = simd_andc(bit[7], bit[6]); |
---|
928 | BitBlock temp13 = simd_and(temp5, temp12); |
---|
929 | BitBlock CR = simd_andc(temp13, temp11); |
---|
930 | BitBlock temp14 = simd_andc(bit[2], bit[3]); |
---|
931 | BitBlock temp15 = simd_andc(temp14, temp1); |
---|
932 | BitBlock temp16 = simd_andc(bit[5], bit[4]); |
---|
933 | BitBlock temp17 = simd_and(temp16, temp12); |
---|
934 | BitBlock LF = simd_and(temp15, temp17); |
---|
935 | BitBlock HT = simd_andc(temp17, temp11); |
---|
936 | BitBlock temp18 = simd_andc(bit[1], bit[0]); |
---|
937 | BitBlock temp19 = simd_andc(temp18, temp10); |
---|
938 | BitBlock temp20 = simd_or(bit[4], bit[5]); |
---|
939 | BitBlock temp21 = simd_or(bit[6], bit[7]); |
---|
940 | BitBlock temp22 = simd_or(temp20, temp21); |
---|
941 | BitBlock SP = simd_andc(temp19, temp22); |
---|
942 | BitBlock temp23 = simd_andc(bit[3], bit[2]); |
---|
943 | BitBlock temp24 = simd_andc(temp23, temp1); |
---|
944 | BitBlock NEL = simd_and(temp24, temp17); |
---|
945 | WS = simd_or(simd_or(simd_or(CR, LF), simd_or(HT, SP)), NEL); |
---|
946 | } |
---|
947 | |
---|
948 | void EBCDIC_Lexer::Do_XML_11_WS_Control() { |
---|
949 | BitBlock WS, Control; |
---|
950 | for (int i = 0; i < buffer_blocks; i++) { |
---|
951 | EBCDIC_WS_Control_Blocks_11(x8basis[i].bit, WS, Control); |
---|
952 | parsing_engine_data->item_stream[NonWS][i] = simd_not(WS); |
---|
953 | validation_stream[i] = simd_andc(Control, WS); |
---|
954 | } |
---|
955 | }; |
---|
956 | |
---|
957 | |
---|
958 | |
---|
959 | void Lexer_Interface::AnalyzeBuffer(BitBlockBasis * basis, int base_pos, int start_pos, int buffer_limit_pos) { |
---|
960 | #ifdef DEBUG |
---|
961 | printf("Entered AnalyzeBuffer, buffer_limit_pos = %i\n", buffer_limit_pos); |
---|
962 | #endif |
---|
963 | #if defined(PAPI) and defined(CODE_CLOCKING) and (CODE_CLOCKING == BITLEX_ALL) |
---|
964 | code_clocker->start_interval(); |
---|
965 | #endif |
---|
966 | x8basis = basis; |
---|
967 | lexer_base_pos = base_pos; /* for error reporting. */ |
---|
968 | int err_pos; |
---|
969 | buffer_blocks = (buffer_limit_pos + BLOCKSIZE - 1)/BLOCKSIZE; |
---|
970 | buffer_units = buffer_limit_pos; |
---|
971 | #if defined(PAPI) and defined(CODE_CLOCKING) and (CODE_CLOCKING == CHARSET_VALIDATION) |
---|
972 | code_clocker->start_interval(); |
---|
973 | #endif |
---|
974 | |
---|
975 | Do_CharsetValidation(); |
---|
976 | /* Ignore error bits before start_pos which only arise |
---|
977 | due to UTF8 pending scope streams at buffer boundaries.*/ |
---|
978 | err_pos = bitstream_scan(validation_stream, start_pos); |
---|
979 | /* Detect validation errors up to the end of file plus one more |
---|
980 | position in case there is an incomplete code unit at EOF. */ |
---|
981 | if ((err_pos <= buffer_units) && (err_pos < BUFFER_SIZE)) { |
---|
982 | // printf("start_pos =%i\n, err_pos = %i\n", start_pos, err_pos); |
---|
983 | // print_bit_block("validation_stream[0]", validation_stream[0]); |
---|
984 | |
---|
985 | // print_bit_block("validation_stream[err_pos/128]", validation_stream[err_pos/128]); |
---|
986 | |
---|
987 | CharSetValidationError((const char *) entity_Info->encoding, lexer_base_pos + err_pos); |
---|
988 | } |
---|
989 | #if defined(PAPI) and defined(CODE_CLOCKING) and (CODE_CLOCKING == CHARSET_VALIDATION) |
---|
990 | code_clocker->end_interval(buffer_units); |
---|
991 | #endif |
---|
992 | #if defined(PAPI) and defined(CODE_CLOCKING) and (CODE_CLOCKING == WS_CONTROL) |
---|
993 | code_clocker->start_interval(); |
---|
994 | #endif |
---|
995 | |
---|
996 | if (entity_Info->version == XML_1_1) Do_XML_11_WS_Control(); |
---|
997 | else Do_XML_10_WS_Control(); |
---|
998 | #ifdef DEBUG |
---|
999 | printf("Do_WS_Control() complete.\n"); |
---|
1000 | #endif |
---|
1001 | err_pos = bitstream_scan0(validation_stream); |
---|
1002 | if (err_pos < buffer_units) XMLCharacterError(lexer_base_pos + err_pos); |
---|
1003 | #if defined(PAPI) and defined(CODE_CLOCKING) and (CODE_CLOCKING == WS_CONTROL) |
---|
1004 | code_clocker->end_interval(buffer_units); |
---|
1005 | #endif |
---|
1006 | #if defined(PAPI) and defined(CODE_CLOCKING) and (CODE_CLOCKING == MARKUP_STREAMS) |
---|
1007 | code_clocker->start_interval(); |
---|
1008 | #endif |
---|
1009 | Do_MarkupStreams(); |
---|
1010 | #if defined(PAPI) and defined(CODE_CLOCKING) and (CODE_CLOCKING == MARKUP_STREAMS) |
---|
1011 | code_clocker->end_interval(buffer_units); |
---|
1012 | #endif |
---|
1013 | #ifdef DEBUG |
---|
1014 | printf("Do_MarkupStreams() complete.\n"); |
---|
1015 | #endif |
---|
1016 | |
---|
1017 | if (buffer_units < BUFFER_SIZE) { |
---|
1018 | #ifdef TEMPLATED_SIMD_LIB |
---|
1019 | BitBlock final_block_mask = |
---|
1020 | sisd_sfl(simd<1>::constant<1>(), sisd_from_int(buffer_units % BLOCKSIZE)); |
---|
1021 | #endif |
---|
1022 | #ifndef TEMPLATED_SIMD_LIB |
---|
1023 | BitBlock final_block_mask = |
---|
1024 | sisd_sfl(simd_const_1(1), sisd_from_int(buffer_units % BLOCKSIZE)); |
---|
1025 | #endif |
---|
1026 | int lastblk = buffer_units/BLOCKSIZE; |
---|
1027 | for (int j = minLexicalItem; j < LexicalItemCount; j++) { |
---|
1028 | parsing_engine_data->item_stream[j][lastblk] = |
---|
1029 | simd_or(parsing_engine_data->item_stream[j][lastblk], |
---|
1030 | final_block_mask); |
---|
1031 | } |
---|
1032 | } |
---|
1033 | #if defined(PAPI) and defined(CODE_CLOCKING) and (CODE_CLOCKING == BITLEX_ALL) |
---|
1034 | code_clocker->end_interval(buffer_units); |
---|
1035 | #endif |
---|
1036 | |
---|
1037 | } |
---|
1038 | |
---|