source: proto/parabix2/src/multiliteral.h @ 1496

Last change on this file since 1496 was 1228, checked in by vla24, 8 years ago

Integrated symbol table with xmlwf. There are various implementations for the symbol table, please read /proto/SymbolTable/README_SymbolTable for more information.

File size: 9.1 KB
RevLine 
[424]1/*  multiliteral.h - XML Multicharacter Recognizers.
2    Copyright (c) 2007, 2008, Robert D. Cameron. 
3    Licensed to the public under the Open Software License 3.0.
4    Licensed to International Characters, Inc., under the Academic
5    Free License 3.0.
6
7This file provides a library of routines for the efficient recognition
8of particular XML multicharacter sequences.  Sequences of length 2 are
9compared as 16 bit integers, sequences of length 3 or 4 are compared
10as 32 bit integers and other sequences of length up to 8 are compared as
1164 bit integers.  The integer value for each XML multicharacter sequence
12is determined as a compile-time constant for optimal efficiency.
13
14All functions are declared inline; there is no corresponding multiliteral.c
15file required.   */
16
17#ifndef MULTILITERAL_H
18#define MULTILITERAL_H
19
#include <assert.h>
#include <stdint.h>
#include <string.h>
#include "xmldecl.h"
#include "charsets/ASCII_EBCDIC.h"
24
/*  Select the shift amounts used to place the first and second byte of a
    two-byte sequence inside a 16-bit integer so that the integer matches
    what a native load of those two bytes from memory produces.

    A build may predefine BYTE_SHIFT together with LOW_BYTE_SHIFT and
    HIGH_BYTE_SHIFT to override the detection below.

    The byte order is taken from BYTE_ORDER/BIG_ENDIAN/LITTLE_ENDIAN when
    they are defined, otherwise from the compiler-provided __BYTE_ORDER__
    family.  Each macro is tested with defined() first: an undefined name
    evaluates to 0 inside #if, so a bare "BYTE_ORDER == BIG_ENDIAN" would be
    true (0 == 0) and silently pick the big-endian layout on any host. */
#if !defined(BYTE_SHIFT)
#if defined(BYTE_ORDER) && defined(BIG_ENDIAN) && (BYTE_ORDER == BIG_ENDIAN)
#define BYTE_SHIFT
#define LOW_BYTE_SHIFT 8
#define HIGH_BYTE_SHIFT 0
#elif defined(BYTE_ORDER) && defined(LITTLE_ENDIAN) && (BYTE_ORDER == LITTLE_ENDIAN)
#define BYTE_SHIFT
#define LOW_BYTE_SHIFT 0
#define HIGH_BYTE_SHIFT 8
#elif defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
#define BYTE_SHIFT
#define LOW_BYTE_SHIFT 8
#define HIGH_BYTE_SHIFT 0
#elif defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
#define BYTE_SHIFT
#define LOW_BYTE_SHIFT 0
#define HIGH_BYTE_SHIFT 8
#else
#error "multiliteral.h: cannot determine byte order; define BYTE_ORDER or predefine BYTE_SHIFT, LOW_BYTE_SHIFT and HIGH_BYTE_SHIFT"
#endif
#endif
36
[1228]37
[424]38/*
39Helper metafunctions.  Given 2, 4 or 8 characters comprising a sequence,
40the c2int16, c4int32, and c8int64 functions determine the corresponding
4116, 32 or 64 bit integer value.   These are template metafunctions that
42must be instantiated with constant arguments to be applied at compile time.
43The functions may be instantiated for ASCII or EBCDIC based byte
44sequences.
45For example, c2int16<ASCII, '<', '/'>::value produces the compile
46time constant for the 16-bit value of an ASCII-based byte sequence
47of the XML end tag opening delimiter.
48*/
49
50template <unsigned char byte1, unsigned char byte2>
51struct b2int16 {
52  static uint16_t const value =
53    (((uint16_t) byte1) << LOW_BYTE_SHIFT) +
54    (((uint16_t) byte2) << HIGH_BYTE_SHIFT);
55};
56
57template <CodeUnit_Base C, unsigned char c1, unsigned char c2>
58struct c2int16 {
59  static uint16_t const value = b2int16<Ord<C,c1>::value, Ord<C,c2>::value>::value;
60};
61
62template <CodeUnit_Base C, unsigned char c1, unsigned char c2,
63                      unsigned char c3, unsigned char c4>
64struct c4int32 {
65  static uint32_t const value =
66    (((uint32_t) c2int16<C,c1,c2>::value) << (2 * LOW_BYTE_SHIFT)) + 
67    (((uint32_t) c2int16<C,c3,c4>::value) << (2 * HIGH_BYTE_SHIFT));
68};
69
70template <CodeUnit_Base C, unsigned char c1, unsigned char c2,
71                      unsigned char c3, unsigned char c4,
72                      unsigned char c5, unsigned char c6,
73                      unsigned char c7, unsigned char c8>
74struct c8int64 {
75  static uint64_t const value =
76    (((uint64_t) c4int32<C, c1, c2, c3, c4>::value) << (4 * LOW_BYTE_SHIFT)) + 
77    (((uint64_t) c4int32<C, c5, c6, c7, c8>::value) << (4 * HIGH_BYTE_SHIFT));
78};
79
80
81/*  Specialized helpers for 3, 5, 6, and 7 character combinations. */
82
83template <CodeUnit_Base C, unsigned char c1, unsigned char c2,
84                      unsigned char c3>
85struct c3int32 {
86  static uint32_t const value = c4int32<C, c1, c2, c3, 0>::value;
87};
88
89template <CodeUnit_Base C, unsigned char c1, unsigned char c2,
90                      unsigned char c3, unsigned char c4,
91                      unsigned char c5>
92struct c5int64 {
93  static uint64_t const value = c8int64<C, c1, c2, c3, c4, c5, 0, 0, 0>::value;
94};
95
96template <CodeUnit_Base C, unsigned char c1, unsigned char c2,
97                      unsigned char c3, unsigned char c4,
98                      unsigned char c5, unsigned char c6>
99struct c6int64 {
100  static uint64_t const value = c8int64<C, c1, c2, c3, c4, c5, c6, 0, 0>::value;
101};
102
103template <CodeUnit_Base C, unsigned char c1, unsigned char c2,
104                      unsigned char c3, unsigned char c4,
105                      unsigned char c5, unsigned char c6,
106                      unsigned char c7>
107struct c7int64 {
108  static uint64_t const value = c8int64<C, c1, c2, c3, c4, c5, c6, c7, 0>::value;
109};
110
111
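/*
A second set of helper functions determines 16, 32, or 64 bit integer
values from character arrays.
Precondition:  the character array must be readable for the full width
of the underlying load: 2 bytes for s2int16, 4 bytes for s3int32 and
s4int32, and 8 bytes for s5int64, s6int64, s7int64 and s8int64.  The
3, 5, 6 and 7 character variants load the wider integer and mask off
the bytes beyond the sequence. */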
/*  Return the first two bytes of s as a native-endian 16-bit integer.
    s must point to at least 2 readable bytes; no alignment is required.
    The bytes are copied with memcpy rather than read through a cast
    uint16_t pointer, which would be undefined behaviour (aliasing rules,
    and a misaligned load on strict-alignment targets). */
static inline uint16_t s2int16(unsigned char s[]) {
  uint16_t v;
  memcpy(&v, s, sizeof v);
  return v;
}
120
/*  Return the first four bytes of s as a native-endian 32-bit integer.
    s must point to at least 4 readable bytes; no alignment is required.
    The bytes are copied with memcpy rather than read through a cast
    uint32_t pointer, which would be undefined behaviour (aliasing rules,
    and a misaligned load on strict-alignment targets). */
static inline uint32_t s4int32(unsigned char s[]) {
  uint32_t v;
  memcpy(&v, s, sizeof v);
  return v;
}
124
/*  Return the first eight bytes of s as a native-endian 64-bit integer.
    s must point to at least 8 readable bytes; no alignment is required.
    The bytes are copied with memcpy rather than read through a cast
    uint64_t pointer, which would be undefined behaviour (aliasing rules,
    and a misaligned load on strict-alignment targets). */
static inline uint64_t s8int64(unsigned char s[]) {
  uint64_t v;
  memcpy(&v, s, sizeof v);
  return v;
}
128
129static inline uint32_t s3int32(unsigned char s[]) {
130  return s4int32(s) & (0xFFFFFF << LOW_BYTE_SHIFT);
131}
132
133static inline uint64_t s5int64(unsigned char s[]) {
134  return s8int64(s) & (0xFFFFFFFFFFULL << (3 * LOW_BYTE_SHIFT));
135}
136
137static inline uint64_t s6int64(unsigned char s[]) {
138  return s8int64(s) & (0xFFFFFFFFFFFFULL << (2 * LOW_BYTE_SHIFT));
139}
140
141static inline uint64_t s7int64(unsigned char s[]) {
142  return s8int64(s) & (0xFFFFFFFFFFFFFFULL << LOW_BYTE_SHIFT);
143}
144
145template <CodeUnit_Base C, unsigned char c1, unsigned char c2>
146static inline bool caseless_comp(unsigned char s[]) {
147  const uint16_t lc = c2int16<C, UC2lc<c1>::value, UC2lc<c2>::value>::value;
148  const uint16_t UC = c2int16<C, lc2UC<c1>::value, lc2UC<c2>::value>::value;
149  const uint16_t case_mask = lc ^ UC;
150  const uint16_t canon = lc & ~case_mask;
151  return (s2int16(s) & ~case_mask) == canon;
152}
153
154template <CodeUnit_Base C, unsigned char c1, unsigned char c2, unsigned char c3>
155static inline bool caseless_comp(unsigned char s[]) {
156  const uint32_t lc = c3int32<C, UC2lc<c1>::value, UC2lc<c2>::value, UC2lc<c3>::value>::value;
157  const uint32_t UC = c3int32<C, lc2UC<c1>::value, lc2UC<c2>::value, lc2UC<c3>::value>::value;
158  const uint32_t case_mask = lc ^ UC;
159  const uint32_t canon = lc & ~case_mask;
160  return (s3int32(s) & ~case_mask) == canon;
161}
162
163template <CodeUnit_Base C, unsigned char c1, unsigned char c2, 
164                           unsigned char c3, unsigned char c4>
165static inline bool caseless_comp(unsigned char s[]) {
166  const uint32_t lc = c4int32<C, UC2lc<c1>::value, UC2lc<c2>::value,
167                                 UC2lc<c3>::value, UC2lc<c4>::value>::value;
168  const uint32_t UC = c4int32<C, lc2UC<c1>::value, lc2UC<c2>::value, 
169                                 lc2UC<c3>::value, lc2UC<c4>::value>::value; 
170  const uint32_t case_mask = lc ^ UC;
171  const uint32_t canon = lc & ~case_mask;
172  return (s4int32(s) & ~case_mask) == canon;
173}
174
175template <CodeUnit_Base C, unsigned char c1, unsigned char c2, 
176                           unsigned char c3, unsigned char c4,
177                           unsigned char c5>
178static inline bool caseless_comp(unsigned char s[]) {
179  const uint64_t lc = c5int64<C, UC2lc<c1>::value, UC2lc<c2>::value,
180                                 UC2lc<c3>::value, UC2lc<c4>::value,
181                                 UC2lc<c5>::value>::value;
182  const uint64_t UC = c5int64<C, lc2UC<c1>::value, lc2UC<c2>::value, 
183                                 lc2UC<c3>::value, lc2UC<c4>::value, 
184                                 lc2UC<c5>::value>::value; 
185  const uint64_t case_mask = lc ^ UC;
186  const uint64_t canon = lc & ~case_mask;
187  return (s5int64(s) & ~case_mask) == canon;
188}
189
190template <CodeUnit_Base C, unsigned char c1, unsigned char c2, 
191                           unsigned char c3, unsigned char c4,
192                           unsigned char c5, unsigned char c6>
193static inline bool caseless_comp(unsigned char s[]) {
194  const uint64_t lc = c6int64<C, UC2lc<c1>::value, UC2lc<c2>::value,
195                                 UC2lc<c3>::value, UC2lc<c4>::value,
196                                 UC2lc<c5>::value, UC2lc<c6>::value>::value;
197  const uint64_t UC = c6int64<C, lc2UC<c1>::value, lc2UC<c2>::value, 
198                                 lc2UC<c3>::value, lc2UC<c4>::value, 
199                                 lc2UC<c5>::value, lc2UC<c6>::value>::value; 
200  const uint64_t case_mask = lc ^ UC;
201  const uint64_t canon = lc & ~case_mask;
202  return (s6int64(s) & ~case_mask) == canon;
203}
204
205template <CodeUnit_Base C, unsigned char c1, unsigned char c2, 
206                           unsigned char c3, unsigned char c4,
207                           unsigned char c5, unsigned char c6,
208                           unsigned char c7>
209static inline bool caseless_comp(unsigned char s[]) {
210  const uint64_t lc = c7int64<C, UC2lc<c1>::value, UC2lc<c2>::value,
211                                 UC2lc<c3>::value, UC2lc<c4>::value,
212                                 UC2lc<c5>::value, UC2lc<c6>::value,
213                                 UC2lc<c7>::value>::value;
214  const uint64_t UC = c7int64<C, lc2UC<c1>::value, lc2UC<c2>::value, 
215                                 lc2UC<c3>::value, lc2UC<c4>::value, 
216                                 lc2UC<c5>::value, lc2UC<c6>::value, 
217                                 lc2UC<c7>::value>::value; 
218  const uint64_t case_mask = lc ^ UC;
219  const uint64_t canon = lc & ~case_mask;
220  return (s7int64(s) & ~case_mask) == canon;
221}
222
223template <CodeUnit_Base C, unsigned char c1, unsigned char c2, 
224                           unsigned char c3, unsigned char c4,
225                           unsigned char c5, unsigned char c6,
226                           unsigned char c7, unsigned char c8>
227static inline bool caseless_comp(unsigned char s[]) {
228  const uint64_t lc = c8int64<C, UC2lc<c1>::value, UC2lc<c2>::value,
229                                 UC2lc<c3>::value, UC2lc<c4>::value,
230                                 UC2lc<c5>::value, UC2lc<c6>::value,
231                                 UC2lc<c7>::value, UC2lc<c8>::value>::value;
232  const uint64_t UC = c8int64<C, lc2UC<c1>::value, lc2UC<c2>::value, 
233                                 lc2UC<c3>::value, lc2UC<c4>::value, 
234                                 lc2UC<c5>::value, lc2UC<c6>::value, 
235                                 lc2UC<c7>::value, lc2UC<c8>::value>::value; 
236  const uint64_t case_mask = lc ^ UC;
237  const uint64_t canon = lc & ~case_mask;
238  return (s8int64(s) & ~case_mask) == canon;
239}
240
241
242
243#endif
Note: See TracBrowser for help on using the repository browser.