source: branches/parabix-svgopen-2008/src/multiliteral.h @ 214

Last change on this file since 214 was 163, checked in by cameron, 11 years ago

Restructuring: move Document/External Entity info into xmldecl.h

File size: 9.1 KB
Line 
1/*  multiliteral.h - XML Multicharacter Recognizers.
2    Copyright (c) 2007, 2008, Robert D. Cameron. 
3    Licensed to the public under the Open Software License 3.0.
4    Licensed to International Characters, Inc., under the Academic
5    Free License 3.0.
6
7This file provides a library of routines for the efficient recognition
8of particular XML multicharacter sequences.  Sequences of length 2 are
9compared as 16 bit integers, sequences of length 3 or 4 are compared
10as 32 bit integers and other sequences of length up to 8 are compared as
1164 bit integers.  The integer value for each XML multicharacter sequence
12is determined as a compile-time constant for optimal efficiency.
13
14All functions are declared inline; there is no corresponding multiliteral.c
15file required.   */
16
17#ifndef MULTILITERAL_H
18#define MULTILITERAL_H
19
#include <assert.h>
#include <stdint.h>
#include <string.h>
#include "xmldecl.h"
#include "charsets/ASCII_EBCDIC.h"
24
/*
Byte-order dependent shift amounts.  LOW_BYTE_SHIFT is the shift applied
to the first byte of a two-byte sequence and HIGH_BYTE_SHIFT the shift
applied to the second byte, so that the combined 16-bit value equals the
value obtained by loading those two bytes from memory.

The byte order is taken from the BSD-style BYTE_ORDER macro when it is
available, and otherwise from the compiler-predefined __BYTE_ORDER__.
The tests are mutually exclusive and every macro is checked with
defined() first: in a plain #if an undefined macro evaluates to 0, so
two independent "BYTE_ORDER == ..." tests would both succeed and define
the constants twice.
*/
#if (defined(BYTE_ORDER) && defined(BIG_ENDIAN) && (BYTE_ORDER == BIG_ENDIAN)) || \
    (defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && \
     (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__))
const int LOW_BYTE_SHIFT = 8;
const int HIGH_BYTE_SHIFT = 0;
#elif (defined(BYTE_ORDER) && defined(LITTLE_ENDIAN) && (BYTE_ORDER == LITTLE_ENDIAN)) || \
      (defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && \
       (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__))
const int LOW_BYTE_SHIFT = 0;
const int HIGH_BYTE_SHIFT = 8;
#else
#error "multiliteral.h: cannot determine byte order (need BYTE_ORDER or __BYTE_ORDER__)"
#endif
33
34/*
35Helper metafunctions.  Given 2, 4 or 8 characters comprising a sequence,
36the c2int16, c4int32, and c8int64 functions determine the corresponding
3716, 32 or 64 bit integer value.   These are template metafunctions that
38must be instantiated with constant arguments to be applied at compile time.
39The functions may be instantiated for ASCII or EBCDIC based byte
40sequences.
41For example, c2int16<ASCII, '<', '/'>::value produces the compile
42time constant for the 16-bit value of an ASCII-based byte sequence
43of the XML end tag opening delimiter.
44*/
45
46template <unsigned char byte1, unsigned char byte2>
47struct b2int16 {
48  static uint16_t const value =
49    (((uint16_t) byte1) << LOW_BYTE_SHIFT) +
50    (((uint16_t) byte2) << HIGH_BYTE_SHIFT);
51};
52
53template <CodeUnit_Base C, unsigned char c1, unsigned char c2>
54struct c2int16 {
55  static uint16_t const value = b2int16<Ord<C,c1>::value, Ord<C,c2>::value>::value;
56};
57
58template <CodeUnit_Base C, unsigned char c1, unsigned char c2,
59                      unsigned char c3, unsigned char c4>
60struct c4int32 {
61  static uint32_t const value =
62    (((uint32_t) c2int16<C,c1,c2>::value) << (2 * LOW_BYTE_SHIFT)) + 
63    (((uint32_t) c2int16<C,c3,c4>::value) << (2 * HIGH_BYTE_SHIFT));
64};
65
66template <CodeUnit_Base C, unsigned char c1, unsigned char c2,
67                      unsigned char c3, unsigned char c4,
68                      unsigned char c5, unsigned char c6,
69                      unsigned char c7, unsigned char c8>
70struct c8int64 {
71  static uint64_t const value =
72    (((uint64_t) c4int32<C, c1, c2, c3, c4>::value) << (4 * LOW_BYTE_SHIFT)) + 
73    (((uint64_t) c4int32<C, c5, c6, c7, c8>::value) << (4 * HIGH_BYTE_SHIFT));
74};
75
76
77/*  Specialized helpers for 3, 5, 6, and 7 character combinations. */
78
79template <CodeUnit_Base C, unsigned char c1, unsigned char c2,
80                      unsigned char c3>
81struct c3int32 {
82  static uint32_t const value = c4int32<C, c1, c2, c3, 0>::value;
83};
84
85template <CodeUnit_Base C, unsigned char c1, unsigned char c2,
86                      unsigned char c3, unsigned char c4,
87                      unsigned char c5>
88struct c5int64 {
89  static uint64_t const value = c8int64<C, c1, c2, c3, c4, c5, 0, 0, 0>::value;
90};
91
92template <CodeUnit_Base C, unsigned char c1, unsigned char c2,
93                      unsigned char c3, unsigned char c4,
94                      unsigned char c5, unsigned char c6>
95struct c6int64 {
96  static uint64_t const value = c8int64<C, c1, c2, c3, c4, c5, c6, 0, 0>::value;
97};
98
99template <CodeUnit_Base C, unsigned char c1, unsigned char c2,
100                      unsigned char c3, unsigned char c4,
101                      unsigned char c5, unsigned char c6,
102                      unsigned char c7>
103struct c7int64 {
104  static uint64_t const value = c8int64<C, c1, c2, c3, c4, c5, c6, c7, 0>::value;
105};
106
107
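/*
A second set of helper functions determines 16, 32, or 64 bit integer
values from character arrays.
Precondition:  the character array must have at least as many readable
bytes as the integer that is loaded: 2 bytes for s2int16, 4 bytes for
s4int32 and s3int32, and 8 bytes for s8int64, s5int64, s6int64 and
s7int64.  The 3, 5, 6 and 7 character variants load a full 4 or 8 byte
integer and then mask off the bytes beyond the sequence. */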
/*
s2int16 - return the first two bytes of s as a 16-bit integer in the
machine's native byte order.

s must point to at least 2 readable bytes; no particular alignment is
required.  The bytes are copied with memcpy instead of being read
through a cast pointer: dereferencing an unsigned char array as a
uint16_t is undefined behaviour (alignment and aliasing rules), while a
fixed-size memcpy is well defined and is typically compiled to a single
load.
*/
static inline uint16_t s2int16(unsigned char s[]) {
  uint16_t value;
  memcpy(&value, s, sizeof value);
  return value;
}
116
/*
s4int32 - return the first four bytes of s as a 32-bit integer in the
machine's native byte order.

s must point to at least 4 readable bytes; no particular alignment is
required.  The bytes are copied with memcpy instead of being read
through a cast pointer: dereferencing an unsigned char array as a
uint32_t is undefined behaviour (alignment and aliasing rules), while a
fixed-size memcpy is well defined and is typically compiled to a single
load.
*/
static inline uint32_t s4int32(unsigned char s[]) {
  uint32_t value;
  memcpy(&value, s, sizeof value);
  return value;
}
120
/*
s8int64 - return the first eight bytes of s as a 64-bit integer in the
machine's native byte order.

s must point to at least 8 readable bytes; no particular alignment is
required.  The bytes are copied with memcpy instead of being read
through a cast pointer: dereferencing an unsigned char array as a
uint64_t is undefined behaviour (alignment and aliasing rules), while a
fixed-size memcpy is well defined and is typically compiled to a single
load.
*/
static inline uint64_t s8int64(unsigned char s[]) {
  uint64_t value;
  memcpy(&value, s, sizeof value);
  return value;
}
124
125static inline uint32_t s3int32(unsigned char s[]) {
126  return s4int32(s) & (0xFFFFFF << LOW_BYTE_SHIFT);
127}
128
129static inline uint64_t s5int64(unsigned char s[]) {
130  return s8int64(s) & (0xFFFFFFFFFFULL << (3 * LOW_BYTE_SHIFT));
131}
132
133static inline uint64_t s6int64(unsigned char s[]) {
134  return s8int64(s) & (0xFFFFFFFFFFFFULL << (2 * LOW_BYTE_SHIFT));
135}
136
137static inline uint64_t s7int64(unsigned char s[]) {
138  return s8int64(s) & (0xFFFFFFFFFFFFFFULL << LOW_BYTE_SHIFT);
139}
140
141template <CodeUnit_Base C, unsigned char c1, unsigned char c2>
142static inline bool caseless_comp(unsigned char s[]) {
143  const uint16_t lc = c2int16<C, UC2lc<c1>::value, UC2lc<c2>::value>::value;
144  const uint16_t UC = c2int16<C, lc2UC<c1>::value, lc2UC<c2>::value>::value;
145  const uint16_t case_mask = lc ^ UC;
146  const uint16_t canon = lc & ~case_mask;
147  return (s2int16(s) & ~case_mask) == canon;
148}
149
150template <CodeUnit_Base C, unsigned char c1, unsigned char c2, unsigned char c3>
151static inline bool caseless_comp(unsigned char s[]) {
152  const uint32_t lc = c3int32<C, UC2lc<c1>::value, UC2lc<c2>::value, UC2lc<c3>::value>::value;
153  const uint32_t UC = c3int32<C, lc2UC<c1>::value, lc2UC<c2>::value, lc2UC<c3>::value>::value;
154  const uint32_t case_mask = lc ^ UC;
155  const uint32_t canon = lc & ~case_mask;
156  return (s3int32(s) & ~case_mask) == canon;
157}
158
159template <CodeUnit_Base C, unsigned char c1, unsigned char c2, 
160                           unsigned char c3, unsigned char c4>
161static inline bool caseless_comp(unsigned char s[]) {
162  const uint32_t lc = c4int32<C, UC2lc<c1>::value, UC2lc<c2>::value,
163                                 UC2lc<c3>::value, UC2lc<c4>::value>::value;
164  const uint32_t UC = c4int32<C, lc2UC<c1>::value, lc2UC<c2>::value, 
165                                 lc2UC<c3>::value, lc2UC<c4>::value>::value; 
166  const uint32_t case_mask = lc ^ UC;
167  const uint32_t canon = lc & ~case_mask;
168  return (s4int32(s) & ~case_mask) == canon;
169}
170
171template <CodeUnit_Base C, unsigned char c1, unsigned char c2, 
172                           unsigned char c3, unsigned char c4,
173                           unsigned char c5>
174static inline bool caseless_comp(unsigned char s[]) {
175  const uint64_t lc = c5int64<C, UC2lc<c1>::value, UC2lc<c2>::value,
176                                 UC2lc<c3>::value, UC2lc<c4>::value,
177                                 UC2lc<c5>::value>::value;
178  const uint64_t UC = c5int64<C, lc2UC<c1>::value, lc2UC<c2>::value, 
179                                 lc2UC<c3>::value, lc2UC<c4>::value, 
180                                 lc2UC<c5>::value>::value; 
181  const uint64_t case_mask = lc ^ UC;
182  const uint64_t canon = lc & ~case_mask;
183  return (s5int64(s) & ~case_mask) == canon;
184}
185
186template <CodeUnit_Base C, unsigned char c1, unsigned char c2, 
187                           unsigned char c3, unsigned char c4,
188                           unsigned char c5, unsigned char c6>
189static inline bool caseless_comp(unsigned char s[]) {
190  const uint64_t lc = c6int64<C, UC2lc<c1>::value, UC2lc<c2>::value,
191                                 UC2lc<c3>::value, UC2lc<c4>::value,
192                                 UC2lc<c5>::value, UC2lc<c6>::value>::value;
193  const uint64_t UC = c6int64<C, lc2UC<c1>::value, lc2UC<c2>::value, 
194                                 lc2UC<c3>::value, lc2UC<c4>::value, 
195                                 lc2UC<c5>::value, lc2UC<c6>::value>::value; 
196  const uint64_t case_mask = lc ^ UC;
197  const uint64_t canon = lc & ~case_mask;
198  return (s6int64(s) & ~case_mask) == canon;
199}
200
201template <CodeUnit_Base C, unsigned char c1, unsigned char c2, 
202                           unsigned char c3, unsigned char c4,
203                           unsigned char c5, unsigned char c6,
204                           unsigned char c7>
205static inline bool caseless_comp(unsigned char s[]) {
206  const uint64_t lc = c7int64<C, UC2lc<c1>::value, UC2lc<c2>::value,
207                                 UC2lc<c3>::value, UC2lc<c4>::value,
208                                 UC2lc<c5>::value, UC2lc<c6>::value,
209                                 UC2lc<c7>::value>::value;
210  const uint64_t UC = c7int64<C, lc2UC<c1>::value, lc2UC<c2>::value, 
211                                 lc2UC<c3>::value, lc2UC<c4>::value, 
212                                 lc2UC<c5>::value, lc2UC<c6>::value, 
213                                 lc2UC<c7>::value>::value; 
214  const uint64_t case_mask = lc ^ UC;
215  const uint64_t canon = lc & ~case_mask;
216  return (s7int64(s) & ~case_mask) == canon;
217}
218
219template <CodeUnit_Base C, unsigned char c1, unsigned char c2, 
220                           unsigned char c3, unsigned char c4,
221                           unsigned char c5, unsigned char c6,
222                           unsigned char c7, unsigned char c8>
223static inline bool caseless_comp(unsigned char s[]) {
224  const uint64_t lc = c8int64<C, UC2lc<c1>::value, UC2lc<c2>::value,
225                                 UC2lc<c3>::value, UC2lc<c4>::value,
226                                 UC2lc<c5>::value, UC2lc<c6>::value,
227                                 UC2lc<c7>::value, UC2lc<c8>::value>::value;
228  const uint64_t UC = c8int64<C, lc2UC<c1>::value, lc2UC<c2>::value, 
229                                 lc2UC<c3>::value, lc2UC<c4>::value, 
230                                 lc2UC<c5>::value, lc2UC<c6>::value, 
231                                 lc2UC<c7>::value, lc2UC<c8>::value>::value; 
232  const uint64_t case_mask = lc ^ UC;
233  const uint64_t canon = lc & ~case_mask;
234  return (s8int64(s) & ~case_mask) == canon;
235}
236
237
238
239#endif
Note: See TracBrowser for help on using the repository browser.