source: trunk/src/multiliteral.h @ 88

Last change on this file since 88 was 88, checked in by cameron, 11 years ago

stdint.h for MSVC

File size: 9.1 KB
Line 
/*  multiliteral.h - XML Multicharacter Recognizers.
    Copyright (c) 2007, 2008, Robert D. Cameron.
    Licensed to the public under the Open Software License 3.0.
    Licensed to International Characters, Inc., under the Academic
    Free License 3.0.

This file provides a library of routines for the efficient recognition
of particular XML multicharacter sequences.  Sequences of length 2 are
compared as 16-bit integers, sequences of length 3 or 4 are compared
as 32-bit integers, and other sequences of length up to 8 are compared as
64-bit integers.  The integer value for each XML multicharacter sequence
is determined as a compile-time constant for optimal efficiency.

All functions are declared inline; there is no corresponding multiliteral.c
file required.   */
16
17#ifndef MULTILITERAL_H
18#define MULTILITERAL_H
19
#include <assert.h>
#include <string.h>
#ifndef _MSC_VER
#include <stdint.h>
#endif
#ifdef _MSC_VER
#include "../lib/stdint.h"
#endif
#include "xmlmodel.h"
#include "charsets/ASCII_EBCDIC.h"
29
/*
Byte-order dependent shift amounts.

LOW_BYTE_SHIFT is the shift applied to the byte at the lower memory
address of a 16-bit unit, HIGH_BYTE_SHIFT the shift applied to the byte
at the higher address, so that a constant assembled from individual
bytes equals the value obtained by loading the same bytes from memory.

The BSD-style BYTE_ORDER / BIG_ENDIAN / LITTLE_ENDIAN macros are used
when they are all available.  They are tested with defined() first,
because undefined identifiers evaluate to 0 inside #if and would make
"BYTE_ORDER == BIG_ENDIAN" and "BYTE_ORDER == LITTLE_ENDIAN" both true.
Otherwise the compiler-provided __BYTE_ORDER__ macros are consulted, and
MSVC targets are treated as little-endian.  Any other configuration is
rejected at compile time instead of leaving the constants undefined.
*/
#if defined(BYTE_ORDER) && defined(BIG_ENDIAN) && (BYTE_ORDER == BIG_ENDIAN)
const int LOW_BYTE_SHIFT = 8;
const int HIGH_BYTE_SHIFT = 0;
#elif defined(BYTE_ORDER) && defined(LITTLE_ENDIAN) && (BYTE_ORDER == LITTLE_ENDIAN)
const int LOW_BYTE_SHIFT = 0;
const int HIGH_BYTE_SHIFT = 8;
#elif defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && \
      (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
const int LOW_BYTE_SHIFT = 8;
const int HIGH_BYTE_SHIFT = 0;
#elif (defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && \
       (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) || defined(_MSC_VER)
const int LOW_BYTE_SHIFT = 0;
const int HIGH_BYTE_SHIFT = 8;
#else
#error "multiliteral.h: cannot determine the target byte order; define BYTE_ORDER as BIG_ENDIAN or LITTLE_ENDIAN"
#endif
38
/*
Helper metafunctions.  Given 2, 4 or 8 characters comprising a sequence,
the c2int16, c4int32, and c8int64 metafunctions determine the corresponding
16-, 32- or 64-bit integer value.  These are template metafunctions that
must be instantiated with constant arguments to be applied at compile time.
The metafunctions may be instantiated for ASCII- or EBCDIC-based byte
sequences.
For example, c2int16<ASCII, '<', '/'>::value produces the compile-time
constant for the 16-bit value of an ASCII-based byte sequence
of the XML end tag opening delimiter.
*/
50
51template <unsigned char byte1, unsigned char byte2>
52struct b2int16 {
53  static uint16_t const value =
54    (((uint16_t) byte1) << LOW_BYTE_SHIFT) +
55    (((uint16_t) byte2) << HIGH_BYTE_SHIFT);
56};
57
58template <CodeUnit_Base C, unsigned char c1, unsigned char c2>
59struct c2int16 {
60  static uint16_t const value = b2int16<Ord<C,c1>::value, Ord<C,c2>::value>::value;
61};
62
63template <CodeUnit_Base C, unsigned char c1, unsigned char c2,
64                      unsigned char c3, unsigned char c4>
65struct c4int32 {
66  static uint32_t const value =
67    (((uint32_t) c2int16<C,c1,c2>::value) << (2 * LOW_BYTE_SHIFT)) + 
68    (((uint32_t) c2int16<C,c3,c4>::value) << (2 * HIGH_BYTE_SHIFT));
69};
70
71template <CodeUnit_Base C, unsigned char c1, unsigned char c2,
72                      unsigned char c3, unsigned char c4,
73                      unsigned char c5, unsigned char c6,
74                      unsigned char c7, unsigned char c8>
75struct c8int64 {
76  static uint64_t const value =
77    (((uint64_t) c4int32<C, c1, c2, c3, c4>::value) << (4 * LOW_BYTE_SHIFT)) + 
78    (((uint64_t) c4int32<C, c5, c6, c7, c8>::value) << (4 * HIGH_BYTE_SHIFT));
79};
80
81
/*  Specialized helpers for 3-, 5-, 6- and 7-character sequences; the
    unused trailing character positions are filled with zero. */
83
84template <CodeUnit_Base C, unsigned char c1, unsigned char c2,
85                      unsigned char c3>
86struct c3int32 {
87  static uint32_t const value = c4int32<C, c1, c2, c3, 0>::value;
88};
89
90template <CodeUnit_Base C, unsigned char c1, unsigned char c2,
91                      unsigned char c3, unsigned char c4,
92                      unsigned char c5>
93struct c5int64 {
94  static uint64_t const value = c8int64<C, c1, c2, c3, c4, c5, 0, 0, 0>::value;
95};
96
97template <CodeUnit_Base C, unsigned char c1, unsigned char c2,
98                      unsigned char c3, unsigned char c4,
99                      unsigned char c5, unsigned char c6>
100struct c6int64 {
101  static uint64_t const value = c8int64<C, c1, c2, c3, c4, c5, c6, 0, 0>::value;
102};
103
104template <CodeUnit_Base C, unsigned char c1, unsigned char c2,
105                      unsigned char c3, unsigned char c4,
106                      unsigned char c5, unsigned char c6,
107                      unsigned char c7>
108struct c7int64 {
109  static uint64_t const value = c8int64<C, c1, c2, c3, c4, c5, c6, c7, 0>::value;
110};
111
112
/*
A second set of helper functions determines 16-, 32- or 64-bit integer
values from character arrays.
Precondition:  the character array is allocated with at least the
number of required characters in each case. */
/* Returns the first 2 bytes of s as a 16-bit integer in the machine's
   native byte order.
   Precondition: s points to at least 2 readable bytes.
   The bytes are copied with memcpy rather than read through a cast
   uint16_t pointer: s need not be 2-byte aligned, and reading a character
   array through a uint16_t lvalue violates the aliasing rules. */
static inline uint16_t s2int16(unsigned char s[]) {
  uint16_t v;
  memcpy(&v, s, sizeof v);
  return v;
}
121
/* Returns the first 4 bytes of s as a 32-bit integer in the machine's
   native byte order.
   Precondition: s points to at least 4 readable bytes.
   The bytes are copied with memcpy rather than read through a cast
   uint32_t pointer: s need not be 4-byte aligned, and reading a character
   array through a uint32_t lvalue violates the aliasing rules. */
static inline uint32_t s4int32(unsigned char s[]) {
  uint32_t v;
  memcpy(&v, s, sizeof v);
  return v;
}
125
/* Returns the first 8 bytes of s as a 64-bit integer in the machine's
   native byte order.
   Precondition: s points to at least 8 readable bytes.
   The bytes are copied with memcpy rather than read through a cast
   uint64_t pointer: s need not be 8-byte aligned, and reading a character
   array through a uint64_t lvalue violates the aliasing rules. */
static inline uint64_t s8int64(unsigned char s[]) {
  uint64_t v;
  memcpy(&v, s, sizeof v);
  return v;
}
129
/* Returns the first 3 bytes of s as a 32-bit integer laid out in the
   machine's native byte order, with the position of the fourth byte set
   to zero.  This is the same value as loading 4 bytes and masking off the
   fourth, on either byte order, because the zero byte sits at the highest
   address of the word.
   Precondition: s points to at least 3 readable bytes; bytes beyond the
   third are not read.
   Copying exactly 3 bytes avoids reading past a 3-byte buffer and avoids
   an unaligned, type-punned 32-bit load. */
static inline uint32_t s3int32(unsigned char s[]) {
  uint32_t v = 0;
  memcpy(&v, s, 3);
  return v;
}
133
/* Returns the first 5 bytes of s as a 64-bit integer laid out in the
   machine's native byte order, with the positions of bytes six to eight
   set to zero.  This is the same value as loading 8 bytes and masking off
   the last three, on either byte order, because the zero bytes sit at the
   highest addresses of the word.
   Precondition: s points to at least 5 readable bytes; bytes beyond the
   fifth are not read.
   Copying exactly 5 bytes avoids reading past a 5-byte buffer and avoids
   an unaligned, type-punned 64-bit load. */
static inline uint64_t s5int64(unsigned char s[]) {
  uint64_t v = 0;
  memcpy(&v, s, 5);
  return v;
}
137
/* Returns the first 6 bytes of s as a 64-bit integer laid out in the
   machine's native byte order, with the positions of bytes seven and
   eight set to zero.  This is the same value as loading 8 bytes and
   masking off the last two, on either byte order, because the zero bytes
   sit at the highest addresses of the word.
   Precondition: s points to at least 6 readable bytes; bytes beyond the
   sixth are not read.
   Copying exactly 6 bytes avoids reading past a 6-byte buffer and avoids
   an unaligned, type-punned 64-bit load. */
static inline uint64_t s6int64(unsigned char s[]) {
  uint64_t v = 0;
  memcpy(&v, s, 6);
  return v;
}
141
/* Returns the first 7 bytes of s as a 64-bit integer laid out in the
   machine's native byte order, with the position of the eighth byte set
   to zero.  This is the same value as loading 8 bytes and masking off the
   last one, on either byte order, because the zero byte sits at the
   highest address of the word.
   Precondition: s points to at least 7 readable bytes; bytes beyond the
   seventh are not read.
   Copying exactly 7 bytes avoids reading past a 7-byte buffer and avoids
   an unaligned, type-punned 64-bit load. */
static inline uint64_t s7int64(unsigned char s[]) {
  uint64_t v = 0;
  memcpy(&v, s, 7);
  return v;
}
145
146template <CodeUnit_Base C, unsigned char c1, unsigned char c2>
147static inline bool caseless_comp(unsigned char s[]) {
148  const uint16_t lc = c2int16<C, UC2lc<c1>::value, UC2lc<c2>::value>::value;
149  const uint16_t UC = c2int16<C, lc2UC<c1>::value, lc2UC<c2>::value>::value;
150  const uint16_t case_mask = lc ^ UC;
151  const uint16_t canon = lc & ~case_mask;
152  return (s2int16(s) & ~case_mask) == canon;
153}
154
155template <CodeUnit_Base C, unsigned char c1, unsigned char c2, unsigned char c3>
156static inline bool caseless_comp(unsigned char s[]) {
157  const uint32_t lc = c3int32<C, UC2lc<c1>::value, UC2lc<c2>::value, UC2lc<c3>::value>::value;
158  const uint32_t UC = c3int32<C, lc2UC<c1>::value, lc2UC<c2>::value, lc2UC<c3>::value>::value;
159  const uint32_t case_mask = lc ^ UC;
160  const uint32_t canon = lc & ~case_mask;
161  return (s3int32(s) & ~case_mask) == canon;
162}
163
164template <CodeUnit_Base C, unsigned char c1, unsigned char c2, 
165                           unsigned char c3, unsigned char c4>
166static inline bool caseless_comp(unsigned char s[]) {
167  const uint32_t lc = c4int32<C, UC2lc<c1>::value, UC2lc<c2>::value,
168                                 UC2lc<c3>::value, UC2lc<c4>::value>::value;
169  const uint32_t UC = c4int32<C, lc2UC<c1>::value, lc2UC<c2>::value, 
170                                 lc2UC<c3>::value, lc2UC<c4>::value>::value; 
171  const uint32_t case_mask = lc ^ UC;
172  const uint32_t canon = lc & ~case_mask;
173  return (s4int32(s) & ~case_mask) == canon;
174}
175
176template <CodeUnit_Base C, unsigned char c1, unsigned char c2, 
177                           unsigned char c3, unsigned char c4,
178                           unsigned char c5>
179static inline bool caseless_comp(unsigned char s[]) {
180  const uint64_t lc = c5int64<C, UC2lc<c1>::value, UC2lc<c2>::value,
181                                 UC2lc<c3>::value, UC2lc<c4>::value,
182                                 UC2lc<c5>::value>::value;
183  const uint64_t UC = c5int64<C, lc2UC<c1>::value, lc2UC<c2>::value, 
184                                 lc2UC<c3>::value, lc2UC<c4>::value, 
185                                 lc2UC<c5>::value>::value; 
186  const uint64_t case_mask = lc ^ UC;
187  const uint64_t canon = lc & ~case_mask;
188  return (s5int64(s) & ~case_mask) == canon;
189}
190
191template <CodeUnit_Base C, unsigned char c1, unsigned char c2, 
192                           unsigned char c3, unsigned char c4,
193                           unsigned char c5, unsigned char c6>
194static inline bool caseless_comp(unsigned char s[]) {
195  const uint64_t lc = c6int64<C, UC2lc<c1>::value, UC2lc<c2>::value,
196                                 UC2lc<c3>::value, UC2lc<c4>::value,
197                                 UC2lc<c5>::value, UC2lc<c6>::value>::value;
198  const uint64_t UC = c6int64<C, lc2UC<c1>::value, lc2UC<c2>::value, 
199                                 lc2UC<c3>::value, lc2UC<c4>::value, 
200                                 lc2UC<c5>::value, lc2UC<c6>::value>::value; 
201  const uint64_t case_mask = lc ^ UC;
202  const uint64_t canon = lc & ~case_mask;
203  return (s6int64(s) & ~case_mask) == canon;
204}
205
206template <CodeUnit_Base C, unsigned char c1, unsigned char c2, 
207                           unsigned char c3, unsigned char c4,
208                           unsigned char c5, unsigned char c6,
209                           unsigned char c7>
210static inline bool caseless_comp(unsigned char s[]) {
211  const uint64_t lc = c7int64<C, UC2lc<c1>::value, UC2lc<c2>::value,
212                                 UC2lc<c3>::value, UC2lc<c4>::value,
213                                 UC2lc<c5>::value, UC2lc<c6>::value,
214                                 UC2lc<c7>::value>::value;
215  const uint64_t UC = c7int64<C, lc2UC<c1>::value, lc2UC<c2>::value, 
216                                 lc2UC<c3>::value, lc2UC<c4>::value, 
217                                 lc2UC<c5>::value, lc2UC<c6>::value, 
218                                 lc2UC<c7>::value>::value; 
219  const uint64_t case_mask = lc ^ UC;
220  const uint64_t canon = lc & ~case_mask;
221  return (s7int64(s) & ~case_mask) == canon;
222}
223
224template <CodeUnit_Base C, unsigned char c1, unsigned char c2, 
225                           unsigned char c3, unsigned char c4,
226                           unsigned char c5, unsigned char c6,
227                           unsigned char c7, unsigned char c8>
228static inline bool caseless_comp(unsigned char s[]) {
229  const uint64_t lc = c8int64<C, UC2lc<c1>::value, UC2lc<c2>::value,
230                                 UC2lc<c3>::value, UC2lc<c4>::value,
231                                 UC2lc<c5>::value, UC2lc<c6>::value,
232                                 UC2lc<c7>::value, UC2lc<c8>::value>::value;
233  const uint64_t UC = c8int64<C, lc2UC<c1>::value, lc2UC<c2>::value, 
234                                 lc2UC<c3>::value, lc2UC<c4>::value, 
235                                 lc2UC<c5>::value, lc2UC<c6>::value, 
236                                 lc2UC<c7>::value, lc2UC<c8>::value>::value; 
237  const uint64_t case_mask = lc ^ UC;
238  const uint64_t canon = lc & ~case_mask;
239  return (s8int64(s) & ~case_mask) == canon;
240}
241
242
243
244#endif
Note: See TracBrowser for help on using the repository browser.