source: trunk/src/charsets/charset_family.h @ 35

Last change on this file since 35 was 6, checked in by cameron, 12 years ago

Ext_ASCII_16LE/charset_family updates.

File size: 2.2 KB
Line 
1/*  charset_family.h
2    Copyright (c) 2007, Robert D. Cameron.
3    Licensed to the public under the Open Software License 3.0.
4    Licensed to International Characters, Inc., under the Academic
5    Free License 3.0.
6
7    The possible character sets that may be used with an XML
8    document are classified into families based on
9      (a) code unit size: 8, 16, or 32 bits,
10      (b) byte ordering: big-endian, little-endian or
11          32-bit unusual octet orders: 2143 and 3412,
12      (c) compatible single byte subset: ASCII or EBCDIC
13
14    The Ext_ASCII_8 family is very extensive, including UTF-8 and
15    the various ISO Latin character sets, among others.
16
17    Detection based on the first 4 bytes of an XML entity is
18    carried out in general accord with Appendix F of the XML spec,
19    although in the case of some erroneous documents, the
20    classifications differ.
21
22    The unusual octet order families are included only because of
23    their mention in Appendix F.
24
25*/
26#ifndef CHARSET_FAMILY_H
27#define CHARSET_FAMILY_H
28
29#include "../multiliteral.h"
30
/*  Character set families reported by Charset_Family_Detect.
    Enumerators keep their implicit values 0 through 7, in this order. */
enum Charset_Family {
    Ext_ASCII_8,        /* 8-bit code units, ASCII-compatible (UTF-8, ISO Latin, ...) */
    Ext_ASCII_16LE,     /* 16-bit code units, little-endian */
    Ext_ASCII_16BE,     /* 16-bit code units, big-endian */
    Ext_ASCII_32LE,     /* 32-bit code units, little-endian */
    Ext_ASCII_32BE,     /* 32-bit code units, big-endian */
    Ext_ASCII_32_2143,  /* 32-bit code units, unusual octet order 2143 */
    Ext_ASCII_32_3412,  /* 32-bit code units, unusual octet order 3412 */
    EBCDIC_family_8     /* 8-bit code units, EBCDIC-compatible */
};
35
/*  16-bit constants named after the byte pair they stand for.

    xHHLL is the value a uint16_t holds when the two consecutive bytes
    0xHH, 0xLL are loaded from memory as one native 16-bit unit: on a
    big-endian host that is 0xHHLL, on a little-endian host 0xLLHH.

    The host byte order is taken from BYTE_ORDER/BIG_ENDIAN/LITTLE_ENDIAN
    when those are defined, otherwise from the compiler-provided
    __BYTE_ORDER__.  Exactly one branch is selected; an undetermined byte
    order is a compile-time error rather than a silent mis-definition.
*/
#if (defined(BYTE_ORDER) && defined(BIG_ENDIAN) && (BYTE_ORDER == BIG_ENDIAN)) || \
    (!defined(BYTE_ORDER) && defined(__BYTE_ORDER__) && \
     defined(__ORDER_BIG_ENDIAN__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__))
#define x003C 0x003C
#define xFEFF 0xFEFF
#define x3C00 0x3C00
#define xFFFE 0xFFFE
#define x4C6F 0x4C6F
#define xA794 0xA794
#elif (defined(BYTE_ORDER) && defined(LITTLE_ENDIAN) && (BYTE_ORDER == LITTLE_ENDIAN)) || \
      (!defined(BYTE_ORDER) && defined(__BYTE_ORDER__) && \
       defined(__ORDER_LITTLE_ENDIAN__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__))
#define x003C 0x3C00
#define xFEFF 0xFFFE
#define x3C00 0x003C
#define xFFFE 0xFEFF
#define x4C6F 0x6F4C
#define xA794 0x94A7
#else
#error "charset_family.h: cannot determine host byte order (define BYTE_ORDER)"
#endif
52
53                                     
54Charset_Family Charset_Family_Detect(unsigned char* XML_entity) {
55        uint16_t * XML_dbl_byte = (uint16_t *) XML_entity;
56        switch (XML_dbl_byte[0]) {
57                case 0x0000:
58                        if ((XML_dbl_byte[1] == x3C00) || (XML_dbl_byte[1] = xFFFE))
59                                return Ext_ASCII_32_2143;
60                        else return Ext_ASCII_32BE;
61                case x003C: case xFEFF:
62                        if (XML_dbl_byte[1] == 0) return Ext_ASCII_32_3412;
63                        else return Ext_ASCII_16BE;
64                case x3C00: case xFFFE:
65                        if (XML_dbl_byte[1] == 0) return Ext_ASCII_32LE;
66                        else return Ext_ASCII_16LE;
67                case x4C6F:
68                        if (XML_dbl_byte[1] == xA794) return EBCDIC_family_8;
69                        else return Ext_ASCII_8;
70                default:
71                        return Ext_ASCII_8;
72        }
73}
74
75#endif
Note: See TracBrowser for help on using the repository browser.