source: icGREP/icgrep-devel/icgrep/cc/alphabet.h @ 5814

Last change on this file since 5814 was 5800, checked in by cameron, 16 months ago

isa and dyn_cast for Alphabets

File size: 3.1 KB
Line 
1/*
2 *  Copyright (c) 2017 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#ifndef ALPHABET_H
8#define ALPHABET_H
9
#include <string>
#include <utility>
#include <vector>
#include <UCD/unicode_set.h>
13
14namespace cc {
15//
16// An Alphabet is the universe of characters used to form strings in
17// a given language, together with a mapping of those characters to
18// numerical character codes.
19//
20
// Common base of all alphabet classes.  It records the alphabet's name and a
// ClassTypeId tag naming the concrete subclass; the subclasses' classof()
// predicates compare against that tag.
class Alphabet {
public:
    // Tag identifying the concrete alphabet class of an object.
    enum class ClassTypeId : unsigned {UnicodeMappableAlphabet, CodeUnitAlphabet, MultiplexedAlphabet};

    // Returns the name that was supplied at construction.
    const std::string & getName() const { return mAlphabetName; }

    // Returns the class tag that was supplied at construction.
    ClassTypeId getClassTypeId() const { return mClassTypeId; }

protected:
    // Only subclasses can construct an Alphabet.
    //   name - the alphabet's name; taken by value and moved into the member
    //          so that the string is not copied a second time.
    //   k    - the tag of the concrete subclass being constructed.
    Alphabet(std::string name, ClassTypeId k)
    : mAlphabetName(std::move(name))
    , mClassTypeId(k) {}

private:
    const std::string mAlphabetName;
    const ClassTypeId mClassTypeId;
};
35
36class UnicodeMappableAlphabet : public Alphabet {
37public:
38    //  Alphabets may be formed by some subset of Unicode characters, together
39    //  with a mapping to and from Unicode.  The mapping is defined in terms of unicodeCommon:
40    //  the number of character codes (if any) such that all character codes in the range
41    //  0..unicodeCommon - 1 map to the same numeric value as the corresponding Unicode
42    //  codepoint, together with a vector defining the Unicode codepoints for consecutive
43    //  character codes (if any) above unicodeCommon - 1.
44   
45    UnicodeMappableAlphabet(std::string alphabetName,
46                            unsigned unicodeCommon,
47                            std::vector <UCD::codepoint_t> aboveCommon);
48   
49    static inline bool classof(const Alphabet * a) {
50        return a->getClassTypeId() == ClassTypeId::UnicodeMappableAlphabet;
51    }
52    static inline bool classof(const void *) {return false;}
53    //  The Unicode codepoint of the nth character (the character whose alphabet code is n).
54    UCD::codepoint_t toUnicode(const unsigned n) const;
55   
56    //  The ordinal position of the character whose Unicode codepoint value is ucp.
57    unsigned fromUnicode(const UCD::codepoint_t ucp) const;
58
59protected:
60    UCD::codepoint_t mCharSet;
61    UCD::codepoint_t mUnicodeCommon;
62    std::vector <UCD::codepoint_t> mAboveCommon;
63};
64
65class CodeUnitAlphabet : public Alphabet {
66public:
67    CodeUnitAlphabet(std::string alphabetName, uint8_t codeUnitBits);
68    static inline bool classof(const Alphabet * a) {
69        return a->getClassTypeId() == ClassTypeId::CodeUnitAlphabet;
70    }
71    static inline bool classof(const void *) {return false;}
72    uint8_t getCodeUnitBitWidth() { return mCodeUnitBits;}
73   
74private:
75    uint8_t mCodeUnitBits;
76};
77
78//  Some important alphabets are predefined.
79
80const extern UnicodeMappableAlphabet Unicode; // Unicode("Unicode", UCD::UNICODE_MAX, {})
81
82const extern UnicodeMappableAlphabet ASCII;  // ASCII("ASCII", 0x7F, {});
83
84const extern UnicodeMappableAlphabet ISO_Latin1; // ISO_Latin1("ISO_Latin1", 0xFF, {});
85
86const extern CodeUnitAlphabet Byte; // Byte("Byte", 8);
87   
88const extern CodeUnitAlphabet UTF16; // UTF16("UTF16", 16);
89   
90}
91
92#endif // ALPHABET_H
93
94
Note: See TracBrowser for help on using the repository browser.