source: icGREP/icgrep-devel/icgrep/cc/alphabet.h @ 5796

Last change on this file since 5796 was 5796, checked in by cameron, 17 months ago

Alphabet fixes

File size: 2.3 KB
Line 
1/*
2 *  Copyright (c) 2017 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#ifndef ALPHABET_H
8#define ALPHABET_H
9
#include <cstdint>
#include <string>
#include <UCD/unicode_set.h>
#include <utility>
#include <vector>
13
14namespace cc {
15//
16// An Alphabet is the universe of characters used to form strings in
17// a given language, together with a mapping of those characters to
18// numerical character codes.
19//
20
//  Base class for all alphabets: stores the alphabet's name and exposes it
//  read-only.  The constructor is protected, so only derived classes can
//  create an Alphabet.
class Alphabet {
public:
    //  Returns a reference to the stored name; the reference refers to a member
    //  of this object.
    const std::string & getName() const { return mAlphabetName; }
protected:
    //  alphabetName is taken by value and moved into the member, so callers
    //  passing a temporary incur no extra string copy.  Marked explicit so a
    //  std::string is never implicitly converted to an Alphabet.
    explicit Alphabet(std::string alphabetName) : mAlphabetName(std::move(alphabetName)) {}
private:
    std::string mAlphabetName;
};
29
30class UnicodeMappableAlphabet : public Alphabet {
31public:
32    //  Alphabets may be formed by some subset of Unicode characters, together
33    //  with a mapping to and from Unicode.  The mapping is defined in terms of the
34    //  number of character codes unicodeCommon such that all character codes in the range
35    //  0..unicodeCommon - 1 map to the same numeric value as the corresponding Unicode
36    //  codepoint, together with a vector defining the Unicode codepoints for consecutive
37    //  character codes (if any) above unicodeCommon - 1.
38   
39    UnicodeMappableAlphabet(std::string alphabetName,
40                            unsigned unicodeCommon,
41                            std::vector <UCD::codepoint_t> aboveCommon);
42   
43    //  The Unicode codepoint of the nth character (the character whose alphabet code is n).
44    UCD::codepoint_t toUnicode(const unsigned n) const;
45   
46    //  The ordinal position of the character whose Unicode codepoint value is ucp.
47    unsigned fromUnicode(const UCD::codepoint_t ucp) const;
48
49protected:
50    UCD::codepoint_t mCharSet;
51    UCD::codepoint_t mUnicodeCommon;
52    std::vector <UCD::codepoint_t> mAboveCommon;
53};
54
55class CodeUnitAlphabet : public Alphabet {
56public:
57    CodeUnitAlphabet(std::string alphabetName, uint8_t codeUnitBits);
58    uint8_t getCodeUnitBitWidth() { return mCodeUnitBits;}
59   
60private:
61    uint8_t mCodeUnitBits;
62};
63
64//  Some important alphabets are predefined.
65
66const extern UnicodeMappableAlphabet Unicode; 
67
68const extern UnicodeMappableAlphabet ASCII;
69
70const extern UnicodeMappableAlphabet ISO_Latin1;
71
72const extern CodeUnitAlphabet Byte;
73
74}
75
76#endif // ALPHABET_H
77
78
Note: See TracBrowser for help on using the repository browser.