source: icGREP/icgrep-devel/icgrep/cc/alphabet.h

Last change on this file was 6297, checked in by cameron, 3 weeks ago

Merge branch 'master' of https://cs-git-research.cs.surrey.sfu.ca/cameron/parabix-devel

File size: 3.8 KB
Line 
1/*
2 *  Copyright (c) 2017 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#ifndef ALPHABET_H
8#define ALPHABET_H
9
10#include <string>
11#include <UCD/unicode_set.h>
12#include <vector>
13
14namespace cc {
15   
// The streams of a set of parallel bit streams may be numbered under either
// of two conventions: LittleEndian (right-to-left) or BigEndian (left-to-right).
enum class BitNumbering { LittleEndian, BigEndian };

// Returns the name suffix for a numbering convention:
// "-LE" for LittleEndian, "-BE" for anything else.
inline std::string numberingSuffix(BitNumbering numbering) {
    if (numbering == BitNumbering::LittleEndian) {
        return "-LE";
    }
    return "-BE";
}
23
24//
25// An Alphabet is the universe of characters used to form strings in
26// a given language, together with a mapping of those characters to
27// numerical character codes.
28//
29
// Abstract base class of all alphabets.  It stores the alphabet's name and a
// ClassTypeId tag that records which concrete subclass an object belongs to;
// the subclasses' classof() functions test that tag.
class Alphabet {
public:
    // Tag identifying the concrete subclass of an Alphabet object.
    enum class ClassTypeId : unsigned {UnicodeMappableAlphabet, CodeUnitAlphabet, MultiplexedAlphabet};

    virtual ~Alphabet() = default;

    // The name supplied when the alphabet was constructed.
    const std::string & getName() const { return mAlphabetName; }

    // The size of the alphabet; each subclass supplies the computation.
    // The top-level const on the return type is redundant, but it is kept so
    // that the return type declared by existing overriders stays identical.
    virtual const unsigned getSize() const = 0;

    // The tag passed to the constructor by the concrete subclass.
    ClassTypeId getClassTypeId() const { return mClassTypeId; }

protected:
    // Only subclasses may construct an Alphabet.
    //   name - the alphabet's name; taken by value and moved into place, so
    //          lvalues, temporaries and const rvalues are all accepted.  (The
    //          former `const std::string &&` parameter rejected lvalues, and
    //          its std::move was a silent copy because the source was const.)
    //   k    - the tag of the subclass being constructed.
    Alphabet(std::string name, ClassTypeId k)
    : mAlphabetName(std::move(name))
    , mClassTypeId(k) {}

private:
    const std::string mAlphabetName;
    const ClassTypeId mClassTypeId;
};
45
46class UnicodeMappableAlphabet final : public Alphabet {
47public:
48    //  Alphabets may be formed by some subset of Unicode characters, together
49    //  with a mapping to and from Unicode.  The mapping is defined in terms of unicodeCommon:
50    //  the number of character codes (if any) such that all character codes in the range
51    //  0..unicodeCommon - 1 map to the same numeric value as the corresponding Unicode
52    //  codepoint, together with a vector defining the Unicode codepoints for consecutive
53    //  character codes (if any) above unicodeCommon - 1.
54   
55    UnicodeMappableAlphabet(const std::string alphabetName,
56                            unsigned unicodeCommon,
57                            std::vector <UCD::codepoint_t> aboveCommon);
58   
59    static inline bool classof(const Alphabet * a) {
60        return a->getClassTypeId() == ClassTypeId::UnicodeMappableAlphabet;
61    }
62    static inline bool classof(const void *) {return false;}
63    //  The Unicode codepoint of the nth character (the character whose alphabet code is n).
64    UCD::codepoint_t toUnicode(const unsigned n) const;
65   
66    //  The ordinal position of the character whose Unicode codepoint value is ucp.
67    unsigned fromUnicode(const UCD::codepoint_t ucp) const;
68
69    const unsigned getSize() const override {return mUnicodeCommon + mAboveCommon.size();}
70
71protected:
72    const UCD::codepoint_t mUnicodeCommon;
73    const std::vector<UCD::codepoint_t> mAboveCommon;
74};
75
76class CodeUnitAlphabet final : public Alphabet {
77public:
78    CodeUnitAlphabet(const std::string alphabetName, uint8_t codeUnitBits);
79    static inline bool classof(const Alphabet * a) {
80        return a->getClassTypeId() == ClassTypeId::CodeUnitAlphabet;
81    }
82    static inline bool classof(const void *) {return false;}
83    uint8_t getCodeUnitBitWidth() const { return mCodeUnitBits;}
84    const unsigned getSize() const override {return 1<<mCodeUnitBits;}
85
86private:
87    const uint8_t mCodeUnitBits;
88};
89
90//  Some important alphabets are predefined.
91
92const extern UnicodeMappableAlphabet Unicode; // Unicode("Unicode", UCD::UNICODE_MAX, {})
93
94const extern UnicodeMappableAlphabet ASCII;  // ASCII("ASCII", 0x7F, {});
95
96const extern UnicodeMappableAlphabet ISO_Latin1; // ISO_Latin1("ISO_Latin1", 0xFF, {});
97
98const extern CodeUnitAlphabet Byte; // Byte("Byte", 8);
99   
100const extern CodeUnitAlphabet UTF8; // UTF8("UTF8", 8);
101
102const extern CodeUnitAlphabet UTF16; // UTF16("UTF16", 16);
103   
104}
105
106#endif // ALPHABET_H
107
108
Note: See TracBrowser for help on using the repository browser.