source: icGREP/icgrep-devel/icgrep/cc/alphabet.h @ 5816

Last change on this file since 5816 was 5816, checked in by cameron, 16 months ago

Supporting multiple alphabets in RE compilation - initial check-in

File size: 3.3 KB
RevLine 
[5279]1/*
2 *  Copyright (c) 2017 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#ifndef ALPHABET_H
8#define ALPHABET_H
9
10#include <string>
11#include <UCD/unicode_set.h>
[5795]12#include <vector>
[5279]13
[5795]14namespace cc {
[5279]15//
16// An Alphabet is the universe of characters used to form strings in
17// a given language, together with a mapping of those characters to
18// numerical character codes.
19//
20
// Abstract base class of every alphabet.  It stores the alphabet's name and a
// ClassTypeId tag; the derived classes' classof() predicates test that tag to
// identify the concrete alphabet kind.
class Alphabet {
public:
    // Tag naming the concrete kind of alphabet an object is.
    enum class ClassTypeId : unsigned {UnicodeMappableAlphabet, CodeUnitAlphabet, MultiplexedAlphabet};

    // Virtual so that destroying a derived alphabet through an Alphabet
    // pointer runs the derived destructor instead of being undefined behaviour.
    virtual ~Alphabet() = default;

    // The name given to this alphabet at construction.
    const std::string & getName() const { return mAlphabetName;}

    // The size of the alphabet, as defined by each concrete subclass.
    // The top-level const on the return type has no effect; it is kept so the
    // declaration matches the spelling used by the overriders.
    virtual const unsigned getSize() const = 0;

    // The tag supplied by the constructing subclass.
    inline ClassTypeId getClassTypeId() const {
        return mClassTypeId;
    }

protected:
    // Only subclasses may construct an Alphabet; each passes its own tag k.
    Alphabet(std::string name, ClassTypeId k) : mAlphabetName(name), mClassTypeId(k) {}
private:
    const std::string mAlphabetName;
    const ClassTypeId mClassTypeId;
};
[5279]36
[5795]37class UnicodeMappableAlphabet : public Alphabet {
38public:
39    //  Alphabets may be formed by some subset of Unicode characters, together
[5797]40    //  with a mapping to and from Unicode.  The mapping is defined in terms of unicodeCommon:
41    //  the number of character codes (if any) such that all character codes in the range
[5795]42    //  0..unicodeCommon - 1 map to the same numeric value as the corresponding Unicode
43    //  codepoint, together with a vector defining the Unicode codepoints for consecutive
44    //  character codes (if any) above unicodeCommon - 1.
[5279]45   
[5795]46    UnicodeMappableAlphabet(std::string alphabetName,
47                            unsigned unicodeCommon,
48                            std::vector <UCD::codepoint_t> aboveCommon);
[5279]49   
[5800]50    static inline bool classof(const Alphabet * a) {
51        return a->getClassTypeId() == ClassTypeId::UnicodeMappableAlphabet;
52    }
53    static inline bool classof(const void *) {return false;}
[5279]54    //  The Unicode codepoint of the nth character (the character whose alphabet code is n).
[5795]55    UCD::codepoint_t toUnicode(const unsigned n) const;
[5279]56   
57    //  The ordinal position of the character whose Unicode codepoint value is ucp.
[5795]58    unsigned fromUnicode(const UCD::codepoint_t ucp) const;
[5279]59
[5816]60    const unsigned getSize() const override {return mUnicodeCommon + mAboveCommon.size();}
61
[5281]62protected:
[5795]63    UCD::codepoint_t mCharSet;
64    UCD::codepoint_t mUnicodeCommon;
65    std::vector <UCD::codepoint_t> mAboveCommon;
[5279]66};
67
[5795]68class CodeUnitAlphabet : public Alphabet {
69public:
70    CodeUnitAlphabet(std::string alphabetName, uint8_t codeUnitBits);
[5800]71    static inline bool classof(const Alphabet * a) {
72        return a->getClassTypeId() == ClassTypeId::CodeUnitAlphabet;
73    }
74    static inline bool classof(const void *) {return false;}
[5795]75    uint8_t getCodeUnitBitWidth() { return mCodeUnitBits;}
[5816]76    const unsigned getSize() const override {return 1<<mCodeUnitBits;}
77
[5795]78private:
79    uint8_t mCodeUnitBits;
80};
[5279]81
[5795]82//  Some important alphabets are predefined.
[5279]83
[5797]84const extern UnicodeMappableAlphabet Unicode; // Unicode("Unicode", UCD::UNICODE_MAX, {})
[5279]85
[5797]86const extern UnicodeMappableAlphabet ASCII;  // ASCII("ASCII", 0x7F, {});
[5279]87
[5797]88const extern UnicodeMappableAlphabet ISO_Latin1; // ISO_Latin1("ISO_Latin1", 0xFF, {});
[5279]89
[5797]90const extern CodeUnitAlphabet Byte; // Byte("Byte", 8);
91   
92const extern CodeUnitAlphabet UTF16; // UTF16("UTF16", 16);
93   
[5795]94}
[5279]95
96#endif // ALPHABET_H
97
98
Note: See TracBrowser for help on using the repository browser.