source: icGREP/icgrep-devel/icgrep/cc/alphabet.h @ 5789

Last change on this file since 5789 was 5620, checked in by nmedfort, 2 years ago

Bug fixes for multigrep mode. Optional PabloKernel branch hit counter added. Minor optimizations.

File size: 2.1 KB
Line 
1/*
2 *  Copyright (c) 2017 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#ifndef ALPHABET_H
8#define ALPHABET_H
9
10#include <string>
11#include <UCD/unicode_set.h>
12
13//
14// An Alphabet is the universe of characters used to form strings in
15// a given language, together with a mapping of those characters to
16// numerical character codes.
17//
18
19class Alphabet {
20public:
21
22    //  Alphabets may simply be a subset of Unicode characters including all
23    //  characters up to and including a given maximum Unicode codepoint.
24   
25    Alphabet(std::string alphabetName, UCD::codepoint_t maxChar) :
26        mAlphabetName(alphabetName), mCharSet(UCD::UnicodeSet(0, maxChar)) {}
27       
28    const std::string & getName() const { return mAlphabetName;}
29   
30    const UCD::UnicodeSet & getSet() const { return mCharSet;}
31   
32    //  The Unicode codepoint of the nth character (the character whose alphabet code is n).
33    virtual UCD::codepoint_t toUnicode(const unsigned n) const;
34   
35    //  The ordinal position of the character whose Unicode codepoint value is ucp.
36    virtual unsigned fromUnicode(const UCD::codepoint_t ucp) const;
37
38protected:
39    std::string mAlphabetName;
40    UCD::UnicodeSet mCharSet;
41};
42
43
44Alphabet Unicode("Unicode", UCD::UNICODE_MAX);
45
46Alphabet ASCII("ASCII", 0x7F);
47
48Alphabet ISO_Latin1("ISO_Latin1", 0xFF);
49
50
51// Extended ASCII alphabets can be defined with a table of 128 entries defining
52// the codepoints for codes in the 0x80 to 0xFF range.
53//
54// ExtendedASCII<uint16_t> uses compact tables of 16-bit entries, while
55// ExtendedASCII<uint32_t> uses tables of 32-bit entries, necessary if any
56// codepoint is above 0xFFFF.
57
58template <class uint_t> class ExtendedASCII : public Alphabet {
59public:
60    ExtendedASCII(std::string alphabetName, const uint_t (& extendedTable)[128]);
61    UCD::codepoint_t toUnicode(const unsigned n) const final;
62    unsigned fromUnicode(const UCD::codepoint_t ucp) const final;
63private:
64    const uint_t (& mExtendedCharacterTable)[128];
65};
66
67
68#endif // ALPHABET_H
69
70
Note: See TracBrowser for help on using the repository browser.