source: icGREP/icgrep-devel/icgrep/cc/alphabet.h @ 5620

Last change on this file since 5620 was 5620, checked in by nmedfort, 20 months ago

Bug fixes for multigrep mode. Optional PabloKernel branch hit counter added. Minor optimizations.

File size: 2.1 KB
Line 
1/*
2 *  Copyright (c) 2017 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#ifndef ALPHABET_H
8#define ALPHABET_H
9
10#include <string>
11#include <UCD/unicode_set.h>
12
13//
14// An Alphabet is the universe of characters used to form strings in
15// a given language, together with a mapping of those characters to
16// numerical character codes.
17//
18
19class Alphabet {
20public:
21
22    //  Alphabets may simply be a subset of Unicode characters including all
23    //  characters up to and including a given maximum Unicode codepoint.
24   
25    Alphabet(std::string alphabetName, UCD::codepoint_t maxChar) :
26        mAlphabetName(alphabetName), mCharSet(UCD::UnicodeSet(0, maxChar)) {}
27       
28    const std::string & getName() const { return mAlphabetName;}
29   
30    const UCD::UnicodeSet & getSet() const { return mCharSet;}
31   
32    //  The Unicode codepoint of the nth character (the character whose alphabet code is n).
33    virtual UCD::codepoint_t toUnicode(const unsigned n) const;
34   
35    //  The ordinal position of the character whose Unicode codepoint value is ucp.
36    virtual unsigned fromUnicode(const UCD::codepoint_t ucp) const;
37
38protected:
39    std::string mAlphabetName;
40    UCD::UnicodeSet mCharSet;
41};
42
43
44Alphabet Unicode("Unicode", UCD::UNICODE_MAX);
45
46Alphabet ASCII("ASCII", 0x7F);
47
48Alphabet ISO_Latin1("ISO_Latin1", 0xFF);
49
50
51// Extended ASCII alphabets can be defined with a table of 128 entries defining
52// the codepoints for codes in the 0x80 to 0xFF range.
53//
54// ExtendedASCII<uint16_t> uses compact tables of 16-bit entries, while
55// ExtendedASCII<uint32_t> uses tables of 32-bit entries, necessary if any
56// codepoint is above 0xFFFF.
57
58template <class uint_t> class ExtendedASCII : public Alphabet {
59public:
60    ExtendedASCII(std::string alphabetName, const uint_t (& extendedTable)[128]);
61    UCD::codepoint_t toUnicode(const unsigned n) const final;
62    unsigned fromUnicode(const UCD::codepoint_t ucp) const final;
63private:
64    const uint_t (& mExtendedCharacterTable)[128];
65};
66
67
68#endif // ALPHABET_H
69
70
Note: See TracBrowser for help on using the repository browser.