Ignore:
Timestamp:
Dec 21, 2017, 3:22:41 PM (15 months ago)
Author:
cameron
Message:

Adding Alphabet to CCs: initial check-in

File:
1 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/cc/alphabet.h

    r5620 r5795  
    1010#include <string>
    1111#include <UCD/unicode_set.h>
     12#include <vector>
    1213
     14namespace cc {
    1315//
    1416// An Alphabet is the universe of characters used to form strings in
     
    1921class Alphabet {
    2022public:
     23    const std::string & getName() const { return mAlphabetName;}
     24protected:
     25    Alphabet(std::string alphabetName) : mAlphabetName(alphabetName) {}
     26private:
     27    std::string mAlphabetName;
     28};
    2129
    22     //  Alphabets may simply be a subset of Unicode characters including all
    23     //  characters up to and including a given maximum Unicode codepoint.
     30class UnicodeMappableAlphabet : public Alphabet {
     31public:
     32    //  Alphabets may be formed from some subset of Unicode characters, together
     33    //  with a mapping to and from Unicode.  The mapping is defined by two parts:
     34    //  a count unicodeCommon, such that every character code in the range
     35    //  0..unicodeCommon - 1 has the same numeric value as the corresponding Unicode
     36    //  codepoint, and a vector giving the Unicode codepoints for the consecutive
     37    //  character codes (if any) from unicodeCommon upward.
    2438   
    25     Alphabet(std::string alphabetName, UCD::codepoint_t maxChar) :
    26         mAlphabetName(alphabetName), mCharSet(UCD::UnicodeSet(0, maxChar)) {}
    27        
    28     const std::string & getName() const { return mAlphabetName;}
    29    
    30     const UCD::UnicodeSet & getSet() const { return mCharSet;}
     39    UnicodeMappableAlphabet(std::string alphabetName,
     40                            unsigned unicodeCommon,
     41                            std::vector <UCD::codepoint_t> aboveCommon);
    3142   
    3243    //  The Unicode codepoint of the nth character (the character whose alphabet code is n).
    33     virtual UCD::codepoint_t toUnicode(const unsigned n) const;
     44    UCD::codepoint_t toUnicode(const unsigned n) const;
    3445   
    3546    //  The ordinal position of the character whose Unicode codepoint value is ucp.
    36     virtual unsigned fromUnicode(const UCD::codepoint_t ucp) const;
     47    unsigned fromUnicode(const UCD::codepoint_t ucp) const;
    3748
    3849protected:
    39     std::string mAlphabetName;
    40     UCD::UnicodeSet mCharSet;
     50    UCD::codepoint_t mCharSet;
     51    UCD::codepoint_t mUnicodeCommon;
     52    std::vector <UCD::codepoint_t> mAboveCommon;
    4153};
    4254
    43 
    44 Alphabet Unicode("Unicode", UCD::UNICODE_MAX);
    45 
    46 Alphabet ASCII("ASCII", 0x7F);
    47 
    48 Alphabet ISO_Latin1("ISO_Latin1", 0xFF);
    49 
    50 
    51 // Extended ASCII alphabets can be defined with a table of 128 entries defining
    52 // the codepoints for codes in the 0x80 to 0xFF range.
    53 //
    54 // ExtendedASCII<uint16_t> uses compact tables of 16-bit entries, while
    55 // ExtendedASCII<uint32_t> uses tables of 32-bit entries, necessary if any
    56 // codepoint is above 0xFFFF.
    57 
    58 template <class uint_t> class ExtendedASCII : public Alphabet {
     55class CodeUnitAlphabet : public Alphabet {
    5956public:
    60     ExtendedASCII(std::string alphabetName, const uint_t (& extendedTable)[128]);
    61     UCD::codepoint_t toUnicode(const unsigned n) const final;
    62     unsigned fromUnicode(const UCD::codepoint_t ucp) const final;
     57    CodeUnitAlphabet(std::string alphabetName, uint8_t codeUnitBits);
     58    uint8_t getCodeUnitBitWidth() { return mCodeUnitBits;}
     59   
    6360private:
    64     const uint_t (& mExtendedCharacterTable)[128];
     61    uint8_t mCodeUnitBits;
    6562};
    6663
     64//  Some important alphabets are predefined.
     65
     66const static UnicodeMappableAlphabet Unicode("Unicode", UCD::UNICODE_MAX, {});
     67
     68const static UnicodeMappableAlphabet ASCII("ASCII", 0x7F, {});
     69
     70const static UnicodeMappableAlphabet ISO_Latin1("ISO_Latin1", 0xFF, {});
     71
     72const static CodeUnitAlphabet Byte("Byte", 8);
     73
     74}
    6775
    6876#endif // ALPHABET_H
Note: See TracChangeset for help on using the changeset viewer.