Ignore:
Timestamp:
Dec 21, 2017, 3:22:41 PM (17 months ago)
Author:
cameron
Message:

Adding Alphabet to CCs: initial check-in

Location:
icGREP/icgrep-devel/icgrep/cc
Files:
4 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/cc/alphabet.cpp

    r5620 r5795  
    77
    88#include "alphabet.h"
     9#include <llvm/Support/ErrorHandling.h>
    910
     11namespace cc {
     12   
     13UnicodeMappableAlphabet::UnicodeMappableAlphabet(std::string alphabetName,
     14                                                 unsigned unicodeCommon,
     15                                                 std::vector <UCD::codepoint_t> aboveCommon) :
     16    Alphabet(alphabetName), mUnicodeCommon(unicodeCommon), mAboveCommon(aboveCommon) {}
    1017
    11 // Default implementation for simple Unicode subsets.  The codepoint value
    12 // of the nth character is just the given value n, if it is in range.
    13 
    14 UCD::codepoint_t Alphabet::toUnicode(const unsigned n) const {
     18UCD::codepoint_t UnicodeMappableAlphabet::toUnicode(const unsigned n) const {
    1519    UCD::codepoint_t cp = n;
    16     if (mCharSet.contains(cp)) return cp;
    17     throw std::runtime_error("toUnicode: n too large.");
    18 }   
     20    if (n < mUnicodeCommon) return cp;
     21    assert(n < mUnicodeCommon + mAboveCommon.size());
     22    return mAboveCommon[n - mUnicodeCommon];
     23}
    1924 
    20 // Default implementation for simple Unicode subsets.  The ord value
    21 // of a Unicode codepoint is just the given codepoint, if it is in range.
    22 
    23 unsigned Alphabet::fromUnicode(const UCD::codepoint_t codepoint) const {
    24     if (mCharSet.contains(codepoint)) return codepoint;
    25     throw std::runtime_error("fromUnicode: codepoint not found in alphabet.");
     25unsigned UnicodeMappableAlphabet::fromUnicode(const UCD::codepoint_t codepoint) const {
     26    unsigned n = codepoint;
     27    if (n < mUnicodeCommon) return n;
     28    for (unsigned i = 0; i < mAboveCommon.size(); i++) {
     29        if (mAboveCommon[i] == codepoint) return mUnicodeCommon + i;
     30    }
     31    llvm::report_fatal_error("fromUnicode: codepoint not found in alphabet.");
    2632}
    2733
     34CodeUnitAlphabet::CodeUnitAlphabet(std::string alphabetName, uint8_t bits) :
     35    Alphabet(alphabetName), mCodeUnitBits(bits) {}
    2836
    29 template <class uint_t> ExtendedASCII<uint_t>::ExtendedASCII(std::string alphabetName, const uint_t (& extendedTable)[128]) {
    30     mAlphabetName = alphabetName;
    31     mExtendedCharacterTable = extendedTable;
    32     mCharSet = UCD::UnicodeSet(0, 127);
    33     for (unsigned i = 0; i < 128; i++) {
    34         mCharSet.insert(extendedTable[i]);
    35     }
    36 }   
    37 
    38 template <class uint_t> UCD::codepoint_t ExtendedASCII<uint_t>::toUnicode(const unsigned n) const {
    39     //  The first 128 characters are just ASCII.
    40     if (n < 128) return n;
    41     if (n < 256) return mExtendedCharacterTable[n-128];
    42     throw std::runtime_error("toUnicode: n too large.");
    43 }   
    44 
    45 template <class uint_t> unsigned ExtendedASCII<uint_t>::fromUnicode(const UCD::codepoint_t codepoint) const {
    46     if (codepoint < 128) return codepoint;
    47     for (unsigned i = 0; i < 128; i++) {
    48         if (mExtendedCharacterTable[i] == codepoint) return i + 128;
    49     }
    50     throw std::runtime_error("fromUnicode: codepoint not found in alphabet.");
    5137}
    52 
  • icGREP/icgrep-devel/icgrep/cc/alphabet.h

    r5620 r5795  
    1010#include <string>
    1111#include <UCD/unicode_set.h>
     12#include <vector>
    1213
     14namespace cc {
    1315//
    1416// An Alphabet is the universe of characters used to form strings in
     
    1921class Alphabet {
    2022public:
     23    const std::string & getName() const { return mAlphabetName;}
     24protected:
     25    Alphabet(std::string alphabetName) : mAlphabetName(alphabetName) {}
     26private:
     27    std::string mAlphabetName;
     28};
    2129
    22     //  Alphabets may simply be a subset of Unicode characters including all
    23     //  characters up to and including a given maximum Unicode codepoint.
     30class UnicodeMappableAlphabet : public Alphabet {
     31public:
     32    //  Alphabets may be formed by some subset of Unicode characters, together
     33    //  with a mapping to and from Unicode.  The mapping is defined in terms of the
     34    //  number of character codes unicodeCommon such that all character codes in the range
     35    //  0..unicodeCommon - 1 map to the same numeric value as the corresponding Unicode
     36    //  codepoint, together with a vector defining the Unicode codepoints for consecutive
     37    //  character codes (if any) above unicodeCommon - 1.
    2438   
    25     Alphabet(std::string alphabetName, UCD::codepoint_t maxChar) :
    26         mAlphabetName(alphabetName), mCharSet(UCD::UnicodeSet(0, maxChar)) {}
    27        
    28     const std::string & getName() const { return mAlphabetName;}
    29    
    30     const UCD::UnicodeSet & getSet() const { return mCharSet;}
     39    UnicodeMappableAlphabet(std::string alphabetName,
     40                            unsigned unicodeCommon,
     41                            std::vector <UCD::codepoint_t> aboveCommon);
    3142   
    3243    //  The Unicode codepoint of the nth character (the character whose alphabet code is n).
    33     virtual UCD::codepoint_t toUnicode(const unsigned n) const;
     44    UCD::codepoint_t toUnicode(const unsigned n) const;
    3445   
    3546    //  The ordinal position of the character whose Unicode codepoint value is ucp.
    36     virtual unsigned fromUnicode(const UCD::codepoint_t ucp) const;
     47    unsigned fromUnicode(const UCD::codepoint_t ucp) const;
    3748
    3849protected:
    39     std::string mAlphabetName;
    40     UCD::UnicodeSet mCharSet;
     50    UCD::codepoint_t mCharSet;
     51    UCD::codepoint_t mUnicodeCommon;
     52    std::vector <UCD::codepoint_t> mAboveCommon;
    4153};
    4254
    43 
    44 Alphabet Unicode("Unicode", UCD::UNICODE_MAX);
    45 
    46 Alphabet ASCII("ASCII", 0x7F);
    47 
    48 Alphabet ISO_Latin1("ISO_Latin1", 0xFF);
    49 
    50 
    51 // Extended ASCII alphabets can be defined with a table of 128 entries defining
    52 // the codepoints for codes in the 0x80 to 0xFF range.
    53 //
    54 // ExtendedASCII<uint16_t> uses compact tables of 16-bit entries, while
    55 // ExtendedASCII<uint32_t> uses tables of 32-bit entries, necessary if any
    56 // codepoint is above 0xFFFF.
    57 
    58 template <class uint_t> class ExtendedASCII : public Alphabet {
     55class CodeUnitAlphabet : public Alphabet {
    5956public:
    60     ExtendedASCII(std::string alphabetName, const uint_t (& extendedTable)[128]);
    61     UCD::codepoint_t toUnicode(const unsigned n) const final;
    62     unsigned fromUnicode(const UCD::codepoint_t ucp) const final;
     57    CodeUnitAlphabet(std::string alphabetName, uint8_t codeUnitBits);
     58    uint8_t getCodeUnitBitWidth() { return mCodeUnitBits;}
     59   
    6360private:
    64     const uint_t (& mExtendedCharacterTable)[128];
     61    uint8_t mCodeUnitBits;
    6562};
    6663
     64//  Some important alphabets are predefined.
     65
     66const static UnicodeMappableAlphabet Unicode("Unicode", UCD::UNICODE_MAX, {});
     67
     68const static UnicodeMappableAlphabet ASCII("ASCII", 0x7F, {});
     69
     70const static UnicodeMappableAlphabet ISO_Latin1("ISO_Latin1", 0xFF, {});
     71
     72const static CodeUnitAlphabet Byte("Byte", 8);
     73
     74}
    6775
    6876#endif // ALPHABET_H
  • icGREP/icgrep-devel/icgrep/cc/multiplex_CCs.cpp

    r5748 r5795  
    88#include <re/re_cc.h>
    99#include "boost/dynamic_bitset.hpp"
     10#include <cc/multiplex_CCs.h>
     11
     12namespace cc {
    1013
    1114//
     
    104107    }
    105108}
     109
     110
     111
     112MultiplexedAlphabet::MultiplexedAlphabet(std::string alphabetName, const std::vector<const re::CC *> CCs)
     113    : Alphabet(alphabetName) {
     114        cc::doMultiplexCCs(CCs, mExclusiveSetIDs, mMultiplexedCCs);
     115}
     116
     117std::vector<std::vector<unsigned>> MultiplexedAlphabet::getExclusiveSetIDs() {
     118    return mExclusiveSetIDs;
     119}
     120
     121std::vector<re::CC *> MultiplexedAlphabet::getMultiplexedCCs() {
     122    return mMultiplexedCCs;
     123}
     124}
     125
  • icGREP/icgrep-devel/icgrep/cc/multiplex_CCs.h

    r5748 r5795  
    77
    88#include <vector>
     9#include <cc/alphabet.h>
    910
    1011namespace re { class CC; }
    1112
     13namespace cc {
    1214
    13 void doMultiplexCCs(const std::vector<const re::CC *> & CCs,
    14                     std::vector<std::vector<unsigned>> & exclusiveSetIDs,
    15                     std::vector<re::CC *> & multiplexedCCs);
     15class MultiplexedAlphabet : public Alphabet {
     16public:
     17    MultiplexedAlphabet(std::string alphabetName, const std::vector<const re::CC *> CCs);
     18   
     19    std::vector<std::vector<unsigned>> getExclusiveSetIDs();
     20   
     21    std::vector<re::CC *> getMultiplexedCCs();
     22private:
     23    std::vector<std::vector<unsigned>> mExclusiveSetIDs;
     24    std::vector<re::CC *> mMultiplexedCCs;
     25};
     26}
     27
    1628
    1729#endif
Note: See TracChangeset for help on using the changeset viewer.