Ignore:
Timestamp:
Oct 3, 2015, 3:31:16 PM (4 years ago)
Author:
nmedfort
Message:

Added union/diff/intersection functionality to RE_Compiler. Removed toUTF8 pass in favour of using the UCD_Compiler.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/utf8_encoder.cpp

    r4665 r4814  
    66
    77#include <utf8_encoder.h>
    8 #include <re/re_name.h>
    9 #include <re/re_seq.h>
    10 #include <re/re_alt.h>
    11 #include <re/re_rep.h>
    12 #include <cc/cc_namemap.hpp>
    138#include <assert.h>
    149#include <algorithm>
     
    1813
    1914namespace cc {
    20 
    // Rewrites the definitions stored in nameMap and then returns the result of
    // nameMap.process(ast, ByteClass).
    //
    // For every Name whose definition is a CC (dyn_cast_or_null succeeds):
    //   - size() == 1 : the definition is replaced by rangeToUTF8() of the
    //                   CC's front() interval;
    //   - size() >  1 : the definition is replaced by a makeAlt() over
    //                   rangeToUTF8() of each interval, in iteration order.
    // Names whose definition is null or not a CC, and CCs whose size() is 0,
    // are left unchanged.
    // NOTE(review): size() is presumably the number of intervals in the CC — confirm.
    21 RE * UTF8_Encoder::toUTF8(CC_NameMap & nameMap, RE * ast) {
    22     for (Name * name : nameMap) {
    23         if (const CC * cc = dyn_cast_or_null<CC>(name->getDefinition())) {
    24             if (cc->size() == 1) {
    25                 name->setDefinition(rangeToUTF8(cc->front()));
    26             }
    27             else if (cc->size() > 1) {
    // Collect one translated expression per interval, then join them as alternatives.
    28                 std::vector<RE *> alt;
    29                 for (const interval_t & i : *cc) {
    30                     alt.push_back(rangeToUTF8(i));
    31                 }
    32                 name->setDefinition(makeAlt(alt.begin(), alt.end()));
    33             }
    34         }
    35     }
    36     return nameMap.process(ast, ByteClass);
    37 }
    38 
    // Translates one codepoint interval into an RE.
    //
    // If length() of the low bound is smaller than length() of the high bound,
    // the interval is split at m = maxCodePoint(min) into [lo, m] and
    // [m + 1, hi]; each half is translated recursively and the two results are
    // combined with makeAlt(). Otherwise the work is delegated to the four-argument
    // overload, starting at index 1 with `max` as the last index.
    // NOTE(review): length() presumably returns the UTF-8 encoded byte count of a
    // codepoint and maxCodePoint(n) the largest codepoint encodable in n bytes —
    // confirm against their definitions (not visible in this diff).
    39 RE * UTF8_Encoder::rangeToUTF8(const interval_t & item) {
    40     const auto min = length(lo_codepoint(item));
    41     const auto max = length(hi_codepoint(item));
    42     if (min < max) {
    43         const auto m = maxCodePoint(min);
    44         return makeAlt({rangeToUTF8(std::make_pair(lo_codepoint(item), m)), rangeToUTF8(std::make_pair(m + 1, hi_codepoint(item)))});
    45     }
    46     else {
    47         return rangeToUTF8(lo_codepoint(item), hi_codepoint(item), 1, max);
    48     }
    49 }
    50 
    // Recursively builds an RE for the codepoint range [lo, hi], working on
    // position `index` out of `max` positions.
    //
    // lbyte / hbyte are encodingByte(lo, index) and encodingByte(hi, index).
    //   - index == max   : return makeByteRange(lbyte, hbyte).
    //   - hbyte == lbyte : return a sequence of makeByteClass(hbyte) followed by
    //                      the translation of the same range at index + 1.
    //   - otherwise      : split the range (see below) until both bounds are
    //                      aligned, then emit makeByteRange(lbyte, hbyte) followed
    //                      by the translation at index + 1.
    // NOTE(review): encodingByte(cp, i) is presumably the i-th (1-based) byte of
    // the UTF-8 encoding of cp — confirm; its definition is not visible here.
    51 RE * UTF8_Encoder::rangeToUTF8(const codepoint_t lo, const codepoint_t hi, const unsigned index, const unsigned max)
    52 {
    53     const codepoint_t hbyte = encodingByte(hi, index);
    54     const codepoint_t lbyte = encodingByte(lo, index);
    55     if (index == max) {
    56         return makeByteRange(lbyte, hbyte);
    57     }
    58     else if (hbyte == lbyte) {
    59         return makeSeq({makeByteClass(hbyte), rangeToUTF8(lo, hi, index + 1, max)});
    60     }
    61     else {
    // Mask with the low (max - index) * 6 bits set: 6 bits for each position
    // after `index` (presumably the payload width of a UTF-8 continuation byte).
    62         const unsigned suffix_mask = (static_cast<unsigned>(1) << ((max - index) * 6)) - 1;
    // hi's masked bits are not all ones: split off [hi_floor, hi], where hi_floor
    // is hi with the masked bits cleared, from the remainder [lo, hi_floor - 1].
    63         if ((hi & suffix_mask) != suffix_mask) {
    64             const unsigned hi_floor = (~suffix_mask) & hi;
    65             return makeAlt({rangeToUTF8(hi_floor, hi, index, max), rangeToUTF8(lo, hi_floor - 1, index, max)});
    66         }
    // lo's masked bits are not all zero: split off [lo, low_ceil], where low_ceil
    // is lo with the masked bits all set, from the remainder [low_ceil + 1, hi].
    67         else if ((lo & suffix_mask) != 0) {
    68             const unsigned low_ceil = lo | suffix_mask;
    69             return makeAlt({rangeToUTF8(low_ceil + 1, hi, index, max), rangeToUTF8(lo, low_ceil, index, max)});
    70         }
    // Both bounds are aligned (lo's masked bits all zero, hi's all ones): emit the
    // byte range for this position followed by the translation at index + 1.
    71         else {
    72             return makeSeq({makeByteRange(lbyte, hbyte), rangeToUTF8(lo, hi, index + 1, max)});
    73         }
    74     }
    75 }
    7615
    7716bool UTF8_Encoder::isPrefix(const codepoint_t cp) {
     
    16099}
    161100
    // Returns makeCC(lo, hi): a CC built from the two bounds lo and hi.
    // NOTE(review): presumably the bounds are inclusive byte values — confirm
    // against makeCC, which is not visible in this diff.
    162 inline CC * UTF8_Encoder::makeByteRange(const codepoint_t lo, const codepoint_t hi) {
    163     return makeCC(lo, hi);
    164101}
    165 
    // Returns makeCC(cp, cp): a CC whose two bounds are both cp, i.e. the
    // degenerate case of makeByteRange with lo == hi.
    166 inline CC * UTF8_Encoder::makeByteClass(const codepoint_t cp) {
    167     return makeCC(cp, cp);
    168 }
    169 
    170 }
Note: See TracChangeset for help on using the changeset viewer.