Ignore:
Timestamp:
Jun 9, 2016, 3:34:07 PM (3 years ago)
Author:
xuedongx
Message:

Add support for the UTF-16 representation of Unicode

Location:
icGREP/icgrep-devel/icgrep/UCD
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/UCD/ucd_compiler.cpp

    r4860 r5045  
    44#include <re/re_name.h>
    55#include <utf8_encoder.h>
     6#include <utf16_encoder.h>
     7#include <iostream>
    68
    79using namespace cc;
     
    1012
    1113namespace UCD {
     14
     15/** ------------------------------------------------------------------------------------------------------------- *
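     16 * @brief Encoder dispatch helpers: each forwards to UTF16_Encoder when the UTF_16 flag is true, otherwise to UTF8_Encoder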
     17 ** ------------------------------------------------------------------------------------------------------------- */
     18inline codepoint_t encodingByte(const codepoint_t cp, const unsigned n, bool UTF_16) {
     19        return UTF_16 ? UTF16_Encoder::encodingByte(cp, n) : UTF8_Encoder::encodingByte(cp, n);
     20}
     21
     22inline unsigned length(const codepoint_t cp, bool UTF_16) {
     23        return UTF_16 ? UTF16_Encoder::length(cp) : UTF8_Encoder::length(cp);
     24}
     25
     26inline codepoint_t maxCodePoint(const unsigned length, bool UTF_16) {
     27        return UTF_16 ?  UTF16_Encoder::maxCodePoint(length) : UTF8_Encoder::maxCodePoint(length);
     28}
     29
     30inline bool isLowCodePointAfterByte(const codepoint_t cp, const unsigned n, bool UTF_16) {
     31        return UTF_16 ? UTF16_Encoder::isLowCodePointAfterByte(cp, n) : UTF8_Encoder::isLowCodePointAfterByte(cp, n);
     32}
     33inline bool isHighCodePointAfterByte(const codepoint_t cp, const unsigned n, bool UTF_16) {
     34        return UTF_16 ? UTF16_Encoder::isHighCodePointAfterByte(cp, n) : UTF8_Encoder::isHighCodePointAfterByte(cp, n);
     35}
     36inline codepoint_t minCodePointWithCommonBytes(const re::codepoint_t cp, const unsigned n, bool UTF_16) {
     37        return UTF_16 ? UTF16_Encoder::minCodePointWithCommonBytes(cp, n) : UTF8_Encoder::minCodePointWithCommonBytes(cp, n);
     38}
     39inline codepoint_t maxCodePointWithCommonBytes(const re::codepoint_t cp, const unsigned n, bool UTF_16) {
     40        return UTF_16 ? UTF16_Encoder::maxCodePointWithCommonBytes(cp, n) : UTF8_Encoder::maxCodePointWithCommonBytes(cp, n);
     41}
    1242
    1343const UCDCompiler::RangeList UCDCompiler::defaultIfHierachy = {
     
    213243 ** ------------------------------------------------------------------------------------------------------------- */
    214244PabloAST * UCDCompiler::sequenceGenerator(const RangeList && ranges, const unsigned byte_no, PabloBuilder & builder, PabloAST * target, PabloAST * prefix) {
     245        bool isUTF_16 = mCharacterClassCompiler.isUTF_16();
    215246
    216247    if (LLVM_LIKELY(ranges.size() > 0)) {
     
    219250        std::tie(lo, hi) = ranges[0];
    220251
    221         const auto min = UTF8_Encoder::length(lo_codepoint(ranges.front()));
    222         const auto max = UTF8_Encoder::length(hi_codepoint(ranges.back()));
     252        const auto min = length(lo_codepoint(ranges.front()), isUTF_16);
     253        const auto max = length(hi_codepoint(ranges.back()), isUTF_16);
    223254
    224255        if (min != max) {
    225             const auto mid = UTF8_Encoder::maxCodePoint(min);
     256            const auto mid = maxCodePoint(min, isUTF_16);
    226257            target = sequenceGenerator(std::move(rangeIntersect(ranges, lo, mid)), byte_no, builder, target, prefix);
    227258            target = sequenceGenerator(std::move(rangeIntersect(ranges, mid + 1, hi)), byte_no, builder, target, prefix);
     
    229260            // We have a single byte remaining to match for all code points in this CC.
    230261            // Use the byte class compiler to generate matches for these codepoints.
    231             PabloAST * var = mCharacterClassCompiler.compileCC(makeCC(byteDefinitions(ranges, byte_no)), builder);
     262            PabloAST * var = mCharacterClassCompiler.compileCC(makeCC(byteDefinitions(ranges, byte_no, isUTF_16)), builder);
    232263            if (byte_no > 1) {
    233264                var = builder.createAnd(var, builder.createAdvance(makePrefix(lo, byte_no, builder, prefix), 1));
     
    238269                codepoint_t lo, hi;
    239270                std::tie(lo, hi) = rg;
    240                 const auto lo_byte = UTF8_Encoder::encodingByte(lo, byte_no);
    241                 const auto hi_byte = UTF8_Encoder::encodingByte(hi, byte_no);
    242                 if (lo_byte != hi_byte) {
    243                     if (!UTF8_Encoder::isLowCodePointAfterByte(lo, byte_no)) {
    244                         const codepoint_t mid = lo | ((1 << (6 * (min - byte_no))) - 1);
     271                const auto lo_byte = encodingByte(lo, byte_no, isUTF_16);
     272                const auto hi_byte = encodingByte(hi, byte_no, isUTF_16);
     273                //std::cout << "lo_byte: " << std::hex << lo_byte << " hi_byte: " << std::hex << hi_byte << std::endl;
     274                                if (lo_byte != hi_byte) {
     275                                        unsigned num = isUTF_16 ? 10 : 6;
     276                    if (!isLowCodePointAfterByte(lo, byte_no, isUTF_16)) {
     277                        const codepoint_t mid = lo | ((1 << (num * (min - byte_no))) - 1);
    245278                        target = sequenceGenerator(lo, mid, byte_no, builder, target, prefix);
    246279                        target = sequenceGenerator(mid + 1, hi, byte_no, builder, target, prefix);
    247                     } else if (!UTF8_Encoder::isHighCodePointAfterByte(hi, byte_no)) {
    248                         const codepoint_t mid = hi & ~((1 << (6 * (min - byte_no))) - 1);
     280                    } else if (!isHighCodePointAfterByte(hi, byte_no, isUTF_16)) {
     281                        const codepoint_t mid = hi & ~((1 << (num * (min - byte_no))) - 1);
    249282                        target = sequenceGenerator(lo, mid - 1, byte_no, builder, target, prefix);
    250283                        target = sequenceGenerator(mid, hi, byte_no, builder, target, prefix);
     
    254287                            var = builder.createAnd(builder.createAdvance(prefix, 1), var);
    255288                        }
    256                         for (unsigned i = byte_no; i != UTF8_Encoder::length(lo); ++i) {
     289                        for (unsigned i = byte_no; i != length(lo, isUTF_16); ++i) {
    257290                            var = builder.createAnd(mSuffixVar, builder.createAdvance(var, 1));
    258291                        }
     
    265298                        var = builder.createAnd(builder.createAdvance(prefix ? prefix : var, 1), var);
    266299                    }
    267                     if (byte_no < UTF8_Encoder::length(lo)) {
     300                    if (byte_no < length(lo, isUTF_16)) {
    268301                        target = sequenceGenerator(lo, hi, byte_no + 1, builder, target, var);
    269302                    }
     
    294327PabloAST * UCDCompiler::ifTestCompiler(const codepoint_t lo, const codepoint_t hi, const unsigned byte_no, PabloBuilder & builder, PabloAST * target) {
    295328
    296     codepoint_t lo_byte = UTF8_Encoder::encodingByte(lo, byte_no);
    297     codepoint_t hi_byte = UTF8_Encoder::encodingByte(hi, byte_no);
    298     const bool at_lo_boundary = (lo == 0 || UTF8_Encoder::encodingByte(lo - 1, byte_no) != lo_byte);
    299     const bool at_hi_boundary = (hi == 0x10FFFF || UTF8_Encoder::encodingByte(hi + 1, byte_no) != hi_byte);
     329        bool isUTF_16 = mCharacterClassCompiler.isUTF_16();
     330    codepoint_t lo_byte = encodingByte(lo, byte_no, isUTF_16);
     331    codepoint_t hi_byte = encodingByte(hi, byte_no, isUTF_16);
     332    const bool at_lo_boundary = (lo == 0 || encodingByte(lo - 1, byte_no, isUTF_16) != lo_byte);
     333    const bool at_hi_boundary = (hi == 0x10FFFF || encodingByte(hi + 1, byte_no, isUTF_16) != hi_byte);
    300334
    301335    if (at_lo_boundary && at_hi_boundary) {
    302         if (lo_byte != hi_byte) {
    303             if (lo == 0x80) lo_byte = 0xC0;
    304             if (hi == 0x10FFFF) hi_byte = 0xFF;
    305         }
     336                if (!isUTF_16) {
     337                        if (lo_byte != hi_byte) {
     338                                if (lo == 0x80) lo_byte = 0xC0;
     339                                if (hi == 0x10FFFF) hi_byte = 0xFF;
     340                        }
     341                }
    306342        PabloAST * cc = mCharacterClassCompiler.compileCC(makeCC(lo_byte, hi_byte), builder);
    307343        target = builder.createAnd(cc, target);
     
    312348        target = ifTestCompiler(lo, hi, byte_no + 1, builder, target);
    313349    } else if (!at_hi_boundary) {
    314         const auto mid = UTF8_Encoder::minCodePointWithCommonBytes(hi, byte_no);
     350        const auto mid = minCodePointWithCommonBytes(hi, byte_no, isUTF_16);
    315351        PabloAST * e1 = ifTestCompiler(lo, mid - 1, byte_no, builder, target);
    316352        PabloAST * e2 = ifTestCompiler(mid, hi, byte_no, builder, target);
    317353        target = builder.createOr(e1, e2);
    318354    } else {
    319         const auto mid = UTF8_Encoder::maxCodePointWithCommonBytes(lo, byte_no);
     355        const auto mid = maxCodePointWithCommonBytes(lo, byte_no, isUTF_16);
    320356        PabloAST * e1 = ifTestCompiler(lo, mid, byte_no, builder, target);
    321357        PabloAST * e2 = ifTestCompiler(mid + 1, hi, byte_no, builder, target);
     
    335371    assert (byte_no >= 1 && byte_no <= 4);
    336372    assert (byte_no == 1 || prefix != nullptr);
     373        bool isUTF_16 = mCharacterClassCompiler.isUTF_16();
    337374    for (unsigned i = 1; i != byte_no; ++i) {
    338         const CC * const cc = makeCC(UTF8_Encoder::encodingByte(cp, i));
     375        const CC * const cc = makeCC(encodingByte(cp, i, isUTF_16));
    339376        PabloAST * var = mCharacterClassCompiler.compileCC(cc, builder);
    340377        if (i > 1) {
     
    353390 * Ensure the sequence of preceding bytes is defined, up to, but not including the given byte_no
    354391 ** ------------------------------------------------------------------------------------------------------------- */
    355 UCDCompiler::RangeList UCDCompiler::byteDefinitions(const RangeList & list, const unsigned byte_no) {
     392UCDCompiler::RangeList UCDCompiler::byteDefinitions(const RangeList & list, const unsigned byte_no, bool isUTF_16) {
    356393    RangeList result;
    357394    result.reserve(list.size());
    358395    for (const auto & i : list) {
    359         result.emplace_back(UTF8_Encoder::encodingByte(lo_codepoint(i), byte_no), UTF8_Encoder::encodingByte(hi_codepoint(i), byte_no));
     396        result.emplace_back(encodingByte(lo_codepoint(i), byte_no, isUTF_16), encodingByte(hi_codepoint(i), byte_no, isUTF_16));
    360397    }
    361398    return result;
  • icGREP/icgrep-devel/icgrep/UCD/ucd_compiler.hpp

    r4991 r5045  
    6969    PabloAST * makePrefix(const codepoint_t cp, const unsigned byte_no, PabloBuilder & builder, PabloAST * prefix);
    7070
    71     static RangeList byteDefinitions(const RangeList & list, const unsigned byte_no);
     71    static RangeList byteDefinitions(const RangeList & list, const unsigned byte_no, bool isUTF_16);
    7272
    7373    template <typename RangeListOrUnicodeSet>
Note: See TracChangeset for help on using the changeset viewer.