Ignore:
Timestamp:
Sep 27, 2015, 1:32:27 AM (4 years ago)
Author:
nmedfort
Message:

Progress on multi-target UCD compiler.

Location:
icGREP/icgrep-devel/icgrep/UCD
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/UCD/ucd_compiler.cpp

    r4736 r4797  
    1111
    1212/** ------------------------------------------------------------------------------------------------------------- *
    13  * @brief generateWithDefaultIfHierarchy
    14  * @param set the unicode set to generate
    15  * @param the entry block to the function we're filling
    16  * @return the output stream with a 1-bit in any position of a character in the unicode set
    17  ** ------------------------------------------------------------------------------------------------------------- */
    18 PabloAST * UCDCompiler::generateWithIfHierarchy(const RangeList & ifRanges, const UnicodeSet & set, PabloBuilder & entry) {
     13 * @brief generateRange
     14 ** ------------------------------------------------------------------------------------------------------------- */
     15void UCDCompiler::generateRange(const RangeList & ifRanges, PabloBuilder & entry) {
    1916    // Pregenerate the suffix var outside of the if ranges. The DCE pass will either eliminate it if it's not used or the
    2017    // code sinking pass will move appropriately into an inner if block.
    21     mSuffixVar = mCharacterClassCompiler.compileCC(makeCC(0x80, 0xBF), entry);
    22     return generateWithIfHierarchy(ifRanges, set, 0, CC::UNICODE_MAX, entry);
    23 }
    24 
    25 /** ------------------------------------------------------------------------------------------------------------- *
    26  * @brief generateCharClassDefsInIfHierarchy
     18    CC *  suffix = makeCC(0x80, 0xBF);
     19    assert (!suffix->empty());
     20    mSuffixVar = mCharacterClassCompiler.compileCC(suffix, entry);
     21    generateRange(ifRanges, 0, CC::UNICODE_MAX, entry);
     22}
     23
     24/** ------------------------------------------------------------------------------------------------------------- *
     25 * @brief generateRange
    2726 * @param ifRangeList
    2827 ** ------------------------------------------------------------------------------------------------------------- */
    29 PabloAST * UCDCompiler::generateWithIfHierarchy(const RangeList & ifRanges, const UnicodeSet & set, const codepoint_t lo, const codepoint_t hi, PabloBuilder & builder) {
    30 
    31     PabloAST * target = builder.createZeroes();
     28void UCDCompiler::generateRange(const RangeList & ifRanges, const codepoint_t lo, const codepoint_t hi, PabloBuilder & builder) {
     29
    3230    // Codepoints in unenclosed ranges will be computed unconditionally.
    3331    // Generate them first so that computed subexpressions may be shared
    3432    // with calculations within the if hierarchy.
    35 
    3633    const auto enclosed = rangeIntersect(ifRanges, lo, hi);
    37 
    3834    for (const auto rg : rangeGaps(enclosed, lo, hi)) {
    39         target = generateSubRanges(set, lo_codepoint(rg), hi_codepoint(rg), builder, target);
     35        generateSubRanges(lo_codepoint(rg), hi_codepoint(rg), builder);
    4036    }
    4137
     
    4339    const auto inner = innerRanges(enclosed);
    4440    for (const auto range : outer) {
    45         codepoint_t lo, hi;
    46         std::tie(lo, hi) = range;
    47         if (set.intersects(lo, hi)) {
     41
     42        TargetVector intersectingTargets;
     43        TargetVector nonIntersectingTargets;
     44
     45        // Split our current target list into two sets: the intersecting and non-intersecting ones. Any non-
     46        // intersecting set will be removed from the current map to eliminate the possibility of it being
     47        // considered until after we leave the current range. The intersecting sets are also stored to ensure
     48        // that we know what the original target value was going into this range block so tha we can OR the
     49        // inner value with the outer value.
     50
     51        for (auto ti = mTargetMap.begin(); ti != mTargetMap.end(); ) {           
     52            if (ti->first->intersects(range.first, range.second)) {
     53                intersectingTargets.emplace_back(ti->first, ti->second);
     54                ++ti;
     55            } else {
     56                nonIntersectingTargets.emplace_back(ti->first, ti->second);
     57                ti = mTargetMap.erase(ti);
     58            }
     59        }
     60        if (mTargetMap.size() > 0) {
     61
    4862            PabloBuilder inner_block = PabloBuilder::Create(builder);
    49             PabloAST * inner_target = generateWithIfHierarchy(inner, set, lo, hi, inner_block);
     63
     64            generateRange(inner, range.first, range.second, inner_block);
     65
     66            std::vector<Assign *> targets;
     67            for (auto ti = intersectingTargets.begin(); ti != intersectingTargets.end(); ) {
     68                auto f = mTargetMap.find(ti->first);
     69                assert (f != mTargetMap.end());
     70                if (LLVM_UNLIKELY(isa<Zeroes>(f->second))) {
     71                    ti = intersectingTargets.erase(ti);
     72                    continue;
     73                }
     74                Assign * escapedValue = inner_block.createAssign("m", f->second);
     75                targets.push_back(escapedValue);
     76                f->second = escapedValue;
     77                ++ti;
     78            }
     79
    5080            // If this range is empty, just skip creating the if block
    51             if (LLVM_UNLIKELY(isa<Zeroes>(inner_target))) {
    52                 continue;
    53             }
    54             Assign * matches = inner_block.createAssign("m", inner_target);
    55             builder.createIf(ifTestCompiler(lo, hi, builder), {matches}, inner_block);
    56             target = builder.createOr(target, matches);
    57         }
    58     }
    59 
    60     return target;
    61 }
    62 
    63 /** ------------------------------------------------------------------------------------------------------------- *
    64  * @brief generateCharClassSubDefs
    65  * @param ifRangeList
    66  ** ------------------------------------------------------------------------------------------------------------- */
    67 PabloAST * UCDCompiler::generateSubRanges(const UnicodeSet & set, const codepoint_t lo, const codepoint_t hi, PabloBuilder & builder, PabloAST * target) {
    68     const auto range = rangeIntersect(set, lo, hi);
    69     // Divide by UTF-8 length, separating out E0, ED, F0 and F4 ranges
    70     const std::array<interval_t, 9> ranges =
    71         {{{0, 0x7F}, {0x80, 0x7FF}, {0x800, 0xFFF}, {0x1000, 0xD7FF}, {0xD800, 0xDFFF},
    72          {0xE000, 0xFFFF}, {0x10000, 0x3FFFF}, {0x40000, 0xFFFFF}, {0x100000, 0x10FFFF}}};
    73     for (auto r : ranges) {
    74         const auto subrange = rangeIntersect(range, lo_codepoint(r), hi_codepoint(r));
    75         target = sequenceGenerator(std::move(subrange), 1, builder, target, nullptr);
    76     }
    77     return target;
     81            if (targets.size() > 0) {
     82                builder.createIf(ifTestCompiler(range.first, range.second, builder), std::move(targets), inner_block);
     83                for (const auto ti : intersectingTargets) {
     84                    auto f = mTargetMap.find(ti.first);
     85                    assert (f != mTargetMap.end());
     86                    assert (isa<Assign>(f->second));
     87                    f->second = builder.createOr(ti.second, f->second);
     88                }
     89            }
     90        }
     91        for (Target t : nonIntersectingTargets) {
     92            mTargetMap.insert(t);
     93        }
     94    }
     95}
     96
     97/** ------------------------------------------------------------------------------------------------------------- *
     98 * @brief generateSubRanges
     99 ** ------------------------------------------------------------------------------------------------------------- */
     100void UCDCompiler::generateSubRanges(const codepoint_t lo, const codepoint_t hi, PabloBuilder & builder) {
     101    for (Target & t : mTargetMap) {
     102        const auto range = rangeIntersect(*t.first, lo, hi);
     103        PabloAST * target = t.second;
     104        // Divide by UTF-8 length, separating out E0, ED, F0 and F4 ranges
     105        const std::array<interval_t, 9> ranges =
     106            {{{0, 0x7F}, {0x80, 0x7FF}, {0x800, 0xFFF}, {0x1000, 0xD7FF}, {0xD800, 0xDFFF},
     107             {0xE000, 0xFFFF}, {0x10000, 0x3FFFF}, {0x40000, 0xFFFFF}, {0x100000, 0x10FFFF}}};
     108        for (auto r : ranges) {
     109            const auto subrange = rangeIntersect(range, lo_codepoint(r), hi_codepoint(r));
     110            target = sequenceGenerator(std::move(subrange), 1, builder, target, nullptr);
     111        }
     112        t.second = target;
     113    }
    78114}
    79115
     
    88124PabloAST * UCDCompiler::sequenceGenerator(const RangeList && ranges, const unsigned byte_no, PabloBuilder & builder, PabloAST * target, PabloAST * prefix) {
    89125
    90     if (LLVM_LIKELY(!ranges.empty())) {
     126    if (LLVM_LIKELY(ranges.size() > 0)) {
    91127
    92128        codepoint_t lo, hi;
     
    100136            target = sequenceGenerator(std::move(rangeIntersect(ranges, lo, mid)), byte_no, builder, target, prefix);
    101137            target = sequenceGenerator(std::move(rangeIntersect(ranges, mid + 1, hi)), byte_no, builder, target, prefix);
    102         }
    103         else if (min == byte_no) {
     138        } else if (min == byte_no) {
    104139            // We have a single byte remaining to match for all code points in this CC.
    105140            // Use the byte class compiler to generate matches for these codepoints.
     
    110145            }
    111146            target = builder.createOr(target, var);
    112         }
    113         else {
     147        } else {
    114148            for (auto rg : ranges) {
    115149                codepoint_t lo, hi;
     
    122156                        target = sequenceGenerator(lo, mid, byte_no, builder, target, prefix);
    123157                        target = sequenceGenerator(mid + 1, hi, byte_no, builder, target, prefix);
    124                     }
    125                     else if (!UTF8_Encoder::isHighCodePointAfterByte(hi, byte_no)) {
     158                    } else if (!UTF8_Encoder::isHighCodePointAfterByte(hi, byte_no)) {
    126159                        const codepoint_t mid = hi & ~((1 << (6 * (min - byte_no))) - 1);
    127160                        target = sequenceGenerator(lo, mid - 1, byte_no, builder, target, prefix);
    128161                        target = sequenceGenerator(mid, hi, byte_no, builder, target, prefix);
    129                     }
    130                     else { // we have a prefix group of type (a)
     162                    } else { // we have a prefix group of type (a)
    131163                        PabloAST * var = mCharacterClassCompiler.compileCC(makeCC(lo_byte, hi_byte), builder);
    132164                        if (byte_no > 1) {
     
    185217        PabloAST * cc = mCharacterClassCompiler.compileCC(makeCC(lo_byte, hi_byte), builder);
    186218        target = builder.createAnd(cc, target);
    187     }
    188     else if (lo_byte == hi_byte) {
     219    } else if (lo_byte == hi_byte) {
    189220        PabloAST * cc = mCharacterClassCompiler.compileCC(makeCC(lo_byte, hi_byte), builder);
    190221        target = builder.createAnd(cc, target);
    191222        target = builder.createAdvance(target, 1);
    192223        target = ifTestCompiler(lo, hi, byte_no + 1, builder, target);
    193     }
    194     else if (!at_hi_boundary) {
     224    } else if (!at_hi_boundary) {
    195225        const auto mid = UTF8_Encoder::minCodePointWithCommonBytes(hi, byte_no);
    196226        PabloAST * e1 = ifTestCompiler(lo, mid - 1, byte_no, builder, target);
    197227        PabloAST * e2 = ifTestCompiler(mid, hi, byte_no, builder, target);
    198228        target = builder.createOr(e1, e2);
    199     }
    200     else {
     229    } else {
    201230        const auto mid = UTF8_Encoder::maxCodePointWithCommonBytes(lo, byte_no);
    202231        PabloAST * e1 = ifTestCompiler(lo, mid, byte_no, builder, target);
     
    272301        if (LLVM_UNLIKELY(list.empty())) {
    273302            gaps.emplace_back(lo, hi);
    274         }
    275         else {
     303        } else {
    276304            codepoint_t cp = lo;
    277305            for (const auto & i : list) {
    278306                if (hi_codepoint(i) < cp) {
    279307                    continue;
    280                 }
    281                 else if (lo_codepoint(i) > cp) {
     308                } else if (lo_codepoint(i) > cp) {
    282309                    gaps.emplace_back(cp, lo_codepoint(i) - 1);
    283                 }
    284                 else if (hi_codepoint(i) >= hi) {
     310                } else if (hi_codepoint(i) >= hi) {
    285311                    continue;
    286312                }
     
    315341/** ------------------------------------------------------------------------------------------------------------- *
    316342 * @brief innerRanges
    317  * @param list
    318343 ** ------------------------------------------------------------------------------------------------------------- */
    319344UCDCompiler::RangeList UCDCompiler::innerRanges(const RangeList & list) {
     
    323348            if (hi_codepoint(*j) <= hi_codepoint(*i)) {
    324349                ranges.emplace_back(lo_codepoint(*j), hi_codepoint(*j));
    325             }
    326             else {
     350            } else {
    327351                i = j;
    328352            }
     
    330354    }
    331355    return ranges;
     356}
     357
     358/** ------------------------------------------------------------------------------------------------------------- *
     359 * @brief addTarget
     360 ** ------------------------------------------------------------------------------------------------------------- */
     361inline void UCDCompiler::addTarget(const UnicodeSet & set) {
     362#ifdef USE_BOOST
     363    mTargetMap.emplace(&set, PabloBlock::createZeroes());
     364#else
     365    mTargetMap.insert(std::make_pair(&set, PabloBlock::createZeroes()));
     366#endif
    332367}
    333368
     
    427462        {0x10000, 0x10FFFF}};
    428463
    429     return generateWithIfHierarchy(defaultIfHierachy, set, entry);
     464    addTarget(set);
     465    generateRange(defaultIfHierachy, entry);
     466    return mTargetMap[&set];
    430467}
    431468
     
    437474 ** ------------------------------------------------------------------------------------------------------------- */
    438475PabloAST * UCDCompiler::generateWithoutIfHierarchy(const UnicodeSet & set, PabloBuilder & entry) {
    439 
    440476    const RangeList defaultIfHierachy = {{0x10000, 0x10FFFF}};
    441 
    442     return generateWithIfHierarchy(defaultIfHierachy, set, entry);
     477    addTarget(set);
     478    generateRange(defaultIfHierachy, entry);
     479    return mTargetMap[&set];
    443480}
    444481
  • icGREP/icgrep-devel/icgrep/UCD/ucd_compiler.hpp

    r4736 r4797  
    22#define UCDCOMPILER_HPP
    33
     4#include <re/re_cc.h>
    45#include <vector>
    5 #include <re/re_cc.h>
     6#ifdef USE_BOOST
     7#include <boost/container/flat_map.hpp>
     8#else
     9#include <unordered_map>
     10#endif
    611
    712namespace cc {
     
    2530    using codepoint_t = re::codepoint_t;
    2631    using RangeList = std::vector<re::interval_t>;
     32    #ifdef USE_BOOST
     33    using TargetMap = boost::container::flat_map<const UnicodeSet *, PabloAST *>;
     34    #else
     35    using TargetMap = std::unordered_map<const UnicodeSet *, PabloAST *>;
     36    #endif
     37    using Target = TargetMap::value_type;
     38    using TargetVector = std::vector<Target>;
    2739
    2840public:
     
    3345    PabloAST * generateWithoutIfHierarchy(const UnicodeSet & set, PabloBuilder & entry);
    3446
    35     PabloAST * generateWithIfHierarchy(const RangeList & ifRanges, const UnicodeSet & set, PabloBuilder & entry);
    36 
    3747protected:
    3848
    39     PabloAST * generateWithIfHierarchy(const RangeList & ifRanges, const UnicodeSet & set, const codepoint_t lo, const codepoint_t hi, PabloBuilder & builder);
     49    void generateRange(const RangeList & ifRanges, PabloBuilder & entry);
    4050
    41     PabloAST * generateSubRanges(const UnicodeSet & set, const codepoint_t lo, const codepoint_t hi, PabloBuilder & builder, PabloAST * target);
     51    void generateRange(const RangeList & ifRanges, const codepoint_t lo, const codepoint_t hi, PabloBuilder & builder);
     52
     53    void generateSubRanges(const codepoint_t lo, const codepoint_t hi, PabloBuilder & builder);
    4254
    4355    PabloAST * sequenceGenerator(const RangeList && ranges, const unsigned byte_no, PabloBuilder & builder, PabloAST * target, PabloAST * prefix);
     
    5062
    5163    PabloAST * makePrefix(const codepoint_t cp, const unsigned byte_no, PabloBuilder & builder, PabloAST * prefix);
     64
     65    void addTarget(const UnicodeSet & set);
    5266
    5367    static RangeList byteDefinitions(const RangeList & list, const unsigned byte_no);
     
    6579    cc::CC_Compiler &       mCharacterClassCompiler;
    6680    PabloAST *              mSuffixVar;
     81    TargetMap               mTargetMap;
    6782};
    6883
Note: See TracChangeset for help on using the changeset viewer.