source: icGREP/icgrep-devel/icgrep/cc/multiplex_CCs.cpp @ 5799

Last change on this file since 5799 was 5799, checked in by cameron, 16 months ago

sourceAlphabet and transformCC

File size: 5.7 KB
RevLine 
[5369]1/*
2 *  Copyright (c) 2017 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 */
5
6#include <map>
7#include <UCD/unicode_set.h>
8#include <re/re_cc.h>
9#include "boost/dynamic_bitset.hpp"
[5795]10#include <cc/multiplex_CCs.h>
[5799]11#include <llvm/Support/Casting.h>
12#include <llvm/Support/ErrorHandling.h>
[5369]13
[5795]14namespace cc {
15
[5369]16//
17// Breakpoints of a set of character classes (CCs): each codepoint c such that
18// there is some CC in CCs such that either (a) c is in the CC and c-1 is not, or
[5741]19// (b) c-1 is in the CC and c is not.  Boundary cases: if codepoint 0 is in
20// some CC, then 0 is a breakpoint (codepoint -1 is not in any CC).  If codepoint
21// 0x10FFFF is in some CC then 0x110000 is a breakpoint.
[5369]22//
23// The breakpoints may be determined by iterating through the interval
24// representation of each CC.   For each interval (lo, hi), lo and hi+1
[5727]25// are breakpoints.
[5369]26//
27// For each breakpoint, a bitset is computed identifying the source CCs whose
28// status changes at the breakpoint.
29//
30
[5748]31std::map<UCD::codepoint_t, boost::dynamic_bitset<>> computeBreakpoints(const std::vector<const re::CC *> & CCs) {
[5369]32    std::map<UCD::codepoint_t, boost::dynamic_bitset<>> breakpoints;
33    for (unsigned i = 0; i < CCs.size(); i++) {
[5748]34        for (const auto range : *CCs[i]) {
35            const auto lo = re::lo_codepoint(range);
36            const auto hi = re::hi_codepoint(range);
[5369]37            auto f = breakpoints.find(lo);
38            if (f == breakpoints.end()) {
39                breakpoints.emplace(lo, boost::dynamic_bitset<>(CCs.size()));
40            }
41            breakpoints[lo].set(i);
42            f = breakpoints.find(hi + 1);
43            if (f == breakpoints.end()) {
44                breakpoints.emplace(hi+1, boost::dynamic_bitset<>(CCs.size()));
45            }
46            breakpoints[hi+1].set(i);
47        }
48    }
49    return breakpoints;
50}
51
52
[5748]53void doMultiplexCCs(const std::vector<const re::CC *> & CCs,
[5369]54                    std::vector<std::vector<unsigned>> & exclusiveSetIDs,
[5748]55                    std::vector<re::CC *> & multiplexedCCs) {
[5369]56   
[5748]57    const auto breakpoints = computeBreakpoints(CCs);
[5369]58    // Initialize the exclusiveSetIDs to have one empty vector per source CC.
59    exclusiveSetIDs.clear();
60    exclusiveSetIDs.resize(CCs.size());
61    //
62    // Exclusive set determination.   
63    //
64    // Set up a map from the set of source CCs for each exclusive set to the exclusive set index.
65
66    std::map<boost::dynamic_bitset<>, unsigned> CC_set_to_exclusive_set_map;
67
68    // Entry 0 is for the characters not in any of the CCs.
69    CC_set_to_exclusive_set_map.emplace(boost::dynamic_bitset<>(CCs.size()), 0);
70
71    unsigned current_exclusive_set_idx = 0;
72    unsigned multiplexed_bit_count = 0;
[5490]73    boost::dynamic_bitset<> current_set(CCs.size());
[5369]74   
75    unsigned range_lo = 0;
[5741]76    unsigned next_set_index = 1;
[5369]77    for (auto & bkpt_entry : breakpoints) {
78        if (current_exclusive_set_idx > 0) {  // We have a range entry to close for a pending exclusive set.
79            unsigned range_hi = bkpt_entry.first - 1;
80            for (unsigned bit = 0; bit < multiplexed_bit_count; bit++) {
81                if (((current_exclusive_set_idx >> bit) & 1) == 1) {
[5748]82                    multiplexedCCs[bit]->insert_range(range_lo, range_hi);
[5369]83                }
84            }
85        }
86        // Start a new range.
87        range_lo = bkpt_entry.first;
[5741]88        if (range_lo > UCD::UNICODE_MAX) continue; // Nothing to do for bkpt 0x110000
[5369]89        current_set ^= bkpt_entry.second;
90        auto idx_iter = CC_set_to_exclusive_set_map.find(current_set);
91        if (idx_iter == CC_set_to_exclusive_set_map.end()) {
92            // New exclusive class; assign the next sequential integer.
[5490]93            //current_exclusive_set_idx = exclusiveSetIDs.size();
[5741]94            current_exclusive_set_idx = next_set_index;
95            next_set_index++;
[5369]96            CC_set_to_exclusive_set_map.emplace(current_set, current_exclusive_set_idx);
97           
98            for (unsigned CC1 = current_set.find_first(); CC1 < CCs.size(); CC1 = current_set.find_next(CC1)) {
99                exclusiveSetIDs[CC1].push_back(current_exclusive_set_idx);
100            }
101            if ((current_exclusive_set_idx & (current_exclusive_set_idx - 1)) == 0) {
102                multiplexed_bit_count++;
[5748]103                multiplexedCCs.push_back(re::makeCC());
[5369]104            }
105        }
106        else {
107            current_exclusive_set_idx = idx_iter->second;
108        }
109    }
110}
[5795]111
112
113
114MultiplexedAlphabet::MultiplexedAlphabet(std::string alphabetName, const std::vector<const re::CC *> CCs) 
[5799]115    : Alphabet(alphabetName), mUnicodeSets(CCs) {
116        if (CCs.size() > 0) {
117            mSourceAlphabet = CCs[0]->getAlphabet();
118            for (unsigned i = 1; i < CCs.size(); i++) {
119                if (CCs[i]->getAlphabet() != mSourceAlphabet) llvm::report_fatal_error("Mismatched source alphabets for Multiplexed Alphabet");
120            }
121        }
[5795]122        cc::doMultiplexCCs(CCs, mExclusiveSetIDs, mMultiplexedCCs);
123}
124
[5799]125const Alphabet * MultiplexedAlphabet::getSourceAlphabet() {
126    return mSourceAlphabet;
127}
128
[5795]129std::vector<std::vector<unsigned>> MultiplexedAlphabet::getExclusiveSetIDs() { 
130    return mExclusiveSetIDs;
131}
132
133std::vector<re::CC *> MultiplexedAlphabet::getMultiplexedCCs() {
134    return mMultiplexedCCs;
135}
[5799]136   
137re::CC * MultiplexedAlphabet::transformCC(re::CC * sourceCC) {
138    if (sourceCC->getAlphabet() != mSourceAlphabet) llvm::report_fatal_error("Mismatched source alphabets for transformCC");
139   
140    const auto index = find(mUnicodeSets.begin(), mUnicodeSets.end(), sourceCC) - mUnicodeSets.begin();
141    const auto exclusive_IDs = mExclusiveSetIDs[index];
142    re::CC * CC_union = re::makeCC(this);
143    for (auto i : exclusive_IDs) {
144        CC_union = re::makeCC(CC_union, re::makeCC(i, this));
145    }
146    return CC_union;
[5795]147}
[5799]148}
[5795]149
Note: See TracBrowser for help on using the repository browser.