source: icGREP/icgrep-devel/icgrep/cc/multiplex_CCs.cpp @ 5795

Last change on this file since 5795 was 5795, checked in by cameron, 13 months ago

Adding Alphabet to CCs: initial check-in

File size: 4.7 KB
Line 
1/*
2 *  Copyright (c) 2017 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 */
5
6#include <map>
7#include <UCD/unicode_set.h>
8#include <re/re_cc.h>
9#include "boost/dynamic_bitset.hpp"
10#include <cc/multiplex_CCs.h>
11
12namespace cc {
13
14//
15// Breakpoints of a set of character classes (CCs): each codepoint c such that
16// there is some CC in CCs such that either (a) c is in the CC and c-1 is not, or
17// (b) c-1 is in the CC and c is not.  Boundary cases: if codepoint 0 is in
18// some CC, then 0 is a breakpoint (codepoint -1 is not in any CC).  If codepoint
19// 0x10FFFF is in some CC then 0x110000 is a breakpoint.
20//
21// The breakpoints may be determined by iterating through the interval
22// representation of each CC.   For each interval (lo, hi), lo and hi+1
23// are breakpoints.
24//
25// For each breakpoint, a bitset is computed identifying the source CCs whose
26// status changes at the breakpoint.
27//
28
29std::map<UCD::codepoint_t, boost::dynamic_bitset<>> computeBreakpoints(const std::vector<const re::CC *> & CCs) {
30    std::map<UCD::codepoint_t, boost::dynamic_bitset<>> breakpoints;
31    for (unsigned i = 0; i < CCs.size(); i++) {
32        for (const auto range : *CCs[i]) {
33            const auto lo = re::lo_codepoint(range);
34            const auto hi = re::hi_codepoint(range);
35            auto f = breakpoints.find(lo);
36            if (f == breakpoints.end()) {
37                breakpoints.emplace(lo, boost::dynamic_bitset<>(CCs.size()));
38            }
39            breakpoints[lo].set(i);
40            f = breakpoints.find(hi + 1);
41            if (f == breakpoints.end()) {
42                breakpoints.emplace(hi+1, boost::dynamic_bitset<>(CCs.size()));
43            }
44            breakpoints[hi+1].set(i);
45        }
46    }
47    return breakpoints;
48}
49
50
51void doMultiplexCCs(const std::vector<const re::CC *> & CCs,
52                    std::vector<std::vector<unsigned>> & exclusiveSetIDs,
53                    std::vector<re::CC *> & multiplexedCCs) {
54   
55    const auto breakpoints = computeBreakpoints(CCs);
56    // Initialize the exclusiveSetIDs to have one empty vector per source CC.
57    exclusiveSetIDs.clear();
58    exclusiveSetIDs.resize(CCs.size());
59    //
60    // Exclusive set determination.   
61    //
62    // Set up a map from the set of source CCs for each exclusive set to the exclusive set index.
63
64    std::map<boost::dynamic_bitset<>, unsigned> CC_set_to_exclusive_set_map;
65
66    // Entry 0 is for the characters not in any of the CCs.
67    CC_set_to_exclusive_set_map.emplace(boost::dynamic_bitset<>(CCs.size()), 0);
68
69    unsigned current_exclusive_set_idx = 0;
70    unsigned multiplexed_bit_count = 0;
71    boost::dynamic_bitset<> current_set(CCs.size());
72   
73    unsigned range_lo = 0;
74    unsigned next_set_index = 1;
75    for (auto & bkpt_entry : breakpoints) {
76        if (current_exclusive_set_idx > 0) {  // We have a range entry to close for a pending exclusive set.
77            unsigned range_hi = bkpt_entry.first - 1;
78            for (unsigned bit = 0; bit < multiplexed_bit_count; bit++) {
79                if (((current_exclusive_set_idx >> bit) & 1) == 1) {
80                    multiplexedCCs[bit]->insert_range(range_lo, range_hi);
81                }
82            }
83        }
84        // Start a new range.
85        range_lo = bkpt_entry.first;
86        if (range_lo > UCD::UNICODE_MAX) continue; // Nothing to do for bkpt 0x110000
87        current_set ^= bkpt_entry.second;
88        auto idx_iter = CC_set_to_exclusive_set_map.find(current_set);
89        if (idx_iter == CC_set_to_exclusive_set_map.end()) {
90            // New exclusive class; assign the next sequential integer.
91            //current_exclusive_set_idx = exclusiveSetIDs.size();
92            current_exclusive_set_idx = next_set_index;
93            next_set_index++;
94            CC_set_to_exclusive_set_map.emplace(current_set, current_exclusive_set_idx);
95           
96            for (unsigned CC1 = current_set.find_first(); CC1 < CCs.size(); CC1 = current_set.find_next(CC1)) {
97                exclusiveSetIDs[CC1].push_back(current_exclusive_set_idx);
98            }
99            if ((current_exclusive_set_idx & (current_exclusive_set_idx - 1)) == 0) {
100                multiplexed_bit_count++;
101                multiplexedCCs.push_back(re::makeCC());
102            }
103        }
104        else {
105            current_exclusive_set_idx = idx_iter->second;
106        }
107    }
108}
109
110
111
112MultiplexedAlphabet::MultiplexedAlphabet(std::string alphabetName, const std::vector<const re::CC *> CCs) 
113    : Alphabet(alphabetName) {
114        cc::doMultiplexCCs(CCs, mExclusiveSetIDs, mMultiplexedCCs);
115}
116
117std::vector<std::vector<unsigned>> MultiplexedAlphabet::getExclusiveSetIDs() { 
118    return mExclusiveSetIDs;
119}
120
121std::vector<re::CC *> MultiplexedAlphabet::getMultiplexedCCs() {
122    return mMultiplexedCCs;
123}
124}
125
Note: See TracBrowser for help on using the repository browser.