source: icGREP/icgrep-devel/icgrep/cc/multiplex_CCs.cpp @ 5727

Last change on this file since 5727 was 5727, checked in by cameron, 19 months ago

Small fixes, constructing/testing full UnicodeSets?.

File size: 4.2 KB
Line 
1/*
2 *  Copyright (c) 2017 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 */
5
6#include <map>
7#include <UCD/unicode_set.h>
8#include <re/re_cc.h>
9#include "boost/dynamic_bitset.hpp"
10
11//
12// Breakpoints of a set of character classes (CCs): each codepoint c such that
13// there is some CC in CCs such that either (a) c is in the CC and c-1 is not, or
14// (b) c-1 is in the CC and c is not.
15//
16// The breakpoints may be determined by iterating through the interval
17// representation of each CC.   For each interval (lo, hi), lo and hi+1
18// are breakpoints.
19//
20// For each breakpoint, a bitset is computed identifying the source CCs whose
21// status changes at the breakpoint.
22//
23
24std::map<UCD::codepoint_t, boost::dynamic_bitset<>> computeBreakpoints(std::vector<UCD::UnicodeSet> CCs) {
25    std::map<UCD::codepoint_t, boost::dynamic_bitset<>> breakpoints;
26    for (unsigned i = 0; i < CCs.size(); i++) {
27        for (const UCD::UnicodeSet::interval_t & range : CCs[i]) {
28            auto lo = re::lo_codepoint(range);
29            auto hi = re::hi_codepoint(range);
30            auto f = breakpoints.find(lo);
31            if (f == breakpoints.end()) {
32                breakpoints.emplace(lo, boost::dynamic_bitset<>(CCs.size()));
33            }
34            breakpoints[lo].set(i);
35            f = breakpoints.find(hi + 1);
36            if (f == breakpoints.end()) {
37                breakpoints.emplace(hi+1, boost::dynamic_bitset<>(CCs.size()));
38            }
39            breakpoints[hi+1].set(i);
40        }
41    }
42    return breakpoints;
43}
44
45
46void doMultiplexCCs(const std::vector<UCD::UnicodeSet> & CCs,
47                    std::vector<std::vector<unsigned>> & exclusiveSetIDs,
48                    std::vector<UCD::UnicodeSet> & multiplexedCCs) {
49   
50    std::map<UCD::codepoint_t, boost::dynamic_bitset<>> breakpoints = computeBreakpoints(CCs);
51    // Initialize the exclusiveSetIDs to have one empty vector per source CC.
52    exclusiveSetIDs.clear();
53    exclusiveSetIDs.resize(CCs.size());
54    //
55    // Exclusive set determination.   
56    //
57    // Set up a map from the set of source CCs for each exclusive set to the exclusive set index.
58
59    std::map<boost::dynamic_bitset<>, unsigned> CC_set_to_exclusive_set_map;
60
61    // Entry 0 is for the characters not in any of the CCs.
62    CC_set_to_exclusive_set_map.emplace(boost::dynamic_bitset<>(CCs.size()), 0);
63
64    unsigned current_exclusive_set_idx = 0;
65    unsigned multiplexed_bit_count = 0;
66    boost::dynamic_bitset<> current_set(CCs.size());
67   
68    unsigned range_lo = 0;
69    unsigned cur_index = 1;
70    for (auto & bkpt_entry : breakpoints) {
71        if (current_exclusive_set_idx > 0) {  // We have a range entry to close for a pending exclusive set.
72            unsigned range_hi = bkpt_entry.first - 1;
73            for (unsigned bit = 0; bit < multiplexed_bit_count; bit++) {
74                if (((current_exclusive_set_idx >> bit) & 1) == 1) {
75                    multiplexedCCs[bit].insert_range(range_lo, range_hi);
76                }
77            }
78        }
79        // Start a new range.
80        range_lo = bkpt_entry.first;
81        if (range_lo > UCD::UNICODE_MAX) continue; 
82        current_set ^= bkpt_entry.second;
83        auto idx_iter = CC_set_to_exclusive_set_map.find(current_set);
84        if (idx_iter == CC_set_to_exclusive_set_map.end()) {
85            // New exclusive class; assign the next sequential integer.
86            //current_exclusive_set_idx = exclusiveSetIDs.size();
87            current_exclusive_set_idx = cur_index;
88            cur_index++;
89            CC_set_to_exclusive_set_map.emplace(current_set, current_exclusive_set_idx);
90           
91            for (unsigned CC1 = current_set.find_first(); CC1 < CCs.size(); CC1 = current_set.find_next(CC1)) {
92                exclusiveSetIDs[CC1].push_back(current_exclusive_set_idx);
93            }
94            if ((current_exclusive_set_idx & (current_exclusive_set_idx - 1)) == 0) {
95                multiplexed_bit_count++;
96                multiplexedCCs.push_back(UCD::UnicodeSet());
97            }
98        }
99        else {
100            current_exclusive_set_idx = idx_iter->second;
101        }
102    }
103    assert (current_exclusive_set_idx == 0 && "Breakpoint for final interval not found.");
104}
Note: See TracBrowser for help on using the repository browser.