source: icGREP/icgrep-devel/icgrep/cc/multiplex_CCs.cpp @ 5369

Last change on this file since 5369 was 5369, checked in by cameron, 2 years ago

CC multiplexing

File size: 4.0 KB
Line 
1/*
2 *  Copyright (c) 2017 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 */
5
6#include <map>
7#include <UCD/unicode_set.h>
8#include <re/re_cc.h>
9#include "boost/dynamic_bitset.hpp"
10
11//
12// Breakpoints of a set of character classes (CCs): each codepoint c such that
13// there is some CC in CCs such that either (a) c is in the CC and c-1 is not, or
14// (b) c-1 is in the CC and c is not.
15//
16// The breakpoints may be determined by iterating through the interval
17// representation of each CC.   For each interval (lo, hi), lo and hi+1
18// are breakpoints.
19//
20// For each breakpoint, a bitset is computed identifying the source CCs whose
21// status changes at the breakpoint.
22//
23
24std::map<UCD::codepoint_t, boost::dynamic_bitset<>> computeBreakpoints(std::vector<UCD::UnicodeSet> CCs) {
25    std::map<UCD::codepoint_t, boost::dynamic_bitset<>> breakpoints;
26    for (unsigned i = 0; i < CCs.size(); i++) {
27        for (const UCD::UnicodeSet::interval_t & range : CCs[i]) {
28            auto lo = re::lo_codepoint(range);
29            auto hi = re::hi_codepoint(range);
30            auto f = breakpoints.find(lo);
31            if (f == breakpoints.end()) {
32                breakpoints.emplace(lo, boost::dynamic_bitset<>(CCs.size()));
33            }
34            breakpoints[lo].set(i);
35            f = breakpoints.find(hi + 1);
36            if (f == breakpoints.end()) {
37                breakpoints.emplace(hi+1, boost::dynamic_bitset<>(CCs.size()));
38            }
39            breakpoints[hi+1].set(i);
40        }
41    }
42    return breakpoints;
43}
44
45
46void doMultiplexCCs(std::vector<UCD::UnicodeSet> CCs,
47                    std::vector<std::vector<unsigned>> & exclusiveSetIDs,
48                    std::vector<UCD::UnicodeSet> & multiplexedCCs) {
49   
50    std::map<UCD::codepoint_t, boost::dynamic_bitset<>> breakpoints = computeBreakpoints(CCs);
51    // Initialize the exclusiveSetIDs to have one empty vector per source CC.
52    exclusiveSetIDs.clear();
53    exclusiveSetIDs.resize(CCs.size());
54    //
55    // Exclusive set determination.   
56    //
57    // Set up a map from the set of source CCs for each exclusive set to the exclusive set index.
58
59    std::map<boost::dynamic_bitset<>, unsigned> CC_set_to_exclusive_set_map;
60
61    // Entry 0 is for the characters not in any of the CCs.
62    CC_set_to_exclusive_set_map.emplace(boost::dynamic_bitset<>(CCs.size()), 0);
63
64    unsigned current_exclusive_set_idx = 0;
65    unsigned multiplexed_bit_count = 0;
66    boost::dynamic_bitset<> current_set;
67   
68    unsigned range_lo = 0;
69    for (auto & bkpt_entry : breakpoints) {
70        if (current_exclusive_set_idx > 0) {  // We have a range entry to close for a pending exclusive set.
71            unsigned range_hi = bkpt_entry.first - 1;
72            for (unsigned bit = 0; bit < multiplexed_bit_count; bit++) {
73                if (((current_exclusive_set_idx >> bit) & 1) == 1) {
74                    multiplexedCCs[bit].insert_range(range_lo, range_hi);
75                }
76            }
77        }
78        // Start a new range.
79        range_lo = bkpt_entry.first;
80        current_set ^= bkpt_entry.second;
81        auto idx_iter = CC_set_to_exclusive_set_map.find(current_set);
82        if (idx_iter == CC_set_to_exclusive_set_map.end()) {
83            // New exclusive class; assign the next sequential integer.
84            current_exclusive_set_idx = exclusiveSetIDs.size();
85            CC_set_to_exclusive_set_map.emplace(current_set, current_exclusive_set_idx);
86           
87            for (unsigned CC1 = current_set.find_first(); CC1 < CCs.size(); CC1 = current_set.find_next(CC1)) {
88                exclusiveSetIDs[CC1].push_back(current_exclusive_set_idx);
89            }
90            if ((current_exclusive_set_idx & (current_exclusive_set_idx - 1)) == 0) {
91                multiplexed_bit_count++;
92                multiplexedCCs.push_back(UCD::UnicodeSet());
93            }
94        }
95        else {
96            current_exclusive_set_idx = idx_iter->second;
97        }
98    }
99    assert (current_exclusive_set_idx == 0 && "Breakpoint for final interval not found.");
100}
Note: See TracBrowser for help on using the repository browser.