source: icGREP/icgrep-devel/icgrep/cc/multiplex_CCs.cpp @ 6133

Last change on this file since 6133 was 6133, checked in by xwa163, 12 months ago
  1. Add sourceCC in multiplexed CC
  2. Remove workaround FakeBasisBits? from ICGrep
  3. Implement Swizzled version of LZParabix
  4. Init checkin for SwizzleByGather? Kernel
File size: 6.2 KB
Line 
1/*
2 *  Copyright (c) 2017 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 */
5
6#include <map>
7#include <UCD/unicode_set.h>
8#include <re/re_cc.h>
9#include "boost/dynamic_bitset.hpp"
10#include <cc/multiplex_CCs.h>
11#include <re/printer_re.h>
12#include <llvm/Support/Casting.h>
13#include <llvm/Support/ErrorHandling.h>
14#include <llvm/Support/raw_ostream.h>
15
16namespace cc {
17
18//
19// Breakpoints of a set of character classes (CCs): each codepoint c such that
20// there is some CC in CCs such that either (a) c is in the CC and c-1 is not, or
21// (b) c-1 is in the CC and c is not.  Boundary cases: if codepoint 0 is in
22// some CC, then 0 is a breakpoint (codepoint -1 is not in any CC).  If codepoint
23// 0x10FFFF is in some CC then 0x110000 is a breakpoint.
24//
25// The breakpoints may be determined by iterating through the interval
26// representation of each CC.   For each interval (lo, hi), lo and hi+1
27// are breakpoints.
28//
29// For each breakpoint, a bitset is computed identifying the source CCs whose
30// status changes at the breakpoint.
31//
32
33std::map<UCD::codepoint_t, boost::dynamic_bitset<>> computeBreakpoints(const std::vector<re::CC *> & CCs) {
34    std::map<UCD::codepoint_t, boost::dynamic_bitset<>> breakpoints;
35    for (unsigned i = 0; i < CCs.size(); i++) {
36        for (const auto range : *CCs[i]) {
37            const auto lo = re::lo_codepoint(range);
38            const auto hi = re::hi_codepoint(range);
39            auto f = breakpoints.find(lo);
40            if (f == breakpoints.end()) {
41                breakpoints.emplace(lo, boost::dynamic_bitset<>(CCs.size()));
42            }
43            breakpoints[lo].set(i);
44            f = breakpoints.find(hi + 1);
45            if (f == breakpoints.end()) {
46                breakpoints.emplace(hi+1, boost::dynamic_bitset<>(CCs.size()));
47            }
48            breakpoints[hi+1].set(i);
49        }
50    }
51    return breakpoints;
52}
53
54void doMultiplexCCs(const std::vector<re::CC *> & CCs,
55                    std::vector<std::vector<unsigned>> & exclusiveSetIDs,
56                    std::vector<re::CC *> & multiplexedCCs) {
57   
58    const auto breakpoints = computeBreakpoints(CCs);
59    // Initialize the exclusiveSetIDs to have one empty vector per source CC.
60    exclusiveSetIDs.clear();
61    exclusiveSetIDs.resize(CCs.size());
62    //
63    // Exclusive set determination.   
64    //
65    // Set up a map from the set of source CCs for each exclusive set to the exclusive set index.
66
67    std::map<boost::dynamic_bitset<>, unsigned> CC_set_to_exclusive_set_map;
68
69    // Entry 0 is for the characters not in any of the CCs.
70    CC_set_to_exclusive_set_map.emplace(boost::dynamic_bitset<>(CCs.size()), 0);
71
72    unsigned current_exclusive_set_idx = 0;
73    unsigned multiplexed_bit_count = 0;
74    boost::dynamic_bitset<> current_set(CCs.size());
75   
76    unsigned range_lo = 0;
77    unsigned next_set_index = 1;
78    for (auto & bkpt_entry : breakpoints) {
79        if (current_exclusive_set_idx > 0) {  // We have a range entry to close for a pending exclusive set.
80            unsigned range_hi = bkpt_entry.first - 1;
81            for (unsigned bit = 0; bit < multiplexed_bit_count; bit++) {
82                if (((current_exclusive_set_idx >> bit) & 1) == 1) {
83                    multiplexedCCs[bit]->insert_range(range_lo, range_hi);
84                }
85            }
86        }
87        // Start a new range.
88        range_lo = bkpt_entry.first;
89        if (range_lo > UCD::UNICODE_MAX) continue; // Nothing to do for bkpt 0x110000
90        current_set ^= bkpt_entry.second;
91        auto idx_iter = CC_set_to_exclusive_set_map.find(current_set);
92        if (idx_iter == CC_set_to_exclusive_set_map.end()) {
93            // New exclusive class; assign the next sequential integer.
94            //current_exclusive_set_idx = exclusiveSetIDs.size();
95            current_exclusive_set_idx = next_set_index;
96            next_set_index++;
97            CC_set_to_exclusive_set_map.emplace(current_set, current_exclusive_set_idx);
98           
99            for (unsigned CC1 = current_set.find_first(); CC1 < CCs.size(); CC1 = current_set.find_next(CC1)) {
100                exclusiveSetIDs[CC1].push_back(current_exclusive_set_idx);
101            }
102            if ((current_exclusive_set_idx & (current_exclusive_set_idx - 1)) == 0) {
103                multiplexed_bit_count++;
104                multiplexedCCs.push_back(re::makeCC());
105            }
106        }
107        else {
108            current_exclusive_set_idx = idx_iter->second;
109        }
110    }
111}
112
113
114
115MultiplexedAlphabet::MultiplexedAlphabet(std::string alphabetName, const std::vector<re::CC *> CCs) 
116    : Alphabet(alphabetName, ClassTypeId::MultiplexedAlphabet), mUnicodeSets(CCs) {
117        if (CCs.size() > 0) {
118            mSourceAlphabet = CCs[0]->getAlphabet();
119            for (unsigned i = 1; i < CCs.size(); i++) {
120                if (CCs[i]->getAlphabet() != mSourceAlphabet) llvm::report_fatal_error("Mismatched source alphabets for Multiplexed Alphabet");
121            }
122        }
123        cc::doMultiplexCCs(CCs, mExclusiveSetIDs, mMultiplexedCCs);
124}
125   
126re::CC * MultiplexedAlphabet::transformCC(const re::CC * sourceCC) const {
127    if (sourceCC->getAlphabet() != mSourceAlphabet) {
128        llvm::report_fatal_error("Mismatched source alphabets for transformCC");
129    }
130    const auto f = find(mUnicodeSets.begin(), mUnicodeSets.end(), sourceCC);
131    if (f == mUnicodeSets.end()) {
132        llvm::report_fatal_error(Printer_RE::PrintRE(sourceCC) + " not found");
133    }
134    const auto index = f - mUnicodeSets.begin();
135    const auto exclusive_IDs = mExclusiveSetIDs[index];
136    re::CC * CC_union = re::makeCC(this);
137    for (auto i : exclusive_IDs) {
138        CC_union = re::makeCC(CC_union, re::makeCC(i, this));
139    }
140    CC_union->sourceCC = sourceCC;
141    return CC_union;
142}
143
144re::CC * MultiplexedAlphabet::invertCC(const re::CC * transformedCC) const {
145    if (transformedCC->getAlphabet() != this) llvm::report_fatal_error("invertCC applied to non-transformed CC");
146    re::CC * CC_union = re::makeCC(mSourceAlphabet);
147    for (const UCD::interval_t i : *transformedCC) {
148        for (unsigned cp = re::lo_codepoint(i); cp <= re::hi_codepoint(i); cp++) {
149            CC_union = re::makeCC(mUnicodeSets[cp], CC_union);
150        }
151    }
152    return CC_union;
153}
154   
155
156   
157}
158
Note: See TracBrowser for help on using the repository browser.