source: icGREP/icgrep-devel/icgrep/cc/multiplex_CCs.cpp @ 6170

Last change on this file since 6170 was 6140, checked in by xwa163, 14 months ago

UTF-8 support for Multiplexing LZ4 Grep

File size: 6.5 KB
Line 
1/*
2 *  Copyright (c) 2017 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 */
5
6#include <map>
7#include <UCD/unicode_set.h>
8#include <re/re_cc.h>
9#include "boost/dynamic_bitset.hpp"
10#include <cc/multiplex_CCs.h>
11#include <re/printer_re.h>
12#include <llvm/Support/Casting.h>
13#include <llvm/Support/ErrorHandling.h>
14#include <llvm/Support/raw_ostream.h>
15
16namespace cc {
17
18//
19// Breakpoints of a set of character classes (CCs): each codepoint c such that
20// there is some CC in CCs such that either (a) c is in the CC and c-1 is not, or
21// (b) c-1 is in the CC and c is not.  Boundary cases: if codepoint 0 is in
22// some CC, then 0 is a breakpoint (codepoint -1 is not in any CC).  If codepoint
23// 0x10FFFF is in some CC then 0x110000 is a breakpoint.
24//
25// The breakpoints may be determined by iterating through the interval
26// representation of each CC.   For each interval (lo, hi), lo and hi+1
27// are breakpoints.
28//
29// For each breakpoint, a bitset is computed identifying the source CCs whose
30// status changes at the breakpoint.
31//
32
33std::map<UCD::codepoint_t, boost::dynamic_bitset<>> computeBreakpoints(const std::vector<re::CC *> & CCs) {
34    std::map<UCD::codepoint_t, boost::dynamic_bitset<>> breakpoints;
35    for (unsigned i = 0; i < CCs.size(); i++) {
36        for (const auto range : *CCs[i]) {
37            const auto lo = re::lo_codepoint(range);
38            const auto hi = re::hi_codepoint(range);
39            auto f = breakpoints.find(lo);
40            if (f == breakpoints.end()) {
41                breakpoints.emplace(lo, boost::dynamic_bitset<>(CCs.size()));
42            }
43            breakpoints[lo].set(i);
44            f = breakpoints.find(hi + 1);
45            if (f == breakpoints.end()) {
46                breakpoints.emplace(hi+1, boost::dynamic_bitset<>(CCs.size()));
47            }
48            breakpoints[hi+1].set(i);
49        }
50    }
51    return breakpoints;
52}
53
54void doMultiplexCCs(const std::vector<re::CC *> & CCs,
55                    std::vector<std::vector<unsigned>> & exclusiveSetIDs,
56                    std::vector<re::CC *> & multiplexedCCs) {
57   
58    const auto breakpoints = computeBreakpoints(CCs);
59    // Initialize the exclusiveSetIDs to have one empty vector per source CC.
60    exclusiveSetIDs.clear();
61    exclusiveSetIDs.resize(CCs.size());
62    //
63    // Exclusive set determination.   
64    //
65    // Set up a map from the set of source CCs for each exclusive set to the exclusive set index.
66
67    std::map<boost::dynamic_bitset<>, unsigned> CC_set_to_exclusive_set_map;
68
69    // Entry 0 is for the characters not in any of the CCs.
70    CC_set_to_exclusive_set_map.emplace(boost::dynamic_bitset<>(CCs.size()), 0);
71
72    unsigned current_exclusive_set_idx = 0;
73    unsigned multiplexed_bit_count = 0;
74    boost::dynamic_bitset<> current_set(CCs.size());
75   
76    unsigned range_lo = 0;
77    unsigned next_set_index = 1;
78    for (auto & bkpt_entry : breakpoints) {
79        if (current_exclusive_set_idx > 0) {  // We have a range entry to close for a pending exclusive set.
80            unsigned range_hi = bkpt_entry.first - 1;
81            for (unsigned bit = 0; bit < multiplexed_bit_count; bit++) {
82                if (((current_exclusive_set_idx >> bit) & 1) == 1) {
83                    multiplexedCCs[bit]->insert_range(range_lo, range_hi);
84                }
85            }
86        }
87        // Start a new range.
88        range_lo = bkpt_entry.first;
89        if (range_lo > UCD::UNICODE_MAX) continue; // Nothing to do for bkpt 0x110000
90        current_set ^= bkpt_entry.second;
91        auto idx_iter = CC_set_to_exclusive_set_map.find(current_set);
92        if (idx_iter == CC_set_to_exclusive_set_map.end()) {
93            // New exclusive class; assign the next sequential integer.
94            //current_exclusive_set_idx = exclusiveSetIDs.size();
95            current_exclusive_set_idx = next_set_index;
96            next_set_index++;
97            CC_set_to_exclusive_set_map.emplace(current_set, current_exclusive_set_idx);
98           
99            for (unsigned CC1 = current_set.find_first(); CC1 < CCs.size(); CC1 = current_set.find_next(CC1)) {
100                exclusiveSetIDs[CC1].push_back(current_exclusive_set_idx);
101            }
102            if ((current_exclusive_set_idx & (current_exclusive_set_idx - 1)) == 0) {
103                multiplexed_bit_count++;
104                multiplexedCCs.push_back(re::makeCC());
105            }
106        }
107        else {
108            current_exclusive_set_idx = idx_iter->second;
109        }
110    }
111}
112
113
114
115MultiplexedAlphabet::MultiplexedAlphabet(std::string alphabetName, const std::vector<re::CC *> CCs) 
116    : Alphabet(alphabetName, ClassTypeId::MultiplexedAlphabet), mUnicodeSets(CCs) {
117        if (CCs.size() > 0) {
118            mSourceAlphabet = CCs[0]->getAlphabet();
119            for (unsigned i = 1; i < CCs.size(); i++) {
120                if (CCs[i]->getAlphabet() != mSourceAlphabet) llvm::report_fatal_error("Mismatched source alphabets for Multiplexed Alphabet");
121            }
122        }
123        cc::doMultiplexCCs(CCs, mExclusiveSetIDs, mMultiplexedCCs);
124}
125
126unsigned long MultiplexedAlphabet::findTargetCCIndex(const re::CC * sourceCC) const {
127    auto f = find(mUnicodeSets.begin(), mUnicodeSets.end(), sourceCC);
128    if (f != mUnicodeSets.end()) {
129        return f - mUnicodeSets.begin();
130    }
131
132    const UCD::UnicodeSet* sourceUcd = sourceCC;
133    for (unsigned long i = 0; i < mUnicodeSets.size(); i++) {
134        UCD::UnicodeSet* t = mUnicodeSets[i];
135        if (*t == *sourceUcd) {
136            return i;
137        }
138    }
139    llvm::report_fatal_error(Printer_RE::PrintRE(sourceCC) + " not found");
140}
141
142re::CC * MultiplexedAlphabet::transformCC(const re::CC * sourceCC) const {
143    if (sourceCC->getAlphabet() != mSourceAlphabet) {
144        llvm::report_fatal_error("Mismatched source alphabets for transformCC");
145    }
146
147    const auto index = this->findTargetCCIndex(sourceCC);
148    const auto exclusive_IDs = mExclusiveSetIDs[index];
149    re::CC * CC_union = re::makeCC(this);
150    for (auto i : exclusive_IDs) {
151        CC_union = re::makeCC(CC_union, re::makeCC(i, this));
152    }
153    CC_union->sourceCC = sourceCC;
154    return CC_union;
155}
156
157re::CC * MultiplexedAlphabet::invertCC(const re::CC * transformedCC) const {
158    if (transformedCC->getAlphabet() != this) llvm::report_fatal_error("invertCC applied to non-transformed CC");
159    re::CC * CC_union = re::makeCC(mSourceAlphabet);
160    for (const UCD::interval_t i : *transformedCC) {
161        for (unsigned cp = re::lo_codepoint(i); cp <= re::hi_codepoint(i); cp++) {
162            CC_union = re::makeCC(mUnicodeSets[cp], CC_union);
163        }
164    }
165    return CC_union;
166}
167   
168
169   
170}
171
Note: See TracBrowser for help on using the repository browser.