- Timestamp: Aug 5, 2018, 6:53:07 PM (6 months ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
icGREP/icgrep-devel/icgrep/re/Unicode/decomposition.cpp
r6138 r6139 29 29 using namespace re; 30 30 31 // Constants for computation of Hangul decompositions, see Unicode Standard, section 3.12. 32 const codepoint_t Hangul_SBase = 0xAC00; 33 const codepoint_t Hangul_LBase = 0x1100; 34 const codepoint_t Hangul_VBase = 0x1161; 35 const codepoint_t Hangul_TBase = 0x11A7; 36 const unsigned Hangul_TCount = 28; 37 const unsigned Hangul_NCount = 588; 38 const unsigned Hangul_SCount = 11172; 39 static UnicodeSet HangulPrecomposed = UnicodeSet(Hangul_SBase, Hangul_SBase + Hangul_SCount - 1); 40 41 static RE * HangulDecomposition(codepoint_t cp) { 42 auto SIndex = cp - Hangul_SBase; 43 auto LIndex = SIndex / Hangul_NCount; 44 auto VIndex = (SIndex % Hangul_NCount) / Hangul_TCount; 45 auto TIndex = SIndex % Hangul_TCount; 46 auto L = makeCC(Hangul_LBase + LIndex); 47 auto V = makeCC(Hangul_VBase + VIndex); 48 if (TIndex > 0) { 49 return makeSeq({L, V, makeCC(Hangul_TBase + TIndex)}); 50 } else { 51 return makeSeq({L, V}); 52 } 53 } 54 31 55 RE * NFD_CC(CC * cc) { 32 56 std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> conv; … … 34 58 const auto & decompTypeObj = cast<EnumeratedPropertyObject>(property_object_table[dt]); 35 59 UnicodeSet canonicalMapped = decompTypeObj->GetCodepointSet(DT_ns::Can); 36 UnicodeSet mappingRequired = *cc & canonicalMapped;60 UnicodeSet mappingRequired = *cc & (canonicalMapped + HangulPrecomposed); 37 61 if (mappingRequired.empty()) return cc; 38 62 std::vector<RE *> alts; … … 40 64 for (const interval_t & i : mappingRequired) { 41 65 for (codepoint_t cp = lo_codepoint(i); cp <= hi_codepoint(i); cp++) { 42 std::u32string dms = conv.from_bytes(decompMappingObj->GetStringValue(cp)); 43 RE * dm = u32string2re(dms); 44 if (Seq * s = dyn_cast<Seq>(dm)) { 45 if (s->size() == 1) { 46 finalCC = makeCC(finalCC, cast<CC>(s->front())); 66 if (HangulPrecomposed.contains(cp)) { 67 alts.push_back(HangulDecomposition(cp)); 68 } else { 69 std::u32string dms = 
conv.from_bytes(decompMappingObj->GetStringValue(cp)); 70 RE * dm = u32string2re(dms); 71 if (Seq * s = dyn_cast<Seq>(dm)) { 72 if (s->size() == 1) { 73 finalCC = makeCC(finalCC, cast<CC>(s->front())); 74 } else { 75 alts.push_back(s); 76 } 47 77 } else { 48 alts.push_back( s);78 alts.push_back(dm); 49 79 } 50 } else {51 alts.push_back(dm);52 80 } 53 81 } … … 56 84 return makeAlt(alts.begin(), alts.end()); 57 85 } 86 58 87 59 88 RE * NFKD_CC(CC * cc) { 60 89 std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> conv; 61 90 const auto & decompMappingObj = cast<StringPropertyObject>(property_object_table[dm]); 62 UnicodeSet reflexiveSet = decompMappingObj->GetReflexiveSet() ;91 UnicodeSet reflexiveSet = decompMappingObj->GetReflexiveSet() - HangulPrecomposed; 63 92 UnicodeSet mappingRequired = *cc - reflexiveSet; 64 93 if (mappingRequired.empty()) return cc; … … 67 96 for (const interval_t & i : mappingRequired) { 68 97 for (codepoint_t cp = lo_codepoint(i); cp <= hi_codepoint(i); cp++) { 69 std::u32string dms = conv.from_bytes(decompMappingObj->GetStringValue(cp)); 70 RE * dm = u32string2re(dms); 71 if (Seq * s = dyn_cast<Seq>(dm)) { 72 if (s->size() == 1) { 73 finalCC = makeCC(finalCC, cast<CC>(s->front())); 98 if (HangulPrecomposed.contains(cp)) { 99 alts.push_back(HangulDecomposition(cp)); 100 } else { 101 std::u32string dms = conv.from_bytes(decompMappingObj->GetStringValue(cp)); 102 RE * dm = u32string2re(dms); 103 if (Seq * s = dyn_cast<Seq>(dm)) { 104 if (s->size() == 1) { 105 finalCC = makeCC(finalCC, cast<CC>(s->front())); 106 } else { 107 alts.push_back(s); 108 } 74 109 } else { 75 alts.push_back( s);110 alts.push_back(dm); 76 111 } 77 } else {78 alts.push_back(dm);79 112 } 80 113 }
Note: See TracChangeset for help on using the changeset viewer.