1 | /* |
---|
2 | * Copyright (c) 2014 International Characters. |
---|
3 | * This software is licensed to the public under the Open Software License 3.0. |
---|
4 | * icgrep is a trademark of International Characters. |
---|
5 | */ |
---|
6 | |
---|
7 | #include <utf16_encoder.h> |
---|
8 | #include <assert.h> |
---|
9 | #include <algorithm> |
---|
10 | #include <stdexcept> |
---|
11 | |
---|
12 | using namespace UCD; |
---|
13 | |
---|
14 | namespace cc { |
---|
15 | |
---|
16 | bool UTF16_Encoder::isHi_Surrogate(const codepoint_t cp) { |
---|
17 | return (cp >= 0xD800) && (cp <= 0xDBFF); |
---|
18 | } |
---|
19 | |
---|
20 | bool UTF16_Encoder::isLo_Surrogate(const codepoint_t cp) { |
---|
21 | return (cp >= 0xDC00) && (cp <= 0xDFFF); |
---|
22 | } |
---|
23 | |
---|
24 | codepoint_t UTF16_Encoder::encodingByte(const codepoint_t cp, const unsigned n) { |
---|
25 | codepoint_t retVal = 0; |
---|
26 | const unsigned len = length(cp); |
---|
27 | if (len == 1) { |
---|
28 | retVal = cp; |
---|
29 | } |
---|
30 | else { |
---|
31 | codepoint_t code = cp - 0x010000; |
---|
32 | if (n == 1) { |
---|
33 | retVal = (code >> 10) | 0xD800; |
---|
34 | } |
---|
35 | if (n == 2) { |
---|
36 | retVal = (code & 0x3FF) | 0xDC00; |
---|
37 | } |
---|
38 | } |
---|
39 | return retVal; |
---|
40 | } |
---|
41 | |
---|
42 | unsigned UTF16_Encoder::length(const codepoint_t cp) { |
---|
43 | if (cp <= 0xFFFF) { |
---|
44 | return 1; |
---|
45 | } |
---|
46 | else { |
---|
47 | return 2; |
---|
48 | } |
---|
49 | } |
---|
50 | |
---|
51 | codepoint_t UTF16_Encoder::maxCodePoint(const unsigned length) { |
---|
52 | if (length == 1) { |
---|
53 | return 0xFFFF; |
---|
54 | } |
---|
55 | else if (length == 2) { |
---|
56 | return 0x10FFFF; |
---|
57 | } |
---|
58 | throw std::runtime_error("Unexpected UTF16 Length: " + std::to_string(length)); |
---|
59 | } |
---|
60 | |
---|
61 | bool UTF16_Encoder::isLowCodePointAfterByte(const codepoint_t cp, const unsigned n) { |
---|
62 | const auto l = length(cp); |
---|
63 | for (auto i = n; i != l; ++i) { |
---|
64 | if (encodingByte(cp, i + 1) != 0xDC00) { |
---|
65 | return false; |
---|
66 | } |
---|
67 | } |
---|
68 | return true; |
---|
69 | } |
---|
70 | |
---|
71 | bool UTF16_Encoder::isHighCodePointAfterByte(const codepoint_t cp, const unsigned n) { |
---|
72 | const auto l = length(cp); |
---|
73 | for (auto i = n; i != l; ++i) { |
---|
74 | if (encodingByte(cp, i + 1) != 0xDFFF) { |
---|
75 | return false; |
---|
76 | } |
---|
77 | } |
---|
78 | return true; |
---|
79 | } |
---|
80 | |
---|
81 | codepoint_t UTF16_Encoder::minCodePointWithCommonBytes(const codepoint_t cp, const unsigned n) { |
---|
82 | const auto len = length(cp); |
---|
83 | const auto mask = (static_cast<codepoint_t>(1) << (len - n) * 10) - 1; |
---|
84 | const auto lo_cp = cp &~ mask; |
---|
85 | return (lo_cp == 0) ? mask + 1 : lo_cp; |
---|
86 | } |
---|
87 | |
---|
88 | codepoint_t UTF16_Encoder::maxCodePointWithCommonBytes(const codepoint_t cp, const unsigned n) { |
---|
89 | const auto len = length(cp); |
---|
90 | const auto mask = (static_cast<codepoint_t>(1) << (len - n) * 10) - 1; |
---|
91 | return cp | mask; |
---|
92 | } |
---|
93 | |
---|
94 | } |
---|