]>
Commit | Line | Data |
---|---|---|
767698ff | 1 | // Copyright (C) 2020-2024 Free Software Foundation, Inc. |
619f1874 RT |
2 | |
3 | // This file is part of GCC. | |
4 | ||
5 | // GCC is free software; you can redistribute it and/or modify it under | |
6 | // the terms of the GNU General Public License as published by the Free | |
7 | // Software Foundation; either version 3, or (at your option) any later | |
8 | // version. | |
9 | ||
10 | // GCC is distributed in the hope that it will be useful, but WITHOUT ANY | |
11 | // WARRANTY; without even the implied warranty of MERCHANTABILITY or | |
12 | // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License | |
13 | // for more details. | |
14 | ||
15 | // You should have received a copy of the GNU General Public License | |
16 | // along with GCC; see the file COPYING3. If not see | |
17 | // <http://www.gnu.org/licenses/>. | |
18 | ||
19 | // This file provides functions for punycode conversion | |
20 | // See https://datatracker.ietf.org/doc/html/rfc3492 | |
21 | ||
22 | #include "rust-system.h" | |
23 | #include "rust-unicode.h" | |
24 | #include "optional.h" | |
25 | #include "selftest.h" | |
26 | ||
27 | namespace Rust { | |
28 | ||
// Bootstring parameters used by Punycode.
// https://tools.ietf.org/html/rfc3492#section-4.
// The concrete values are the ones listed in
// https://tools.ietf.org/html/rfc3492#section-5.

// Number of distinct digit values; see encode_digit.
constexpr uint32_t BASE = 36;
// Lower and upper clamp for the per-digit threshold computed while encoding.
constexpr uint32_t TMIN = 1;
constexpr uint32_t TMAX = 26;
// Constants consumed by adapt_bias.
constexpr uint32_t SKEW = 38;
constexpr uint32_t DAMP = 700;
// Starting values of the encoder's bias and current code point.
constexpr uint32_t INITIAL_BIAS = 72;
constexpr uint32_t INITIAL_N = 128;
// Appended after the ASCII prefix when that prefix is not empty.
constexpr char DELIMITER = '-';
38 | ||
619f1874 RT |
39 | std::string |
40 | extract_basic_string (const std::vector<Codepoint> &src) | |
41 | { | |
42 | std::string basic_string; | |
43 | for (auto c : src) | |
44 | { | |
52ad16ef | 45 | if (c.is_ascii ()) |
619f1874 RT |
46 | basic_string += c.as_string (); |
47 | } | |
48 | return basic_string; | |
49 | } | |
50 | ||
51 | uint32_t | |
52 | adapt_bias (uint32_t delta, const uint32_t n_points, const bool is_first) | |
53 | { | |
54 | delta /= is_first ? DAMP : 2; | |
55 | delta += delta / n_points; | |
56 | uint32_t k = 0; | |
57 | ||
58 | while (delta > (BASE - TMIN) * TMAX / 2) | |
59 | { | |
60 | delta /= BASE - TMIN; | |
61 | k += BASE; | |
62 | } | |
63 | return k + (BASE - TMIN + 1) * delta / (delta + SKEW); | |
64 | } | |
65 | ||
// Computes LHS - RHS restricted to the range [MIN, MAX] without ever forming
// a negative intermediate value.
//
// Returns MIN when LHS <= MIN + RHS, MAX when LHS >= MAX + RHS, and
// LHS - RHS otherwise.  The sums are evaluated in 32-bit unsigned arithmetic.
uint32_t
clamped_sub (const uint32_t min, const uint32_t lhs, const uint32_t rhs,
             const uint32_t max)
{
  const uint32_t low_edge = min + rhs;
  if (lhs <= low_edge)
    return min;

  const uint32_t high_edge = max + rhs;
  if (lhs >= high_edge)
    return max;

  return lhs - rhs;
}
77 | ||
78 | uint32_t | |
79 | min_gt_or_eq (const std::vector<Codepoint> &l, const uint32_t threshold) | |
80 | { | |
81 | uint32_t min = UINT32_MAX; | |
82 | for (auto c : l) | |
83 | if (c.value >= threshold && c.value < min) | |
84 | min = c.value; | |
85 | return min; | |
86 | } | |
87 | ||
// Maps the digit value D to the character that represents it in the encoded
// output: 0..25 become 'a'..'z' and 26..35 become '0'..'9' (ASCII codes).
char
encode_digit (const uint32_t d)
{
  constexpr uint32_t ascii_lower_a = 97; // 'a'
  constexpr uint32_t ascii_zero = 48;	 // '0'

  if (d < 26)
    return static_cast<char> (ascii_lower_a + d);
  return static_cast<char> (ascii_zero + (d - 26));
}
93 | ||
94 | tl::optional<std::string> | |
95 | encode_punycode (const Utf8String &input) | |
96 | { | |
97 | std::vector<Codepoint> input_chars = input.get_chars (); | |
98 | ||
99 | uint32_t n = INITIAL_N; | |
100 | uint32_t delta = 0; | |
101 | uint32_t bias = INITIAL_BIAS; | |
102 | ||
103 | std::string output = extract_basic_string (input_chars); | |
104 | uint32_t h = output.size (); | |
105 | const uint32_t b = h; | |
106 | if (b > 0) | |
107 | output += DELIMITER; | |
108 | ||
109 | while (h < input_chars.size ()) | |
110 | { | |
111 | const uint32_t m = min_gt_or_eq (input_chars, n); | |
112 | ||
113 | if (m - n > ((UINT32_MAX - delta) / (h + 1))) | |
114 | return tl::nullopt; | |
115 | ||
116 | delta += (m - n) * (h + 1); | |
117 | n = m; | |
118 | ||
119 | for (const auto c : input_chars) | |
120 | { | |
121 | if (c.value < n) | |
122 | delta++; | |
123 | else if (c.value == n) | |
124 | { | |
125 | uint32_t q = delta; | |
126 | // encode as a variable length integer | |
127 | for (uint32_t k = 1;; k++) | |
128 | { | |
129 | const uint32_t kb = k * BASE; | |
130 | const uint32_t t = clamped_sub (TMIN, kb, bias, TMAX); | |
131 | if (q < t) | |
132 | break; | |
133 | ||
134 | output += encode_digit (t + (q - t) % (BASE - t)); | |
135 | q = (q - t) / (BASE - t); | |
136 | } | |
137 | output += encode_digit (q); | |
138 | ||
139 | bias = adapt_bias (delta, h + 1, h == b); | |
140 | delta = 0; | |
141 | h++; | |
142 | } | |
143 | } | |
144 | delta++; | |
145 | n++; | |
146 | } | |
147 | ||
148 | return {output}; | |
149 | } | |
150 | ||
151 | } // namespace Rust | |
152 | ||
0ebb0a75 OA |
153 | #if CHECKING_P |
154 | ||
619f1874 RT |
155 | namespace selftest { |
156 | ||
157 | void | |
158 | encode_assert (const std::string &input, const std::string &expected) | |
159 | { | |
160 | Rust::Utf8String input_utf8 | |
161 | = Rust::Utf8String::make_utf8_string (input).value (); | |
162 | std::string actual = Rust::encode_punycode (input_utf8).value (); | |
163 | ASSERT_EQ (actual, expected); | |
164 | } | |
165 | ||
166 | void | |
167 | rust_punycode_encode_test () | |
168 | { | |
169 | encode_assert ("abc", "abc-"); | |
170 | encode_assert ("12345", "12345-"); | |
171 | encode_assert ("香港", "j6w193g"); | |
172 | ||
173 | // Examples from https://datatracker.ietf.org/doc/html/rfc3492#section-7.1 | |
174 | encode_assert ("ليهمابتكلموشعربي؟", "egbpdaj6bu4bxfgehfvwxn"); | |
175 | encode_assert ("他们为什么不说中文", "ihqwcrb4cv8a8dqg056pqjye"); | |
176 | encode_assert ("他們爲什麽不說中文", "ihqwctvzc91f659drss3x8bo0yb"); | |
177 | encode_assert ("Pročprostěnemluvíčesky", "Proprostnemluvesky-uyb24dma41a"); | |
178 | } | |
179 | ||
180 | } // namespace selftest | |
0ebb0a75 OA |
181 | |
182 | #endif // CHECKING_P |