]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/basic/utf8.c
Add SPDX license identifiers to source files under the LGPL
[thirdparty/systemd.git] / src / basic / utf8.c
CommitLineData
53e1b683 1/* SPDX-License-Identifier: LGPL-2.1+ */
7f110ff9
LP
2/***
3 This file is part of systemd.
4
036ae95a 5 Copyright 2008-2011 Kay Sievers
7f110ff9
LP
6 Copyright 2012 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
7f110ff9
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
7f110ff9 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
7f110ff9
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
036ae95a 22/* Parts of this file are based on the GLIB utf8 validation functions. The
7f110ff9
LP
23 * original license text follows. */
24
25/* gutf8.c - Operations on UTF-8 strings.
26 *
27 * Copyright (C) 1999 Tom Tromey
28 * Copyright (C) 2000 Red Hat, Inc.
29 *
30 * This library is free software; you can redistribute it and/or
23757887 31 * modify it under the terms of the GNU Library General Public
7f110ff9
LP
32 * License as published by the Free Software Foundation; either
33 * version 2 of the License, or (at your option) any later version.
34 *
35 * This library is distributed in the hope that it will be useful,
36 * but WITHOUT ANY WARRANTY; without even the implied warranty of
23757887
SK
37 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
38 * Library General Public License for more details.
7f110ff9 39 *
23757887
SK
40 * You should have received a copy of the GNU Library General Public
41 * License along with this library; if not, write to the Free Software
42 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
7f110ff9
LP
43 */
44
45#include <errno.h>
7f110ff9 46#include <stdbool.h>
cf0fbc49
TA
47#include <stdlib.h>
48#include <string.h>
7f110ff9 49
b5efdb8a 50#include "alloc-util.h"
e4e73a63 51#include "hexdecoct.h"
11c3a366 52#include "macro.h"
7f110ff9
LP
53#include "utf8.h"
54
/* Tells whether ch is a code point this module accepts. A value is rejected if it lies at or
 * above 0x110000, is a UTF-16 surrogate (0xD800..0xDFFF), falls into the reserved range
 * 0xFDD0..0xFDEF, or has its low 16 bits equal to 0xFFFE or 0xFFFF. */
bool unichar_is_valid(char32_t ch) {
        const bool beyond_unicode = ch >= 0x110000;
        const bool utf16_surrogate = (ch & 0xFFFFF800) == 0xD800;
        const bool reserved_range = ch >= 0xFDD0 && ch <= 0xFDEF;
        const bool ends_in_fffe_or_ffff = (ch & 0xFFFE) == 0xFFFE;

        return !(beyond_unicode || utf16_surrogate || reserved_range || ends_in_fffe_or_ffff);
}
68
/* Tells whether ch is a control character: anything below ' ' (the C0 range), DEL (0x7F), or
 * 0x80..0x9F (the C1 range). '\t' and '\n' sit in the C0 range but are deliberately not
 * reported as control characters, since they are harmless and commonly used. */
static bool unichar_is_control(char32_t ch) {
        if (ch == '\t' || ch == '\n')
                return false;

        if (ch < ' ')
                return true;

        return ch >= 0x7F && ch <= 0x9F;
}
80
7991ac34
DR
81/* count of characters used to encode one unicode char */
82static int utf8_encoded_expected_len(const char *str) {
7e8185ef 83 unsigned char c;
ba961854 84
7e8185ef
LP
85 assert(str);
86
87 c = (unsigned char) str[0];
7991ac34
DR
88 if (c < 0x80)
89 return 1;
90 if ((c & 0xe0) == 0xc0)
91 return 2;
92 if ((c & 0xf0) == 0xe0)
93 return 3;
94 if ((c & 0xf8) == 0xf0)
95 return 4;
96 if ((c & 0xfc) == 0xf8)
97 return 5;
98 if ((c & 0xfe) == 0xfc)
99 return 6;
7e8185ef 100
7991ac34
DR
101 return 0;
102}
ba961854 103
/* Decodes the single UTF-8 sequence starting at str and stores the resulting value in
 * *ret_unichar.
 *
 * Returns 0 on success, or -EINVAL if the first byte cannot start a sequence or if one of
 * the following bytes is not a continuation byte (10xxxxxx). Only the byte structure is
 * checked here: the decoded value is not tested for range or for overlong encoding.
 * *ret_unichar is left untouched on failure. */
int utf8_encoded_to_unichar(const char *str, char32_t *ret_unichar) {
        /* Payload bits carried by the first byte, indexed by the total sequence length */
        static const unsigned char lead_payload[] = { 0x00, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
        const unsigned char *bytes;
        char32_t cp;
        int n_bytes, k;

        assert(str);

        n_bytes = utf8_encoded_expected_len(str);
        if (n_bytes < 1 || n_bytes > 6)
                return -EINVAL;

        bytes = (const unsigned char *) str;
        cp = bytes[0] & lead_payload[n_bytes];

        /* Every continuation byte contributes six more bits */
        for (k = 1; k < n_bytes; k++) {
                if ((bytes[k] & 0xc0) != 0x80)
                        return -EINVAL;

                cp = (cp << 6) | (bytes[k] & 0x3f);
        }

        *ret_unichar = cp;
        return 0;
}
147
/* Checks whether the first `length` bytes of str form valid UTF-8 consisting only of
 * non-control characters.
 *
 * str:     buffer to inspect; it does not have to be NUL-terminated
 * length:  number of bytes of str to inspect
 * newline: if false, '\n' is rejected as well
 *
 * Returns true if every sequence is valid, lies entirely within the buffer and decodes to an
 * acceptable character; false otherwise. An empty buffer (length == 0) yields true. */
bool utf8_is_printable_newline(const char* str, size_t length, bool newline) {
        const char *p;

        assert(str);

        for (p = str; length > 0;) {
                int expected_len, encoded_len, r;
                char32_t val;

                /* The full validation below reads every byte of the sequence. Look at the lead
                 * byte alone first and make sure the sequence fits into what is left of the
                 * buffer, so that we never read past `length`. */
                expected_len = utf8_encoded_expected_len(p);
                if (expected_len <= 0 ||
                    (size_t) expected_len > length)
                        return false;

                encoded_len = utf8_encoded_valid_unichar(p);
                if (encoded_len < 0 ||
                    (size_t) encoded_len > length)
                        return false;

                r = utf8_encoded_to_unichar(p, &val);
                if (r < 0 ||
                    unichar_is_control(val) ||
                    (!newline && val == '\n'))
                        return false;

                length -= encoded_len;
                p += encoded_len;
        }

        return true;
}
174
7991ac34
DR
175const char *utf8_is_valid(const char *str) {
176 const uint8_t *p;
7f110ff9
LP
177
178 assert(str);
179
7991ac34 180 for (p = (const uint8_t*) str; *p; ) {
faaa5728
LP
181 int len;
182
183 len = utf8_encoded_valid_unichar((const char *)p);
7991ac34
DR
184 if (len < 0)
185 return NULL;
186
187 p += len;
188 }
7f110ff9 189
7991ac34 190 return str;
7f110ff9
LP
191}
192
550a40ec
ZJS
193char *utf8_escape_invalid(const char *str) {
194 char *p, *s;
195
196 assert(str);
197
198 p = s = malloc(strlen(str) * 4 + 1);
199 if (!p)
200 return NULL;
201
202 while (*str) {
203 int len;
204
205 len = utf8_encoded_valid_unichar(str);
206 if (len > 0) {
207 s = mempcpy(s, str, len);
208 str += len;
209 } else {
3c6d3052 210 s = stpcpy(s, UTF8_REPLACEMENT_CHARACTER);
550a40ec
ZJS
211 str += 1;
212 }
213 }
7e8185ef 214
550a40ec
ZJS
215 *s = '\0';
216
217 return p;
218}
219
fec84576
WC
220char *utf8_escape_non_printable(const char *str) {
221 char *p, *s;
222
223 assert(str);
224
225 p = s = malloc(strlen(str) * 4 + 1);
226 if (!p)
227 return NULL;
228
229 while (*str) {
230 int len;
231
232 len = utf8_encoded_valid_unichar(str);
233 if (len > 0) {
234 if (utf8_is_printable(str, len)) {
235 s = mempcpy(s, str, len);
236 str += len;
237 } else {
3c6d3052 238 while (len > 0) {
fec84576
WC
239 *(s++) = '\\';
240 *(s++) = 'x';
241 *(s++) = hexchar((int) *str >> 4);
242 *(s++) = hexchar((int) *str);
fec84576 243
3c6d3052 244 str += 1;
313cefa1 245 len--;
3c6d3052 246 }
fec84576
WC
247 }
248 } else {
3c6d3052 249 s = stpcpy(s, UTF8_REPLACEMENT_CHARACTER);
fec84576
WC
250 str += 1;
251 }
252 }
253
254 *s = '\0';
255
256 return p;
257}
258
7f110ff9
LP
259char *ascii_is_valid(const char *str) {
260 const char *p;
261
262 assert(str);
263
264 for (p = str; *p; p++)
265 if ((unsigned char) *p >= 128)
266 return NULL;
267
268 return (char*) str;
269}
270
2bb4c7e3
TG
/**
 * utf8_encode_unichar() - Encode single UCS-4 character as UTF-8
 * @out_utf8: output buffer of at least 4 bytes or NULL
 * @g: UCS-4 character to encode
 *
 * Writes the UTF-8 form of @g into @out_utf8 without appending a terminating
 * NUL byte. If @out_utf8 is NULL nothing is written and only the length is
 * computed.
 *
 * Returns: The number of bytes (1 to 4) the UTF-8 representation does or would
 * occupy, or 0 if @g is 0x200000 or larger and hence cannot be encoded in four
 * bytes.
 */
size_t utf8_encode_unichar(char *out_utf8, char32_t g) {
        /* Marker bits and payload mask of the first byte, indexed by sequence length */
        static const unsigned char lead_marker[] = { 0x00, 0x00, 0xc0, 0xe0, 0xf0 };
        static const unsigned char lead_payload[] = { 0x00, 0x7f, 0x1f, 0x0f, 0x07 };
        size_t n_bytes, k;

        if (g < 0x80)
                n_bytes = 1;
        else if (g < 0x800)
                n_bytes = 2;
        else if (g < 0x10000)
                n_bytes = 3;
        else if (g < 0x200000)
                n_bytes = 4;
        else
                return 0;

        if (!out_utf8)
                return n_bytes;

        /* Fill the continuation bytes from the last one backwards, six bits at a time */
        for (k = n_bytes - 1; k > 0; k--) {
                out_utf8[k] = 0x80 | (g & 0x3f);
                g >>= 6;
        }

        /* Whatever is left of g goes into the first byte, below the length marker */
        out_utf8[0] = lead_marker[n_bytes] | (g & lead_payload[n_bytes]);

        return n_bytes;
}
314
2e3d0692 315char *utf16_to_utf8(const void *s, size_t length) {
2e3d0692 316 const uint8_t *f;
e7eebcfc 317 char *r, *t;
2e3d0692 318
04166cb7 319 r = new(char, (length * 4 + 1) / 2 + 1);
2e3d0692
LP
320 if (!r)
321 return NULL;
322
04166cb7
TG
323 f = s;
324 t = r;
325
326 while (f < (const uint8_t*) s + length) {
c932fb71 327 char16_t w1, w2;
04166cb7
TG
328
329 /* see RFC 2781 section 2.2 */
330
331 w1 = f[1] << 8 | f[0];
332 f += 2;
333
334 if (!utf16_is_surrogate(w1)) {
dcd12626 335 t += utf8_encode_unichar(t, w1);
04166cb7
TG
336
337 continue;
338 }
339
340 if (utf16_is_trailing_surrogate(w1))
341 continue;
342 else if (f >= (const uint8_t*) s + length)
343 break;
344
345 w2 = f[1] << 8 | f[0];
346 f += 2;
347
348 if (!utf16_is_trailing_surrogate(w2)) {
349 f -= 2;
350 continue;
351 }
352
353 t += utf8_encode_unichar(t, utf16_surrogate_pair_to_unichar(w1, w2));
354 }
2e3d0692
LP
355
356 *t = 0;
7b4d7cc0 357 return r;
2e3d0692 358}
02a36bc9 359
/* Number of bytes (1 to 6) that encoding the given value takes. Values of 0x200000 and above
 * map to the 5- and 6-byte forms. */
static int utf8_unichar_to_encoded_len(char32_t unichar) {
        /* limits[k] is the first value that no longer fits into k + 1 bytes */
        static const char32_t limits[] = { 0x80, 0x800, 0x10000, 0x200000, 0x4000000 };
        int k;

        for (k = 0; k < 5; k++)
                if (unichar < limits[k])
                        return k + 1;

        return 6;
}
376
/* Validates the single encoded character starting at str.
 *
 * Returns the length of the sequence in bytes if it is valid, or a negative errno-style value
 * (-EINVAL) if the first byte cannot start a sequence, a byte of the sequence is missing or
 * malformed, the value is encoded with more bytes than necessary, or the decoded value is
 * rejected by unichar_is_valid(). */
int utf8_encoded_valid_unichar(const char *str) {
        char32_t decoded;
        int expected, k, r;

        assert(str);

        expected = utf8_encoded_expected_len(str);
        switch (expected) {
        case 0:
                return -EINVAL;
        case 1:
                /* ascii is valid */
                return 1;
        default:
                break;
        }

        /* Every byte of a multi-byte sequence has its top bit set; a NUL or plain ASCII byte
         * in between means the sequence was cut short. */
        for (k = 0; k < expected; k++)
                if (!(str[k] & 0x80))
                        return -EINVAL;

        r = utf8_encoded_to_unichar(str, &decoded);
        if (r < 0)
                return r;

        /* Refuse overlong encodings (length does not match the value) and values outside the
         * accepted range. */
        if (utf8_unichar_to_encoded_len(decoded) != expected ||
            !unichar_is_valid(decoded))
                return -EINVAL;

        return expected;
}