]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/basic/utf8.c
Merge pull request #32942 from yuwata/test-journal-sync-more
[thirdparty/systemd.git] / src / basic / utf8.c
CommitLineData
6952ebae 1/* SPDX-License-Identifier: LGPL-2.0-or-later */
7f110ff9 2
6952ebae 3/* Parts of this file are based on the GLIB utf8 validation functions. The original copyright follows.
7f110ff9 4 *
6952ebae 5 * gutf8.c - Operations on UTF-8 strings.
7f110ff9
LP
6 * Copyright (C) 1999 Tom Tromey
7 * Copyright (C) 2000 Red Hat, Inc.
7f110ff9
LP
8 */
9
10#include <errno.h>
7f110ff9 11#include <stdbool.h>
cf0fbc49 12#include <stdlib.h>
7f110ff9 13
b5efdb8a 14#include "alloc-util.h"
3f536d5b 15#include "gunicode.h"
e4e73a63 16#include "hexdecoct.h"
11c3a366 17#include "macro.h"
4c6d5139 18#include "string-util.h"
7f110ff9
LP
19#include "utf8.h"
20
/* Tells whether ch is acceptable as a decoded character: it has to lie inside the Unicode code space and
 * must be neither a UTF-16 surrogate nor one of the values rejected below. */
bool unichar_is_valid(char32_t ch) {
        const bool beyond_unicode = ch >= 0x110000;                /* End of unicode space */
        const bool utf16_surrogate = (ch & 0xFFFFF800) == 0xD800;  /* Reserved area for UTF-16 */
        const bool reserved = ch >= 0xFDD0 && ch <= 0xFDEF;        /* Reserved */
        const bool low_fffe_or_ffff = (ch & 0xFFFE) == 0xFFFE;     /* Low 16 bits are 0xFFFE or 0xFFFF */

        return !(beyond_unicode || utf16_surrogate || reserved || low_fffe_or_ffff);
}
34
/* Tells whether ch is a control character we do not want to print.
 *
 * 0 to ' '-1 is the C0 range; DEL=0x7F, and DEL+1 to 0x9F is the C1 range. '\t' and '\n' are in the C0
 * range too, but are exempted here and hence not reported as control characters. */
static bool unichar_is_control(char32_t ch) {
        if (ch < ' ')
                return ch != '\t' && ch != '\n';

        return ch >= 0x7F && ch <= 0x9F;
}
46
/* Number of bytes the sequence starting with lead byte c is expected to occupy (1…6), derived from the
 * count of leading 1 bits of c. Returns 0 if c cannot start a sequence (a continuation byte 10xxxxxx, or
 * 0xfe/0xff). */
static size_t utf8_encoded_expected_len(uint8_t c) {
        size_t n_leading_ones = 0;

        for (uint8_t mask = 0x80; mask != 0 && (c & mask) != 0; mask >>= 1)
                n_leading_ones++;

        switch (n_leading_ones) {
        case 0:
                return 1; /* 0xxxxxxx: plain ASCII */
        case 2:
        case 3:
        case 4:
        case 5:
        case 6:
                return n_leading_ones;
        default:
                return 0; /* 1, 7 or 8 leading ones: not a valid lead byte */
        }
}
ba961854 64
/* Decodes the single UTF-8 sequence that starts at str and stores the resulting value in *ret_unichar.
 *
 * Returns the number of bytes consumed (1…6), or -EINVAL if the lead byte is not a valid lead byte or one
 * of the following bytes is not a continuation byte (10xxxxxx). Only the bit layout is checked here; the
 * decoded value itself is not validated. On error *ret_unichar is left untouched. */
int utf8_encoded_to_unichar(const char *str, char32_t *ret_unichar) {
        /* Payload bits carried by the lead byte, indexed by the total sequence length. */
        static const uint8_t lead_payload[] = { 0, 0, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
        char32_t acc;
        size_t n, i;

        assert(str);

        n = utf8_encoded_expected_len(str[0]);
        if (n == 0 || n > 6)
                return -EINVAL;

        if (n == 1) {
                *ret_unichar = (char32_t) str[0];
                return 1;
        }

        acc = (char32_t) str[0] & lead_payload[n];

        /* Bytes are inspected strictly in order, so that we stop at the first byte that is not a
         * continuation byte (which includes the terminating NUL) and never look past it. */
        i = 1;
        while (i < n) {
                char32_t b = (char32_t) str[i];

                if ((b & 0xc0) != 0x80)
                        return -EINVAL;

                acc = (acc << 6) | (b & 0x3f);
                i++;
        }

        *ret_unichar = acc;
        return n;
}
108
/* Checks whether the first length bytes of str consist solely of valid UTF-8 sequences, none of which is
 * a control character (as per unichar_is_control()). A '\n' is only accepted if allow_newline is true. */
bool utf8_is_printable_newline(const char* str, size_t length, bool allow_newline) {
        const char *cursor;
        size_t remaining;

        assert(str);

        cursor = str;
        remaining = length;

        while (remaining > 0) {
                char32_t c;
                int k;

                k = utf8_encoded_valid_unichar(cursor, remaining);
                if (k < 0)
                        return false;
                assert(k > 0 && (size_t) k <= remaining);

                if (utf8_encoded_to_unichar(cursor, &c) < 0)
                        return false;
                if (unichar_is_control(c))
                        return false;
                if (c == '\n' && !allow_newline)
                        return false;

                cursor += k;
                remaining -= k;
        }

        return true;
}
132
80ab31a4
ZJS
/* Checks whether str is composed of valid UTF-8 sequences only. If len_bytes is SIZE_MAX the string is
 * scanned up to its terminating NUL; otherwise exactly len_bytes bytes are examined and a NUL byte inside
 * that range makes the check fail. Returns str on success and NULL otherwise. */
char *utf8_is_valid_n(const char *str, size_t len_bytes) {
        const bool bounded = len_bytes != SIZE_MAX;
        const char *p;

        assert(str);

        p = str;
        for (;;) {
                size_t left;
                int k;

                if (bounded) {
                        if ((size_t) (p - str) >= len_bytes)
                                break; /* consumed the whole range */

                        if (_unlikely_(*p == '\0'))
                                return NULL; /* embedded NUL */

                        left = len_bytes - (p - str);
                } else {
                        if (*p == '\0')
                                break; /* reached the end of the string */

                        left = SIZE_MAX;
                }

                k = utf8_encoded_valid_unichar(p, left);
                if (_unlikely_(k < 0))
                        return NULL; /* invalid character */

                p += k;
        }

        return (char*) str;
}
155
550a40ec
ZJS
156char *utf8_escape_invalid(const char *str) {
157 char *p, *s;
158
159 assert(str);
160
161 p = s = malloc(strlen(str) * 4 + 1);
162 if (!p)
163 return NULL;
164
165 while (*str) {
166 int len;
167
f5fbe71d 168 len = utf8_encoded_valid_unichar(str, SIZE_MAX);
550a40ec
ZJS
169 if (len > 0) {
170 s = mempcpy(s, str, len);
171 str += len;
172 } else {
3c6d3052 173 s = stpcpy(s, UTF8_REPLACEMENT_CHARACTER);
550a40ec
ZJS
174 str += 1;
175 }
176 }
7e8185ef 177
550a40ec 178 *s = '\0';
523e1b14 179 return str_realloc(p);
550a40ec
ZJS
180}
181
da88f542
ZJS
/* Estimates how many console cells the character encoded at the start of str occupies: 2 if
 * unichar_iswide() says so, 1 otherwise. Returns a negative errno-style value if the sequence cannot be
 * decoded. */
static int utf8_char_console_width(const char *str) {
        char32_t codepoint;
        int r = utf8_encoded_to_unichar(str, &codepoint);

        if (r < 0)
                return r;

        /* TODO: we should detect combining characters */

        if (unichar_iswide(codepoint))
                return 2;

        return 1;
}
194
fc96e5c0 195char *utf8_escape_non_printable_full(const char *str, size_t console_width, bool force_ellipsis) {
da88f542
ZJS
196 char *p, *s, *prev_s;
197 size_t n = 0; /* estimated print width */
fec84576
WC
198
199 assert(str);
200
da88f542
ZJS
201 if (console_width == 0)
202 return strdup("");
203
204 p = s = prev_s = malloc(strlen(str) * 4 + 1);
fec84576
WC
205 if (!p)
206 return NULL;
207
da88f542 208 for (;;) {
fec84576 209 int len;
da88f542
ZJS
210 char *saved_s = s;
211
fc96e5c0
ZJS
212 if (!*str) { /* done! */
213 if (force_ellipsis)
214 goto truncation;
215 else
216 goto finish;
217 }
fec84576 218
f5fbe71d 219 len = utf8_encoded_valid_unichar(str, SIZE_MAX);
fec84576
WC
220 if (len > 0) {
221 if (utf8_is_printable(str, len)) {
da88f542
ZJS
222 int w;
223
224 w = utf8_char_console_width(str);
225 assert(w >= 0);
226 if (n + w > console_width)
227 goto truncation;
228
fec84576
WC
229 s = mempcpy(s, str, len);
230 str += len;
da88f542
ZJS
231 n += w;
232
fec84576 233 } else {
da88f542
ZJS
234 for (; len > 0; len--) {
235 if (n + 4 > console_width)
236 goto truncation;
237
fec84576
WC
238 *(s++) = '\\';
239 *(s++) = 'x';
240 *(s++) = hexchar((int) *str >> 4);
241 *(s++) = hexchar((int) *str);
fec84576 242
3c6d3052 243 str += 1;
da88f542 244 n += 4;
3c6d3052 245 }
fec84576
WC
246 }
247 } else {
da88f542
ZJS
248 if (n + 1 > console_width)
249 goto truncation;
250
251 s = mempcpy(s, UTF8_REPLACEMENT_CHARACTER, strlen(UTF8_REPLACEMENT_CHARACTER));
fec84576 252 str += 1;
da88f542 253 n += 1;
fec84576 254 }
da88f542
ZJS
255
256 prev_s = saved_s;
fec84576
WC
257 }
258
da88f542
ZJS
259 truncation:
260 /* Try to go back one if we don't have enough space for the ellipsis */
fc96e5c0 261 if (n + 1 > console_width)
da88f542
ZJS
262 s = prev_s;
263
264 s = mempcpy(s, "…", strlen("…"));
fec84576 265
da88f542
ZJS
266 finish:
267 *s = '\0';
523e1b14 268 return str_realloc(p);
fec84576
WC
269}
270
/* Checks whether the NUL-terminated string consists of ASCII bytes only, i.e. of values between 0 and
 * 127, inclusive. Returns str if so, NULL otherwise. */
char *ascii_is_valid(const char *str) {
        const unsigned char *b;

        assert(str);

        b = (const unsigned char*) str;
        while (*b != 0) {
                if (*b > 127)
                        return NULL;
                b++;
        }

        return (char*) str;
}
283
/* Like ascii_is_valid(), but looks at exactly len bytes and additionally rejects any NUL byte within that
 * range. Returns str on success, NULL otherwise. */
char *ascii_is_valid_n(const char *str, size_t len) {
        size_t left = len;

        assert(str);

        for (const char *p = str; left > 0; p++, left--) {
                unsigned char b = (unsigned char) *p;

                if (b == 0 || b > 127)
                        return NULL;
        }

        return (char*) str;
}
296
9b49a3b4
ZJS
297int utf8_to_ascii(const char *str, char replacement_char, char **ret) {
298 /* Convert to a string that has only ASCII chars, replacing anything that is not ASCII
299 * by replacement_char. */
300
301 _cleanup_free_ char *ans = new(char, strlen(str) + 1);
302 if (!ans)
303 return -ENOMEM;
304
305 char *q = ans;
306
307 for (const char *p = str; *p; q++) {
308 int l;
309
310 l = utf8_encoded_valid_unichar(p, SIZE_MAX);
311 if (l < 0) /* Non-UTF-8, let's not even try to propagate the garbage */
312 return l;
313
314 if (l == 1)
315 *q = *p;
316 else
317 /* non-ASCII, we need to replace it */
318 *q = replacement_char;
319
320 p += l;
321 }
322 *q = '\0';
323
324 *ret = TAKE_PTR(ans);
325 return 0;
326}
327
2bb4c7e3
TG
/**
 * utf8_encode_unichar() - Encode single UCS-4 character as UTF-8
 * @out_utf8: output buffer of at least 4 bytes or NULL
 * @g: UCS-4 character to encode
 *
 * Writes the UTF-8 representation of @g into @out_utf8, without zero-terminating
 * it. If @out_utf8 is NULL nothing is written and only the length is computed.
 * The value of @g is not checked beyond its magnitude.
 *
 * Returns: The number of bytes (1 to 4) the UTF-8 representation does or would
 * occupy, or 0 if @g is 0x200000 or larger and hence does not fit into 4 bytes.
 */
size_t utf8_encode_unichar(char *out_utf8, char32_t g) {
        /* Marker bits and payload mask of the lead byte, indexed by sequence length. */
        static const unsigned char lead_marker[] = { 0, 0, 0xc0, 0xe0, 0xf0 };
        static const unsigned char lead_payload[] = { 0, 0, 0x1f, 0x0f, 0x07 };
        size_t n;

        if (g < 0x80)
                n = 1;
        else if (g < 0x800)
                n = 2;
        else if (g < 0x10000)
                n = 3;
        else if (g < 0x200000)
                n = 4;
        else
                return 0;

        if (!out_utf8)
                return n;

        if (n == 1) {
                out_utf8[0] = g & 0x7f;
                return n;
        }

        /* Fill in the continuation bytes back to front, 6 payload bits each; whatever remains in g
         * afterwards belongs into the lead byte. */
        for (size_t i = n - 1; i > 0; i--) {
                out_utf8[i] = 0x80 | (g & 0x3f);
                g >>= 6;
        }
        out_utf8[0] = lead_marker[n] | (g & lead_payload[n]);

        return n;
}
371
2ac2ff3f 372char *utf16_to_utf8(const char16_t *s, size_t length /* bytes! */) {
2e3d0692 373 const uint8_t *f;
e7eebcfc 374 char *r, *t;
2e3d0692 375
ba091282
LP
376 if (length == 0)
377 return new0(char, 1);
378
2ac2ff3f
LP
379 assert(s);
380
ba091282
LP
381 if (length == SIZE_MAX) {
382 length = char16_strlen(s);
383
384 if (length > SIZE_MAX/2)
385 return NULL; /* overflow */
386
387 length *= 2;
388 }
389
2ac2ff3f
LP
390 /* Input length is in bytes, i.e. the shortest possible character takes 2 bytes. Each unicode character may
391 * take up to 4 bytes in UTF-8. Let's also account for a trailing NUL byte. */
ba091282 392 if (length > (SIZE_MAX - 1) / 2)
2ac2ff3f
LP
393 return NULL; /* overflow */
394
395 r = new(char, length * 2 + 1);
2e3d0692
LP
396 if (!r)
397 return NULL;
398
2ac2ff3f 399 f = (const uint8_t*) s;
04166cb7
TG
400 t = r;
401
2ac2ff3f 402 while (f + 1 < (const uint8_t*) s + length) {
c932fb71 403 char16_t w1, w2;
04166cb7
TG
404
405 /* see RFC 2781 section 2.2 */
406
407 w1 = f[1] << 8 | f[0];
408 f += 2;
409
410 if (!utf16_is_surrogate(w1)) {
dcd12626 411 t += utf8_encode_unichar(t, w1);
04166cb7
TG
412 continue;
413 }
414
415 if (utf16_is_trailing_surrogate(w1))
2ac2ff3f
LP
416 continue; /* spurious trailing surrogate, ignore */
417
418 if (f + 1 >= (const uint8_t*) s + length)
04166cb7
TG
419 break;
420
421 w2 = f[1] << 8 | f[0];
422 f += 2;
423
424 if (!utf16_is_trailing_surrogate(w2)) {
425 f -= 2;
2ac2ff3f 426 continue; /* surrogate missing its trailing surrogate, ignore */
04166cb7
TG
427 }
428
429 t += utf8_encode_unichar(t, utf16_surrogate_pair_to_unichar(w1, w2));
430 }
2e3d0692
LP
431
432 *t = 0;
7b4d7cc0 433 return r;
2e3d0692 434}
02a36bc9 435
80b0a597
LP
/* Encodes c as UTF-16 into out and returns the number of 16 bit units written: 1 for characters of the
 * BMP, 2 (a surrogate pair) for characters from 0x10000 to 0x10ffff. Returns 0 and writes nothing if c
 * is itself a surrogate value or lies beyond 0x10ffff.
 *
 * Note that this encodes as little-endian. */
size_t utf16_encode_unichar(char16_t *out, char32_t c) {
        char32_t offset;

        if (c >= 0xd800U && c <= 0xdfffU)
                return 0; /* A surrogate (invalid) */

        if (c > 0x10ffffU)
                return 0; /* Outside of what UTF-16 can express */

        if (c <= 0xffffU) {
                out[0] = htole16(c);
                return 1;
        }

        /* Split the 20 bit offset into the upper and lower 10 bits. */
        offset = c - 0x10000U;
        out[0] = htole16((offset >> 10) + 0xd800U);
        out[1] = htole16((offset & 0x3ffU) + 0xdc00U);
        return 2;
}
457
458char16_t *utf8_to_utf16(const char *s, size_t length) {
459 char16_t *n, *p;
80b0a597
LP
460 int r;
461
ba091282
LP
462 if (length == 0)
463 return new0(char16_t, 1);
464
80b0a597
LP
465 assert(s);
466
ba091282
LP
467 if (length == SIZE_MAX)
468 length = strlen(s);
469
470 if (length > SIZE_MAX - 1)
471 return NULL; /* overflow */
472
80b0a597
LP
473 n = new(char16_t, length + 1);
474 if (!n)
475 return NULL;
476
477 p = n;
478
fe96c0f8 479 for (size_t i = 0; i < length;) {
80b0a597
LP
480 char32_t unichar;
481 size_t e;
482
84319aa7 483 e = utf8_encoded_expected_len(s[i]);
80b0a597
LP
484 if (e <= 1) /* Invalid and single byte characters are copied as they are */
485 goto copy;
486
487 if (i + e > length) /* sequence longer than input buffer, then copy as-is */
488 goto copy;
489
490 r = utf8_encoded_to_unichar(s + i, &unichar);
491 if (r < 0) /* sequence invalid, then copy as-is */
492 goto copy;
493
494 p += utf16_encode_unichar(p, unichar);
495 i += e;
496 continue;
497
498 copy:
499 *(p++) = htole16(s[i++]);
500 }
501
502 *p = 0;
503 return n;
504}
505
/* Returns the number of char16_t units in s that precede the first unit with the value zero. */
size_t char16_strlen(const char16_t *s) {
        const char16_t *end;

        assert(s);

        for (end = s; *end != 0; end++)
                ;

        return (size_t) (end - s);
}
516
/* Number of bytes (1…6) the shortest encoding of unichar occupies in the (extended, up to 6 byte) UTF-8
 * scheme. */
static int utf8_unichar_to_encoded_len(char32_t unichar) {
        /* upper_bound[i] is the first value that no longer fits into i + 1 bytes. */
        static const char32_t upper_bound[] = { 0x80, 0x800, 0x10000, 0x200000, 0x4000000 };
        int n_bytes = 1;

        for (size_t i = 0; i < sizeof(upper_bound) / sizeof(upper_bound[0]); i++) {
                if (unichar < upper_bound[i])
                        break;
                n_bytes++;
        }

        return n_bytes;
}
533
/* Validates the single encoded character at the start of str and returns its length in bytes.
 *
 * We read until NUL, at most length bytes. SIZE_MAX may be used to disable the length check. Returns
 * -EINVAL if the sequence is truncated, malformed, encoded with more bytes than necessary, or decodes to
 * a value unichar_is_valid() rejects. */
int utf8_encoded_valid_unichar(const char *str, size_t length /* bytes */) {
        char32_t c;
        size_t n;
        int r;

        assert(str);
        assert(length > 0);

        n = utf8_encoded_expected_len(str[0]);

        /* Not a lead byte at all, or a multi-byte character that is cut off by the length limit? */
        if (n == 0 || n > length)
                return -EINVAL;

        /* ascii is valid */
        if (n == 1)
                return 1;

        /* All expected bytes need to have the high bit set. Since a NUL byte does not, this is also
         * what keeps us from reading beyond the end of the string. */
        for (size_t i = 0; i < n; i++)
                if (!(str[i] & 0x80))
                        return -EINVAL;

        r = utf8_encoded_to_unichar(str, &c);
        if (r < 0)
                return r;

        /* check if encoded length matches encoded value */
        if (utf8_unichar_to_encoded_len(c) != (int) n)
                return -EINVAL;

        /* check if value has valid range */
        if (!unichar_is_valid(c))
                return -EINVAL;

        return (int) n;
}
65ee8660
LP
576
/* Returns the number of UTF-8 codepoints in this string, or SIZE_MAX if the string is not valid UTF-8. */
size_t utf8_n_codepoints(const char *str) {
        size_t count = 0;

        for (const char *p = str; *p != 0; count++) {
                int k = utf8_encoded_valid_unichar(p, SIZE_MAX);

                if (k < 0)
                        return SIZE_MAX;

                p += k;
        }

        return count;
}
3f536d5b
LP
595
/* Returns the approximate width a string will take on screen when printed on a character cell
 * terminal/console, i.e. the sum of utf8_char_console_width() over its characters. Returns SIZE_MAX if a
 * character cannot be decoded. */
size_t utf8_console_width(const char *str) {
        size_t total = 0;

        for (const char *p = str; *p; p = utf8_next_char(p)) {
                int w = utf8_char_console_width(p);

                if (w < 0)
                        return SIZE_MAX;

                total += w;
        }

        return total;
}