]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/basic/utf8.c
Merge pull request #6986 from OpenDZ/tixxdz/seccomp-more-default-syscalls-v1
[thirdparty/systemd.git] / src / basic / utf8.c
CommitLineData
7f110ff9
LP
/***
  This file is part of systemd.

  Copyright 2008-2011 Kay Sievers
  Copyright 2012 Lennart Poettering

  systemd is free software; you can redistribute it and/or modify it
  under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.

  systemd is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/
20
/* Parts of this file are based on the GLIB utf8 validation functions. The
 * original license text follows. */

/* gutf8.c - Operations on UTF-8 strings.
 *
 * Copyright (C) 1999 Tom Tromey
 * Copyright (C) 2000 Red Hat, Inc.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Library General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Library General Public License for more details.
 *
 * You should have received a copy of the GNU Library General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
43
44#include <errno.h>
7f110ff9 45#include <stdbool.h>
cf0fbc49
TA
46#include <stdlib.h>
47#include <string.h>
7f110ff9 48
b5efdb8a 49#include "alloc-util.h"
e4e73a63 50#include "hexdecoct.h"
11c3a366 51#include "macro.h"
7f110ff9
LP
52#include "utf8.h"
53
/* Returns true if 'ch' is a code point this module accepts: inside the
 * Unicode code space and neither a UTF-16 surrogate nor one of the
 * noncharacter values rejected below. */
bool unichar_is_valid(char32_t ch) {

        if (ch >= 0x110000) /* End of unicode space */
                return false;
        if ((ch & 0xFFFFF800) == 0xD800) /* Reserved area for UTF-16 (surrogates 0xD800..0xDFFF) */
                return false;
        if ((ch >= 0xFDD0) && (ch <= 0xFDEF)) /* Reserved */
                return false;
        if ((ch & 0xFFFE) == 0xFFFE) /* 0x..FFFE and 0x..FFFF of every plane (byte-swapped BOM and its successor) */
                return false;

        return true;
}
67
/* Returns true if 'ch' is a control character that should not be treated as
 * printable. */
static bool unichar_is_control(char32_t ch) {

        /*
          0 to ' '-1 is the C0 range.
          DEL=0x7F, and DEL+1 to 0x9F is C1 range.
          '\t' and '\n' are in C0 range, but are not reported as control
          characters here: they are more or less harmless and commonly used.
         */

        return (ch < ' ' && ch != '\t' && ch != '\n') ||
                (0x7F <= ch && ch <= 0x9F);
}
79
7991ac34
DR
/* Count of bytes used to encode one unicode char, derived from the lead byte
 * only: 1 for ASCII, 2..6 for a multi-byte lead byte, and 0 if the byte cannot
 * start a sequence (a continuation byte, 0xFE or 0xFF). No byte other than
 * str[0] is read. */
static int utf8_encoded_expected_len(const char *str) {
        unsigned char c;

        assert(str);

        c = (unsigned char) str[0];
        if (c < 0x80)
                return 1;
        if ((c & 0xe0) == 0xc0)
                return 2;
        if ((c & 0xf0) == 0xe0)
                return 3;
        if ((c & 0xf8) == 0xf0)
                return 4;
        if ((c & 0xfc) == 0xf8)
                return 5;
        if ((c & 0xfe) == 0xfc)
                return 6;

        return 0;
}
ba961854 102
/* Decode one unicode char.
 *
 * Reads the UTF-8 sequence starting at 'str' and stores the decoded value in
 * *ret_unichar. Returns 0 on success, or -EINVAL if the lead byte cannot start
 * a sequence or one of the expected continuation bytes is not of the form
 * 10xxxxxx (*ret_unichar is left untouched in that case).
 *
 * Only the structure of the sequence is checked here; overlong encodings and
 * the validity of the resulting code point are not. */
int utf8_encoded_to_unichar(const char *str, char32_t *ret_unichar) {
        char32_t unichar;
        int len, i;

        assert(str);

        len = utf8_encoded_expected_len(str);

        /* Extract the payload bits of the lead byte */
        switch (len) {
        case 1:
                *ret_unichar = (char32_t)str[0];
                return 0;
        case 2:
                unichar = (char32_t)str[0] & 0x1f;
                break;
        case 3:
                unichar = (char32_t)str[0] & 0x0f;
                break;
        case 4:
                unichar = (char32_t)str[0] & 0x07;
                break;
        case 5:
                unichar = (char32_t)str[0] & 0x03;
                break;
        case 6:
                unichar = (char32_t)str[0] & 0x01;
                break;
        default:
                return -EINVAL;
        }

        /* Append 6 payload bits per continuation byte. A NUL byte fails the
         * 10xxxxxx test, so the loop stops at the end of a truncated string. */
        for (i = 1; i < len; i++) {
                if (((char32_t)str[i] & 0xc0) != 0x80)
                        return -EINVAL;
                unichar <<= 6;
                unichar |= (char32_t)str[i] & 0x3f;
        }

        *ret_unichar = unichar;

        return 0;
}
146
/* Checks whether the first 'length' bytes of 'str' are valid UTF-8 made up
 * only of non-control characters. '\t' is always accepted; '\n' is accepted
 * only if 'newline' is true. Returns false on the first invalid, truncated or
 * rejected character. */
bool utf8_is_printable_newline(const char* str, size_t length, bool newline) {
        const char *p;

        assert(str);

        for (p = str; length;) {
                int encoded_len, r;
                char32_t val;

                /* Look at the lead byte only and make sure the whole sequence
                 * fits into what is left of the buffer before any continuation
                 * byte is read, so that we never look beyond 'length' bytes. */
                encoded_len = utf8_encoded_expected_len(p);
                if (encoded_len <= 0 ||
                    (size_t) encoded_len > length)
                        return false;

                if (utf8_encoded_valid_unichar(p) < 0)
                        return false;

                r = utf8_encoded_to_unichar(p, &val);
                if (r < 0 ||
                    unichar_is_control(val) ||
                    (!newline && val == '\n'))
                        return false;

                length -= encoded_len;
                p += encoded_len;
        }

        return true;
}
173
7991ac34
DR
/* Validates the NUL-terminated string 'str' as UTF-8. Returns 'str' itself if
 * every character passes utf8_encoded_valid_unichar(), NULL otherwise. */
const char *utf8_is_valid(const char *str) {
        const uint8_t *p;

        assert(str);

        for (p = (const uint8_t*) str; *p; ) {
                int len;

                len = utf8_encoded_valid_unichar((const char *)p);
                if (len < 0)
                        return NULL;

                p += len;
        }

        return str;
}
191
550a40ec
ZJS
192char *utf8_escape_invalid(const char *str) {
193 char *p, *s;
194
195 assert(str);
196
197 p = s = malloc(strlen(str) * 4 + 1);
198 if (!p)
199 return NULL;
200
201 while (*str) {
202 int len;
203
204 len = utf8_encoded_valid_unichar(str);
205 if (len > 0) {
206 s = mempcpy(s, str, len);
207 str += len;
208 } else {
3c6d3052 209 s = stpcpy(s, UTF8_REPLACEMENT_CHARACTER);
550a40ec
ZJS
210 str += 1;
211 }
212 }
7e8185ef 213
550a40ec
ZJS
214 *s = '\0';
215
216 return p;
217}
218
fec84576
WC
219char *utf8_escape_non_printable(const char *str) {
220 char *p, *s;
221
222 assert(str);
223
224 p = s = malloc(strlen(str) * 4 + 1);
225 if (!p)
226 return NULL;
227
228 while (*str) {
229 int len;
230
231 len = utf8_encoded_valid_unichar(str);
232 if (len > 0) {
233 if (utf8_is_printable(str, len)) {
234 s = mempcpy(s, str, len);
235 str += len;
236 } else {
3c6d3052 237 while (len > 0) {
fec84576
WC
238 *(s++) = '\\';
239 *(s++) = 'x';
240 *(s++) = hexchar((int) *str >> 4);
241 *(s++) = hexchar((int) *str);
fec84576 242
3c6d3052 243 str += 1;
313cefa1 244 len--;
3c6d3052 245 }
fec84576
WC
246 }
247 } else {
3c6d3052 248 s = stpcpy(s, UTF8_REPLACEMENT_CHARACTER);
fec84576
WC
249 str += 1;
250 }
251 }
252
253 *s = '\0';
254
255 return p;
256}
257
7f110ff9
LP
/* Returns 'str' (with the const qualifier dropped) if the NUL-terminated
 * string consists only of 7-bit ASCII bytes, NULL if any byte is >= 128. */
char *ascii_is_valid(const char *str) {
        const char *p;

        assert(str);

        for (p = str; *p; p++)
                if ((unsigned char) *p >= 128)
                        return NULL;

        return (char*) str;
}
269
2bb4c7e3
TG
/**
 * utf8_encode_unichar() - Encode single UCS-4 character as UTF-8
 * @out_utf8: output buffer of at least 4 bytes or NULL
 * @g: UCS-4 character to encode
 *
 * This encodes a single UCS-4 character as UTF-8 and writes it into @out_utf8.
 * The length of the character is returned. It is not zero-terminated! If the
 * output buffer is NULL, only the length is returned.
 *
 * Values of 0x200000 and above do not fit into 4 bytes; for those nothing is
 * written and 0 is returned. No other validity check is done on @g, in
 * particular surrogates are encoded like any other value.
 *
 * Returns: The length in bytes that the UTF-8 representation does or would
 *          occupy, or 0 if @g cannot be encoded.
 */
size_t utf8_encode_unichar(char *out_utf8, char32_t g) {

        if (g < (1 << 7)) {
                if (out_utf8)
                        out_utf8[0] = g & 0x7f;
                return 1;
        } else if (g < (1 << 11)) {
                if (out_utf8) {
                        out_utf8[0] = 0xc0 | ((g >> 6) & 0x1f);
                        out_utf8[1] = 0x80 | (g & 0x3f);
                }
                return 2;
        } else if (g < (1 << 16)) {
                if (out_utf8) {
                        out_utf8[0] = 0xe0 | ((g >> 12) & 0x0f);
                        out_utf8[1] = 0x80 | ((g >> 6) & 0x3f);
                        out_utf8[2] = 0x80 | (g & 0x3f);
                }
                return 3;
        } else if (g < (1 << 21)) {
                if (out_utf8) {
                        out_utf8[0] = 0xf0 | ((g >> 18) & 0x07);
                        out_utf8[1] = 0x80 | ((g >> 12) & 0x3f);
                        out_utf8[2] = 0x80 | ((g >> 6) & 0x3f);
                        out_utf8[3] = 0x80 | (g & 0x3f);
                }
                return 4;
        }

        return 0;
}
313
2e3d0692 314char *utf16_to_utf8(const void *s, size_t length) {
2e3d0692 315 const uint8_t *f;
e7eebcfc 316 char *r, *t;
2e3d0692 317
04166cb7 318 r = new(char, (length * 4 + 1) / 2 + 1);
2e3d0692
LP
319 if (!r)
320 return NULL;
321
04166cb7
TG
322 f = s;
323 t = r;
324
325 while (f < (const uint8_t*) s + length) {
c932fb71 326 char16_t w1, w2;
04166cb7
TG
327
328 /* see RFC 2781 section 2.2 */
329
330 w1 = f[1] << 8 | f[0];
331 f += 2;
332
333 if (!utf16_is_surrogate(w1)) {
dcd12626 334 t += utf8_encode_unichar(t, w1);
04166cb7
TG
335
336 continue;
337 }
338
339 if (utf16_is_trailing_surrogate(w1))
340 continue;
341 else if (f >= (const uint8_t*) s + length)
342 break;
343
344 w2 = f[1] << 8 | f[0];
345 f += 2;
346
347 if (!utf16_is_trailing_surrogate(w2)) {
348 f -= 2;
349 continue;
350 }
351
352 t += utf8_encode_unichar(t, utf16_surrogate_pair_to_unichar(w1, w2));
353 }
2e3d0692
LP
354
355 *t = 0;
7b4d7cc0 356 return r;
2e3d0692 357}
02a36bc9 358
/* Expected size used to encode one unicode char: the number of bytes (1..6)
 * that the shortest encoding of 'unichar' occupies in the 1..6 byte scheme
 * handled by this file. */
static int utf8_unichar_to_encoded_len(char32_t unichar) {

        if (unichar < 0x80)
                return 1;
        if (unichar < 0x800)
                return 2;
        if (unichar < 0x10000)
                return 3;
        if (unichar < 0x200000)
                return 4;
        if (unichar < 0x4000000)
                return 5;

        return 6;
}
375
/* Validate one encoded unicode char and return its length.
 *
 * Returns the number of bytes of the character starting at 'str' if it is
 * well-formed, uses the shortest possible encoding and decodes to a value
 * accepted by unichar_is_valid(). Returns -EINVAL otherwise. 'str' must be
 * NUL-terminated: scanning stops at the first byte without the high bit. */
int utf8_encoded_valid_unichar(const char *str) {
        int len, i, r;
        char32_t unichar;

        assert(str);

        len = utf8_encoded_expected_len(str);
        if (len == 0)
                return -EINVAL;

        /* ascii is valid */
        if (len == 1)
                return 1;

        /* check if expected encoded chars are available; a NUL byte has no
         * high bit, so this also catches a string that ends too early */
        for (i = 0; i < len; i++)
                if ((str[i] & 0x80) != 0x80)
                        return -EINVAL;

        r = utf8_encoded_to_unichar(str, &unichar);
        if (r < 0)
                return r;

        /* check if encoded length matches encoded value, i.e. reject overlong
         * encodings */
        if (utf8_unichar_to_encoded_len(unichar) != len)
                return -EINVAL;

        /* check if value has valid range */
        if (!unichar_is_valid(unichar))
                return -EINVAL;

        return len;
}