]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/basic/utf8.c
basic: include only what we use
[thirdparty/systemd.git] / src / basic / utf8.c
CommitLineData
7f110ff9
LP
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
036ae95a 6 Copyright 2008-2011 Kay Sievers
7f110ff9
LP
7 Copyright 2012 Lennart Poettering
8
9 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
10 under the terms of the GNU Lesser General Public License as published by
11 the Free Software Foundation; either version 2.1 of the License, or
7f110ff9
LP
12 (at your option) any later version.
13
14 systemd is distributed in the hope that it will be useful, but
15 WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 17 Lesser General Public License for more details.
7f110ff9 18
5430f7f2 19 You should have received a copy of the GNU Lesser General Public License
7f110ff9
LP
20 along with systemd; If not, see <http://www.gnu.org/licenses/>.
21***/
22
036ae95a 23/* Parts of this file are based on the GLIB utf8 validation functions. The
7f110ff9
LP
24 * original license text follows. */
25
26/* gutf8.c - Operations on UTF-8 strings.
27 *
28 * Copyright (C) 1999 Tom Tromey
29 * Copyright (C) 2000 Red Hat, Inc.
30 *
31 * This library is free software; you can redistribute it and/or
23757887 32 * modify it under the terms of the GNU Library General Public
7f110ff9
LP
33 * License as published by the Free Software Foundation; either
34 * version 2 of the License, or (at your option) any later version.
35 *
36 * This library is distributed in the hope that it will be useful,
37 * but WITHOUT ANY WARRANTY; without even the implied warranty of
23757887
SK
38 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
39 * Library General Public License for more details.
7f110ff9 40 *
23757887
SK
41 * You should have received a copy of the GNU Library General Public
42 * License along with this library; if not, write to the Free Software
43 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
7f110ff9
LP
44 */
45
46#include <errno.h>
7f110ff9 47#include <stdbool.h>
cf0fbc49
TA
48#include <stdlib.h>
49#include <string.h>
7f110ff9 50
b5efdb8a 51#include "alloc-util.h"
e4e73a63 52#include "hexdecoct.h"
11c3a366 53#include "macro.h"
7f110ff9
LP
54#include "utf8.h"
55
/* Tells whether ch is a code point this module accepts: it must lie inside the
 * Unicode code space and must not be a UTF-16 surrogate, a value in the
 * reserved U+FDD0..U+FDEF block, or one of the two last values of a plane. */
bool unichar_is_valid(uint32_t ch) {
        const bool past_code_space = ch >= 0x110000;               /* End of unicode space */
        const bool utf16_surrogate = (ch & 0xFFFFF800) == 0xD800;  /* U+D800..U+DFFF, reserved for UTF-16 */
        const bool reserved_block = ch >= 0xFDD0 && ch <= 0xFDEF;  /* Reserved */
        const bool plane_tail = (ch & 0xFFFE) == 0xFFFE;           /* U+xxFFFE and U+xxFFFF */

        return !(past_code_space || utf16_surrogate || reserved_block || plane_tail);
}
69
/* Tells whether ch is a control character that should not be treated as
 * printable text. */
static bool unichar_is_control(uint32_t ch) {

        /* C0 range: everything below ' '. Tab and newline live there too, but
         * are more or less harmless and commonly used, so they are let through. */
        if (ch < ' ')
                return ch != '\t' && ch != '\n';

        /* DEL (0x7F) followed by the C1 range, which ends at 0x9F. */
        return ch >= 0x7F && ch <= 0x9F;
}
81
7991ac34
DR
/* Number of bytes the sequence starting at str claims to occupy, judged from
 * its first byte alone. Returns 1..6, or 0 if the first byte cannot start a
 * sequence (a continuation byte, 0xFE or 0xFF). */
static int utf8_encoded_expected_len(const char *str) {
        /* Entry k describes the lead byte of a (k + 1)-byte sequence. */
        static const struct {
                unsigned char mask;
                unsigned char pattern;
        } lead_bytes[] = {
                { 0x80, 0x00 },
                { 0xe0, 0xc0 },
                { 0xf0, 0xe0 },
                { 0xf8, 0xf0 },
                { 0xfc, 0xf8 },
                { 0xfe, 0xfc },
        };
        unsigned char first;
        int k;

        assert(str);

        first = (unsigned char) str[0];

        for (k = 0; k < 6; k++)
                if ((first & lead_bytes[k].mask) == lead_bytes[k].pattern)
                        return k + 1;

        return 0;
}
ba961854 104
/* Decode the single character encoded at the start of str.
 *
 * Returns the decoded value, or -EINVAL if the first byte cannot start a
 * sequence or one of the following bytes is not a continuation byte. Only the
 * bit layout is checked here, not the range of the resulting value. */
int utf8_encoded_to_unichar(const char *str) {
        /* Payload bits carried by the lead byte, indexed by sequence length. */
        static const int lead_payload[] = { 0, 0, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
        int n, value, k;

        assert(str);

        n = utf8_encoded_expected_len(str);

        if (n == 1)
                return (int) str[0];
        if (n < 2 || n > 6)
                return -EINVAL;

        value = (int) str[0] & lead_payload[n];

        for (k = 1; k < n; k++) {
                int byte = (int) str[k];

                /* every following byte must look like 10xxxxxx */
                if ((byte & 0xc0) != 0x80)
                        return -EINVAL;

                value = (value << 6) | (byte & 0x3f);
        }

        return value;
}
144
/* Checks whether the first length bytes of str form valid UTF-8 that contains
 * no control characters.
 *
 * str does not need to be NUL-terminated; at most length bytes are examined.
 * If newline is false, '\n' is rejected as well.
 *
 * Returns true if every character in the buffer is valid and printable. */
bool utf8_is_printable_newline(const char* str, size_t length, bool newline) {
        const char *p;

        assert(str);

        for (p = str; length > 0;) {
                int expected_len, encoded_len, val;

                /* utf8_encoded_valid_unichar() takes no length and inspects as
                 * many bytes as the lead byte announces. Make sure that many
                 * bytes are left in the buffer before calling it, so that a
                 * truncated sequence at the end is rejected instead of being
                 * read past the end of the buffer. Only p[0] is read here. */
                expected_len = utf8_encoded_expected_len(p);
                if (expected_len <= 0 ||
                    (size_t) expected_len > length)
                        return false;

                encoded_len = utf8_encoded_valid_unichar(p);
                if (encoded_len < 0 ||
                    (size_t) encoded_len > length)
                        return false;

                val = utf8_encoded_to_unichar(p);
                if (val < 0 ||
                    unichar_is_control(val) ||
                    (!newline && val == '\n'))
                        return false;

                length -= encoded_len;
                p += encoded_len;
        }

        return true;
}
170
7991ac34
DR
/* Checks that the NUL-terminated string str consists only of characters
 * accepted by utf8_encoded_valid_unichar().
 *
 * Returns str itself on success and NULL at the first invalid sequence. */
const char *utf8_is_valid(const char *str) {
        const char *cursor;

        assert(str);

        cursor = str;
        while (*cursor != '\0') {
                int step;

                step = utf8_encoded_valid_unichar(cursor);
                if (step < 0)
                        return NULL;

                cursor += step;
        }

        return str;
}
188
550a40ec
ZJS
189char *utf8_escape_invalid(const char *str) {
190 char *p, *s;
191
192 assert(str);
193
194 p = s = malloc(strlen(str) * 4 + 1);
195 if (!p)
196 return NULL;
197
198 while (*str) {
199 int len;
200
201 len = utf8_encoded_valid_unichar(str);
202 if (len > 0) {
203 s = mempcpy(s, str, len);
204 str += len;
205 } else {
3c6d3052 206 s = stpcpy(s, UTF8_REPLACEMENT_CHARACTER);
550a40ec
ZJS
207 str += 1;
208 }
209 }
7e8185ef 210
550a40ec
ZJS
211 *s = '\0';
212
213 return p;
214}
215
fec84576
WC
216char *utf8_escape_non_printable(const char *str) {
217 char *p, *s;
218
219 assert(str);
220
221 p = s = malloc(strlen(str) * 4 + 1);
222 if (!p)
223 return NULL;
224
225 while (*str) {
226 int len;
227
228 len = utf8_encoded_valid_unichar(str);
229 if (len > 0) {
230 if (utf8_is_printable(str, len)) {
231 s = mempcpy(s, str, len);
232 str += len;
233 } else {
3c6d3052 234 while (len > 0) {
fec84576
WC
235 *(s++) = '\\';
236 *(s++) = 'x';
237 *(s++) = hexchar((int) *str >> 4);
238 *(s++) = hexchar((int) *str);
fec84576 239
3c6d3052
LP
240 str += 1;
241 len --;
242 }
fec84576
WC
243 }
244 } else {
3c6d3052 245 s = stpcpy(s, UTF8_REPLACEMENT_CHARACTER);
fec84576
WC
246 str += 1;
247 }
248 }
249
250 *s = '\0';
251
252 return p;
253}
254
7f110ff9
LP
/* Checks that the NUL-terminated string str contains only 7-bit ASCII bytes.
 *
 * Returns str (with const cast away) if so, NULL at the first byte >= 128. */
char *ascii_is_valid(const char *str) {
        size_t i;

        assert(str);

        for (i = 0; str[i] != '\0'; i++) {
                /* a set top bit means the byte is outside of ASCII */
                if (((unsigned char) str[i] & 0x80) != 0)
                        return NULL;
        }

        return (char*) str;
}
266
2bb4c7e3
TG
/**
 * utf8_encode_unichar() - Encode single UCS-4 character as UTF-8
 * @out_utf8: output buffer of at least 4 bytes or NULL
 * @g: UCS-4 character to encode
 *
 * Writes the UTF-8 representation of @g into @out_utf8 without appending a
 * terminating zero byte. If @out_utf8 is NULL nothing is written and only the
 * length is computed. No check is made that @g is a valid code point.
 *
 * Returns: The number of bytes the UTF-8 representation does or would occupy
 * (1 to 4), or 0 if @g needs more than 21 bits and cannot be encoded here.
 */
size_t utf8_encode_unichar(char *out_utf8, uint32_t g) {
        /* Marker bits of the first byte, indexed by sequence length. */
        static const unsigned char lead_marker[] = { 0x00, 0x00, 0xc0, 0xe0, 0xf0 };
        size_t n, i;

        if (g < (1 << 7))
                n = 1;
        else if (g < (1 << 11))
                n = 2;
        else if (g < (1 << 16))
                n = 3;
        else if (g < (1 << 21))
                n = 4;
        else
                return 0;

        if (!out_utf8)
                return n;

        if (n == 1) {
                out_utf8[0] = g & 0x7f;
                return n;
        }

        /* Fill the continuation bytes back to front, six bits at a time. */
        for (i = n - 1; i > 0; i--) {
                out_utf8[i] = 0x80 | (g & 0x3f);
                g >>= 6;
        }

        /* What is left of g now fits into the payload bits of the lead byte. */
        out_utf8[0] = lead_marker[n] | g;

        return n;
}
310
2e3d0692 311char *utf16_to_utf8(const void *s, size_t length) {
2e3d0692 312 const uint8_t *f;
e7eebcfc 313 char *r, *t;
2e3d0692 314
04166cb7 315 r = new(char, (length * 4 + 1) / 2 + 1);
2e3d0692
LP
316 if (!r)
317 return NULL;
318
04166cb7
TG
319 f = s;
320 t = r;
321
322 while (f < (const uint8_t*) s + length) {
dcd12626 323 uint16_t w1, w2;
04166cb7
TG
324
325 /* see RFC 2781 section 2.2 */
326
327 w1 = f[1] << 8 | f[0];
328 f += 2;
329
330 if (!utf16_is_surrogate(w1)) {
dcd12626 331 t += utf8_encode_unichar(t, w1);
04166cb7
TG
332
333 continue;
334 }
335
336 if (utf16_is_trailing_surrogate(w1))
337 continue;
338 else if (f >= (const uint8_t*) s + length)
339 break;
340
341 w2 = f[1] << 8 | f[0];
342 f += 2;
343
344 if (!utf16_is_trailing_surrogate(w2)) {
345 f -= 2;
346 continue;
347 }
348
349 t += utf8_encode_unichar(t, utf16_surrogate_pair_to_unichar(w1, w2));
350 }
2e3d0692
LP
351
352 *t = 0;
7b4d7cc0 353 return r;
2e3d0692 354}
02a36bc9 355
02a36bc9
DR
/* Length in bytes of the shortest encoding (in the original, up to six byte
 * long, UTF-8 scheme) of unichar. Negative values yield 1, since they compare
 * below the first threshold. */
static int utf8_unichar_to_encoded_len(int unichar) {
        /* upper_bound[k] is the first value that no longer fits in k + 1 bytes */
        static const int upper_bound[] = { 0x80, 0x800, 0x10000, 0x200000, 0x4000000 };
        int k;

        for (k = 0; k < 5; k++)
                if (unichar < upper_bound[k])
                        return k + 1;

        return 6;
}
372
/* Validate the single character encoded at the start of str.
 *
 * Returns the number of bytes the character occupies, or -EINVAL if the
 * sequence is malformed, truncated, longer than necessary for its value, or
 * decodes to a value rejected by unichar_is_valid(). */
int utf8_encoded_valid_unichar(const char *str) {
        int expected, codepoint, k;

        assert(str);

        expected = utf8_encoded_expected_len(str);
        switch (expected) {
        case 0:
                /* first byte cannot start a sequence */
                return -EINVAL;
        case 1:
                /* ascii is valid */
                return 1;
        default:
                break;
        }

        /* All bytes of a multi-byte sequence have the top bit set. This also
         * stops at a NUL byte, so a sequence cut short by the end of the
         * string is rejected before anything behind the terminator is read. */
        for (k = 0; k < expected; k++)
                if (!(str[k] & 0x80))
                        return -EINVAL;

        codepoint = utf8_encoded_to_unichar(str);

        /* The value must need exactly as many bytes as were used (this rejects
         * overlong forms, and also a negative error result from the decoder,
         * which maps to length 1) and must lie in the accepted range. */
        if (utf8_unichar_to_encoded_len(codepoint) != expected ||
            !unichar_is_valid(codepoint))
                return -EINVAL;

        return expected;
}