]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/basic/utf8.c
Merge pull request #1923 from zonque/siphash
[thirdparty/systemd.git] / src / basic / utf8.c
CommitLineData
7f110ff9
LP
/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/

/***
  This file is part of systemd.

  Copyright 2008-2011 Kay Sievers
  Copyright 2012 Lennart Poettering

  systemd is free software; you can redistribute it and/or modify it
  under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.

  systemd is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/
22
/* Parts of this file are based on the GLIB utf8 validation functions. The
 * original license text follows. */

/* gutf8.c - Operations on UTF-8 strings.
 *
 * Copyright (C) 1999 Tom Tromey
 * Copyright (C) 2000 Red Hat, Inc.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Library General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Library General Public License for more details.
 *
 * You should have received a copy of the GNU Library General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
45
#include <errno.h>
#include <inttypes.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#include "alloc-util.h"
#include "hexdecoct.h"
#include "utf8.h"
#include "util.h"
7f110ff9 56
/* Returns true if ch is a code point this file accepts as a character:
 * below 0x110000, not a UTF-16 surrogate, not in 0xFDD0..0xFDEF, and not
 * ending in 0xFFFE or 0xFFFF. */
bool unichar_is_valid(uint32_t ch) {

        if (ch >= 0x110000) /* End of unicode space */
                return false;
        if ((ch & 0xFFFFF800) == 0xD800) /* Reserved area for UTF-16 (0xD800..0xDFFF) */
                return false;
        if ((ch >= 0xFDD0) && (ch <= 0xFDEF)) /* Reserved */
                return false;
        /* The mask ignores the plane bits, so this rejects the last two code
         * points (...FFFE and ...FFFF) of every plane, not just 0xFFFE. */
        if ((ch & 0xFFFE) == 0xFFFE)
                return false;

        return true;
}
70
/* Returns true if ch is a control character that should not be treated as
 * printable: anything below ' ' except '\t' and '\n', plus 0x7F..0x9F. */
static bool unichar_is_control(uint32_t ch) {

        /*
          0 to ' '-1 is the C0 range.
          DEL=0x7F, and DEL+1 to 0x9F is C1 range.
          '\t' and '\n' are in C0 range, but more or less harmless and
          commonly used, so they are not reported as control characters.
         */

        return (ch < ' ' && ch != '\t' && ch != '\n') ||
                (0x7F <= ch && ch <= 0x9F);
}
82
/* count of characters used to encode one unicode char
 *
 * Looks only at the lead byte str[0] and returns the sequence length (1..6)
 * that byte announces, or 0 if it cannot start a sequence (a continuation
 * byte 10xxxxxx, or 0xFE/0xFF). */
static int utf8_encoded_expected_len(const char *str) {
        unsigned char c;

        assert(str);

        c = (unsigned char) str[0];
        if (c < 0x80)
                return 1;
        if ((c & 0xe0) == 0xc0)
                return 2;
        if ((c & 0xf0) == 0xe0)
                return 3;
        if ((c & 0xf8) == 0xf0)
                return 4;
        if ((c & 0xfc) == 0xf8)
                return 5;
        if ((c & 0xfe) == 0xfc)
                return 6;

        return 0;
}
ba961854 105
/* decode one unicode char
 *
 * Decodes the sequence that starts at str[0]; its length is taken from
 * utf8_encoded_expected_len(). Returns the decoded value, or -EINVAL if the
 * lead byte cannot start a sequence or a following byte is not of the form
 * 10xxxxxx. The decoded value is not range-checked and overlong encodings
 * are not detected here. */
int utf8_encoded_to_unichar(const char *str) {
        int unichar, len, i;

        assert(str);

        len = utf8_encoded_expected_len(str);

        /* Extract the payload bits of the lead byte. */
        switch (len) {
        case 1:
                return (int)str[0];
        case 2:
                unichar = str[0] & 0x1f;
                break;
        case 3:
                unichar = (int)str[0] & 0x0f;
                break;
        case 4:
                unichar = (int)str[0] & 0x07;
                break;
        case 5:
                unichar = (int)str[0] & 0x03;
                break;
        case 6:
                unichar = (int)str[0] & 0x01;
                break;
        default:
                return -EINVAL;
        }

        for (i = 1; i < len; i++) {
                /* A NUL byte fails this test as well, so in a NUL-terminated
                 * string a truncated sequence is rejected at the terminator
                 * and nothing behind it is read. */
                if (((int)str[i] & 0xc0) != 0x80)
                        return -EINVAL;
                unichar <<= 6;
                unichar |= (int)str[i] & 0x3f;
        }

        return unichar;
}
145
/* Checks whether the first length bytes of str are valid UTF-8 without
 * control characters (as defined by unichar_is_control()).
 *
 * str does not need to be NUL-terminated. If newline is false, '\n' is
 * rejected as well. Returns true if every character passes, false
 * otherwise; an empty range (length == 0) is considered printable. */
bool utf8_is_printable_newline(const char* str, size_t length, bool newline) {
        const char *p;

        assert(str);

        for (p = str; length;) {
                int expected_len, encoded_len, val;

                /* The validator below inspects every byte the lead byte
                 * announces. Make sure all of them lie inside the remaining
                 * range first, so that a truncated sequence at the end of a
                 * buffer without NUL terminator is not read past the end. */
                expected_len = utf8_encoded_expected_len(p);
                if (expected_len <= 0 ||
                    (size_t) expected_len > length)
                        return false;

                encoded_len = utf8_encoded_valid_unichar(p);
                if (encoded_len < 0 ||
                    (size_t) encoded_len > length)
                        return false;

                val = utf8_encoded_to_unichar(p);
                if (val < 0 ||
                    unichar_is_control(val) ||
                    (!newline && val == '\n'))
                        return false;

                length -= encoded_len;
                p += encoded_len;
        }

        return true;
}
171
/* Checks a NUL-terminated string for UTF-8 validity.
 *
 * Returns str itself if every sequence up to the terminating NUL is accepted
 * by utf8_encoded_valid_unichar(), and NULL at the first one that is not. */
const char *utf8_is_valid(const char *str) {
        const uint8_t *p;

        assert(str);

        for (p = (const uint8_t*) str; *p; ) {
                int len;

                len = utf8_encoded_valid_unichar((const char *)p);
                if (len < 0)
                        return NULL;

                p += len;
        }

        return str;
}
189
550a40ec
ZJS
190char *utf8_escape_invalid(const char *str) {
191 char *p, *s;
192
193 assert(str);
194
195 p = s = malloc(strlen(str) * 4 + 1);
196 if (!p)
197 return NULL;
198
199 while (*str) {
200 int len;
201
202 len = utf8_encoded_valid_unichar(str);
203 if (len > 0) {
204 s = mempcpy(s, str, len);
205 str += len;
206 } else {
3c6d3052 207 s = stpcpy(s, UTF8_REPLACEMENT_CHARACTER);
550a40ec
ZJS
208 str += 1;
209 }
210 }
7e8185ef 211
550a40ec
ZJS
212 *s = '\0';
213
214 return p;
215}
216
fec84576
WC
217char *utf8_escape_non_printable(const char *str) {
218 char *p, *s;
219
220 assert(str);
221
222 p = s = malloc(strlen(str) * 4 + 1);
223 if (!p)
224 return NULL;
225
226 while (*str) {
227 int len;
228
229 len = utf8_encoded_valid_unichar(str);
230 if (len > 0) {
231 if (utf8_is_printable(str, len)) {
232 s = mempcpy(s, str, len);
233 str += len;
234 } else {
3c6d3052 235 while (len > 0) {
fec84576
WC
236 *(s++) = '\\';
237 *(s++) = 'x';
238 *(s++) = hexchar((int) *str >> 4);
239 *(s++) = hexchar((int) *str);
fec84576 240
3c6d3052
LP
241 str += 1;
242 len --;
243 }
fec84576
WC
244 }
245 } else {
3c6d3052 246 s = stpcpy(s, UTF8_REPLACEMENT_CHARACTER);
fec84576
WC
247 str += 1;
248 }
249 }
250
251 *s = '\0';
252
253 return p;
254}
255
/* Checks that the NUL-terminated string str contains only bytes below 128.
 *
 * Returns str (with the const qualifier cast away) if so, and NULL at the
 * first byte that is 128 or larger. */
char *ascii_is_valid(const char *str) {
        const char *p;

        assert(str);

        for (p = str; *p; p++)
                if ((unsigned char) *p >= 128)
                        return NULL;

        return (char*) str;
}
267
/**
 * utf8_encode_unichar() - Encode single UCS-4 character as UTF-8
 * @out_utf8: output buffer of at least 4 bytes or NULL
 * @g: UCS-4 character to encode
 *
 * This encodes a single UCS-4 character as UTF-8 and writes it into @out_utf8.
 * The length of the character is returned. It is not zero-terminated! If the
 * output buffer is NULL, only the length is returned.
 *
 * Values of 1 << 21 and above cannot be represented in four bytes; for those
 * nothing is written and 0 is returned. No other validity check is applied
 * to @g.
 *
 * Returns: The length in bytes that the UTF-8 representation does or would
 *          occupy, or 0 if @g is too large to be encoded.
 */
size_t utf8_encode_unichar(char *out_utf8, uint32_t g) {

        if (g < (1 << 7)) {
                if (out_utf8)
                        out_utf8[0] = g & 0x7f;
                return 1;
        } else if (g < (1 << 11)) {
                if (out_utf8) {
                        out_utf8[0] = 0xc0 | ((g >> 6) & 0x1f);
                        out_utf8[1] = 0x80 | (g & 0x3f);
                }
                return 2;
        } else if (g < (1 << 16)) {
                if (out_utf8) {
                        out_utf8[0] = 0xe0 | ((g >> 12) & 0x0f);
                        out_utf8[1] = 0x80 | ((g >> 6) & 0x3f);
                        out_utf8[2] = 0x80 | (g & 0x3f);
                }
                return 3;
        } else if (g < (1 << 21)) {
                if (out_utf8) {
                        out_utf8[0] = 0xf0 | ((g >> 18) & 0x07);
                        out_utf8[1] = 0x80 | ((g >> 12) & 0x3f);
                        out_utf8[2] = 0x80 | ((g >> 6) & 0x3f);
                        out_utf8[3] = 0x80 | (g & 0x3f);
                }
                return 4;
        }

        return 0;
}
311
2e3d0692 312char *utf16_to_utf8(const void *s, size_t length) {
2e3d0692 313 const uint8_t *f;
e7eebcfc 314 char *r, *t;
2e3d0692 315
04166cb7 316 r = new(char, (length * 4 + 1) / 2 + 1);
2e3d0692
LP
317 if (!r)
318 return NULL;
319
04166cb7
TG
320 f = s;
321 t = r;
322
323 while (f < (const uint8_t*) s + length) {
dcd12626 324 uint16_t w1, w2;
04166cb7
TG
325
326 /* see RFC 2781 section 2.2 */
327
328 w1 = f[1] << 8 | f[0];
329 f += 2;
330
331 if (!utf16_is_surrogate(w1)) {
dcd12626 332 t += utf8_encode_unichar(t, w1);
04166cb7
TG
333
334 continue;
335 }
336
337 if (utf16_is_trailing_surrogate(w1))
338 continue;
339 else if (f >= (const uint8_t*) s + length)
340 break;
341
342 w2 = f[1] << 8 | f[0];
343 f += 2;
344
345 if (!utf16_is_trailing_surrogate(w2)) {
346 f -= 2;
347 continue;
348 }
349
350 t += utf8_encode_unichar(t, utf16_surrogate_pair_to_unichar(w1, w2));
351 }
2e3d0692
LP
352
353 *t = 0;
7b4d7cc0 354 return r;
2e3d0692 355}
02a36bc9 356
/* expected size used to encode one unicode char
 *
 * Returns the number of bytes (1..6) of the shortest encoding for unichar.
 * Negative values fall into the first range and yield 1. */
static int utf8_unichar_to_encoded_len(int unichar) {

        if (unichar < 0x80)
                return 1;
        if (unichar < 0x800)
                return 2;
        if (unichar < 0x10000)
                return 3;
        if (unichar < 0x200000)
                return 4;
        if (unichar < 0x4000000)
                return 5;

        return 6;
}
373
/* validate one encoded unicode char and return its length
 *
 * Returns the number of bytes of the sequence starting at str[0], or -EINVAL
 * if the lead byte cannot start a sequence, one of the announced bytes is
 * missing or malformed, the value is encoded with more bytes than necessary,
 * or the decoded value is rejected by unichar_is_valid(). */
int utf8_encoded_valid_unichar(const char *str) {
        int len, unichar, i;

        assert(str);

        len = utf8_encoded_expected_len(str);
        if (len == 0)
                return -EINVAL;

        /* ascii is valid */
        if (len == 1)
                return 1;

        /* check if expected encoded chars are available; this stops at a
         * terminating NUL, since its high bit is clear */
        for (i = 0; i < len; i++)
                if ((str[i] & 0x80) != 0x80)
                        return -EINVAL;

        unichar = utf8_encoded_to_unichar(str);

        /* the decoder rejects bytes that have the high bit set but are not
         * of the form 10xxxxxx */
        if (unichar < 0)
                return -EINVAL;

        /* check if encoded length matches encoded value */
        if (utf8_unichar_to_encoded_len(unichar) != len)
                return -EINVAL;

        /* check if value has valid range */
        if (!unichar_is_valid(unichar))
                return -EINVAL;

        return len;
}