]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/shared/utf8.c
1a68394a53cee3d58d06a8dc08f114f9dcb1a1ff
[thirdparty/systemd.git] / src / shared / utf8.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2012 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 /* This file is based on the GLIB utf8 validation functions. The
23 * original license text follows. */
24
25 /* gutf8.c - Operations on UTF-8 strings.
26 *
27 * Copyright (C) 1999 Tom Tromey
28 * Copyright (C) 2000 Red Hat, Inc.
29 *
30 * This library is free software; you can redistribute it and/or
31 * modify it under the terms of the GNU Library General Public
32 * License as published by the Free Software Foundation; either
33 * version 2 of the License, or (at your option) any later version.
34 *
35 * This library is distributed in the hope that it will be useful,
36 * but WITHOUT ANY WARRANTY; without even the implied warranty of
37 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
38 * Library General Public License for more details.
39 *
40 * You should have received a copy of the GNU Library General Public
41 * License along with this library; if not, write to the Free Software
42 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
43 */
44
45 #include <errno.h>
46 #include <stdlib.h>
47 #include <inttypes.h>
48 #include <string.h>
49 #include <stdbool.h>
50
51 #include "utf8.h"
52 #include "util.h"
53
/* Byte written to the output by utf8_validate() in place of each input byte
 * that is not part of an acceptable UTF-8 sequence. */
#define FILTER_CHAR '_'
55
/* Tells whether ch is a code point this file accepts.
 *
 * Rejected are: values at or above 0x110000 (outside the Unicode code space),
 * the 0xD800..0xDFFF block reserved for UTF-16 surrogates, the reserved range
 * 0xFDD0..0xFDEF, and every value whose low 16 bits are 0xFFFE or 0xFFFF. */
static inline bool is_unicode_valid(uint32_t ch) {
        const bool beyond_unicode = ch >= 0x110000;
        const bool utf16_surrogate = (ch & 0xFFFFF800) == 0xD800;
        const bool reserved_range = ch >= 0xFDD0 && ch <= 0xFDEF;
        const bool ends_in_fffe_or_ffff = (ch & 0xFFFE) == 0xFFFE;

        return !(beyond_unicode || utf16_surrogate || reserved_range || ends_in_fffe_or_ffff);
}
69
/* True when ch has the bit pattern 10xxxxxx, i.e. is a UTF-8 continuation byte. */
static inline bool is_continuation_char(uint8_t ch) {
        return (ch & 0xc0) == 0x80;
}
75
/* Appends the six payload bits of continuation byte ch to the code point
 * being accumulated in *u_ch (existing bits are shifted up to make room). */
static inline void merge_continuation_char(uint32_t *u_ch, uint8_t ch) {
        *u_ch = (*u_ch << 6) | (uint32_t) (ch & 0x3f);
}
80
/* Tells whether ch is a control character that utf8_is_printable() refuses.
 *
 * Everything below ' ' is the C0 range; DEL (0x7F) through 0x9F covers DEL
 * and the C1 range. '\t' and '\n' sit in C0 but are exempted here. */
static bool is_unicode_control(uint32_t ch) {
        if (ch == '\t' || ch == '\n')
                return false;

        if (ch < ' ')
                return true;

        return ch >= 0x7F && ch <= 0x9F;
}
92
/* Checks that the first `length` bytes of str are well-formed UTF-8 (one- to
 * four-byte sequences, no overlong encodings) and contain no control
 * characters as defined by is_unicode_control().
 *
 * str need not be NUL-terminated; exactly `length` bytes are examined and no
 * byte beyond them is read. A NUL byte inside the range counts as a control
 * character. A multi-byte sequence that is cut off by the end of the range
 * makes the function return false.
 *
 * Returns true if every character passes, false otherwise. */
bool utf8_is_printable(const char* str, size_t length) {
        const uint8_t *p;

        assert(str);

        p = (const uint8_t*) str;

        while (length > 0) {
                uint32_t val, min;
                size_t trailing, i;

                if (*p < 128) {
                        val = *p;
                        min = 0;
                        trailing = 0;
                } else if ((*p & 0xe0) == 0xc0) {        /* 110xxxxx two-char seq. */
                        /* All five payload bits belong to the code point. */
                        val = (uint32_t) (*p & 0x1f);
                        min = 128;
                        trailing = 1;
                } else if ((*p & 0xf0) == 0xe0) {        /* 1110xxxx three-char seq. */
                        val = (uint32_t) (*p & 0x0f);
                        min = (1 << 11);
                        trailing = 2;
                } else if ((*p & 0xf8) == 0xf0) {        /* 11110xxx four-char seq. */
                        val = (uint32_t) (*p & 0x07);
                        min = (1 << 16);
                        trailing = 3;
                } else
                        return false;

                /* The lead byte plus all continuation bytes must fit into what
                 * is left, otherwise we would read past the caller's buffer. */
                if (trailing >= length)
                        return false;

                for (i = 1; i <= trailing; i++) {
                        if (!is_continuation_char(p[i]))
                                return false;
                        merge_continuation_char(&val, p[i]);
                }

                /* Reject overlong encodings. */
                if (val < min)
                        return false;

                if (is_unicode_control(val))
                        return false;

                p += trailing + 1;
                length -= trailing + 1;
        }

        return true;
}
148
149 static char* utf8_validate(const char *str, char *output) {
150 uint32_t val = 0;
151 uint32_t min = 0;
152 const uint8_t *p, *last;
153 int size;
154 uint8_t *o;
155
156 assert(str);
157
158 o = (uint8_t*) output;
159 for (p = (const uint8_t*) str; *p; p++) {
160 if (*p < 128) {
161 if (o)
162 *o = *p;
163 } else {
164 last = p;
165
166 if ((*p & 0xe0) == 0xc0) { /* 110xxxxx two-char seq. */
167 size = 2;
168 min = 128;
169 val = (uint32_t) (*p & 0x1e);
170 goto ONE_REMAINING;
171 } else if ((*p & 0xf0) == 0xe0) { /* 1110xxxx three-char seq.*/
172 size = 3;
173 min = (1 << 11);
174 val = (uint32_t) (*p & 0x0f);
175 goto TWO_REMAINING;
176 } else if ((*p & 0xf8) == 0xf0) { /* 11110xxx four-char seq */
177 size = 4;
178 min = (1 << 16);
179 val = (uint32_t) (*p & 0x07);
180 } else
181 goto error;
182
183 p++;
184 if (!is_continuation_char(*p))
185 goto error;
186 merge_continuation_char(&val, *p);
187
188 TWO_REMAINING:
189 p++;
190 if (!is_continuation_char(*p))
191 goto error;
192 merge_continuation_char(&val, *p);
193
194 ONE_REMAINING:
195 p++;
196 if (!is_continuation_char(*p))
197 goto error;
198 merge_continuation_char(&val, *p);
199
200 if (val < min)
201 goto error;
202
203 if (!is_unicode_valid(val))
204 goto error;
205
206 if (o) {
207 memcpy(o, last, (size_t) size);
208 o += size;
209 }
210
211 continue;
212
213 error:
214 if (o) {
215 *o = FILTER_CHAR;
216 p = last; /* We retry at the next character */
217 } else
218 goto failure;
219 }
220
221 if (o)
222 o++;
223 }
224
225 if (o) {
226 *o = '\0';
227 return output;
228 }
229
230 return (char*) str;
231
232 failure:
233 return NULL;
234 }
235
/* Validation-only front end of utf8_validate(): no output buffer is passed,
 * so the result is str itself when the string is accepted and NULL when it
 * is not. */
char* utf8_is_valid (const char *str) {
        char *checked;

        checked = utf8_validate(str, NULL);
        return checked;
}
239
/* Produces a sanitized copy of str by running utf8_validate() in filtering
 * mode on a freshly malloc()ed buffer of strlen(str) + 1 bytes.
 *
 * Returns the new buffer, or NULL if the allocation failed. */
char* utf8_filter (const char *str) {
        size_t capacity;
        char *filtered;

        assert(str);

        capacity = strlen(str) + 1;

        filtered = malloc(capacity);
        if (filtered == NULL)
                return NULL;

        return utf8_validate(str, filtered);
}
251
/* Returns str (cast to non-const) if every byte before the terminating NUL
 * is below 128, and NULL as soon as a byte with the high bit set is found. */
char *ascii_is_valid(const char *str) {
        size_t i;

        assert(str);

        for (i = 0; str[i] != '\0'; i++) {
                if (((unsigned char) str[i] & 0x80) != 0)
                        return NULL;
        }

        return (char*) str;
}
263
/* Returns a malloc()ed, NUL-terminated copy of str from which every byte
 * >= 128 has been removed, or NULL if the allocation failed. The buffer is
 * sized for the unfiltered string, so the result always fits. */
char *ascii_filter(const char *str) {
        char *result;
        size_t in, out = 0;

        assert(str);

        result = malloc(strlen(str) + 1);
        if (!result)
                return NULL;

        for (in = 0; str[in]; in++) {
                if ((unsigned char) str[in] >= 128)
                        continue;

                result[out++] = str[in];
        }

        result[out] = 0;

        return result;
}
284
/* Converts little-endian UTF-16 data to a newly allocated, NUL-terminated
 * UTF-8 string.
 *
 * s:      input bytes; each 16-bit unit is read as low byte, then high byte.
 * length: size of the input in BYTES. A trailing odd byte is ignored.
 *
 * Conversion stops at the first 0x0000 unit or at the end of the input.
 * A lead surrogate followed by a trail surrogate is combined into a single
 * code point and emitted as a four-byte sequence; surrogates without a
 * matching partner cannot be represented in UTF-8 and are dropped.
 *
 * Returns a malloc()ed string, or NULL if the allocation failed or the
 * required size is not representable. */
char *utf16_to_utf8(const void *s, size_t length) {
        const uint8_t *f = s;
        uint8_t *t;
        char *r;
        size_t i;

        /* A 16-bit unit expands to at most three bytes (a surrogate pair, two
         * units, becomes four), plus one byte for the trailing NUL. */
        if (length / 2 > (SIZE_MAX - 1) / 3)
                return NULL;

        r = malloc((length / 2) * 3 + 1);
        if (!r)
                return NULL;

        t = (uint8_t*) r;

        /* "i + 1 < length" guarantees that both bytes of the unit are inside the input. */
        for (i = 0; i + 1 < length; i += 2) {
                uint32_t c;

                c = ((uint32_t) f[i + 1] << 8) | f[i];

                if (c == 0)
                        break;

                if (c >= 0xd800 && c <= 0xdbff) {
                        uint32_t low;

                        /* Lead surrogate: it needs a complete following unit. */
                        if (i + 3 >= length)
                                continue;

                        low = ((uint32_t) f[i + 3] << 8) | f[i + 2];
                        if (low < 0xdc00 || low > 0xdfff)
                                continue; /* unpaired; the next unit is examined on its own */

                        c = 0x10000 + ((c - 0xd800) << 10) + (low - 0xdc00);
                        i += 2; /* the trail surrogate has been consumed as well */
                } else if (c >= 0xdc00 && c <= 0xdfff)
                        continue; /* trail surrogate without a lead */

                if (c < 0x80) {
                        *(t++) = (uint8_t) c;
                } else if (c < 0x800) {
                        *(t++) = (uint8_t) (0xc0 | (c >> 6));
                        *(t++) = (uint8_t) (0x80 | (c & 0x3f));
                } else if (c < 0x10000) {
                        *(t++) = (uint8_t) (0xe0 | (c >> 12));
                        *(t++) = (uint8_t) (0x80 | ((c >> 6) & 0x3f));
                        *(t++) = (uint8_t) (0x80 | (c & 0x3f));
                } else {
                        *(t++) = (uint8_t) (0xf0 | (c >> 18));
                        *(t++) = (uint8_t) (0x80 | ((c >> 12) & 0x3f));
                        *(t++) = (uint8_t) (0x80 | ((c >> 6) & 0x3f));
                        *(t++) = (uint8_t) (0x80 | (c & 0x3f));
                }
        }

        *t = 0;

        return r;
}
320
/* Number of bytes the sequence starting at str claims to occupy, judged from
 * its first byte alone: 1 for ASCII, 2..6 for the lead-byte patterns
 * 110xxxxx .. 1111110x, and 0 for a byte that cannot start a sequence. */
static int utf8_encoded_expected_len(const char *str) {
        static const unsigned char masks[] = { 0x80, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe };
        static const unsigned char leads[] = { 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc };
        const unsigned char first = (unsigned char) str[0];
        int n;

        for (n = 0; n < 6; n++)
                if ((first & masks[n]) == leads[n])
                        return n + 1;

        return 0;
}
339
/* Decodes the single encoded character starting at str.
 *
 * The length is taken from the lead byte via utf8_encoded_expected_len().
 * Returns the decoded value, or -1 if the lead byte is not a valid start or
 * one of the following bytes is not of the form 10xxxxxx. No check for
 * overlong encodings or value range is made here. */
static int utf8_encoded_to_unichar(const char *str) {
        /* Payload mask of the lead byte, indexed by sequence length. */
        static const int lead_payload[] = { 0, 0, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
        int expected, code, k;

        expected = utf8_encoded_expected_len(str);

        if (expected == 1)
                return (int)str[0];
        if (expected < 2 || expected > 6)
                return -1;

        code = (int)str[0] & lead_payload[expected];

        for (k = 1; k < expected; k++) {
                const int byte = (int)str[k];

                if ((byte & 0xc0) != 0x80)
                        return -1;

                code = (code << 6) | (byte & 0x3f);
        }

        return code;
}
378
/* Number of bytes needed to encode unichar: each threshold that the value
 * reaches adds one byte, from 1 (below 0x80) up to 6 (0x4000000 and above).
 * Negative values fall below the first threshold and yield 1. */
static int utf8_unichar_to_encoded_len(int unichar) {
        static const int thresholds[] = { 0x80, 0x800, 0x10000, 0x200000, 0x4000000 };
        int n = 0;

        while (n < 5 && unichar >= thresholds[n])
                n++;

        return n + 1;
}
393
/* Validates the single encoded character starting at str.
 *
 * Returns the number of bytes it occupies (1 for ASCII), or -1 if the lead
 * byte is invalid, a byte of the sequence lacks the high bit (which also
 * covers a string that ends early), the value is not encoded in its shortest
 * form, or is_unicode_valid() rejects the decoded value. */
int utf8_encoded_valid_unichar(const char *str) {
        int expected, decoded, k;

        expected = utf8_encoded_expected_len(str);

        switch (expected) {
        case 0:
                return -1;
        case 1:
                /* ascii is valid */
                return 1;
        default:
                break;
        }

        /* All bytes of the sequence must be present; a NUL has no high bit
         * and therefore ends the check before anything past it is read. */
        for (k = 0; k < expected; k++)
                if (!(str[k] & 0x80))
                        return -1;

        decoded = utf8_encoded_to_unichar(str);

        /* The encoded length has to match the value, and the value has to be
         * in the accepted range. */
        if (utf8_unichar_to_encoded_len(decoded) != expected ||
            !is_unicode_valid(decoded))
                return -1;

        return expected;
}
425
/* Tells whether byte c may be passed through unescaped.
 *
 * Allowed are ASCII letters and digits, the characters "#+-.:=@_", and any
 * character contained in the optional NUL-terminated string white (which may
 * be NULL).
 *
 * Returns 1 if c is allowed, 0 otherwise. The NUL byte is never allowed. */
int is_utf8_encoding_whitelisted(char c, const char *white) {
        /* strchr() also matches the terminator of the set it searches, so NUL
         * has to be excluded before the lookups below. */
        if (c == '\0')
                return 0;

        if ((c >= '0' && c <= '9') ||
            (c >= 'A' && c <= 'Z') ||
            (c >= 'a' && c <= 'z'))
                return 1;

        if (strchr("#+-.:=@_", c) != NULL)
                return 1;

        if (white != NULL && strchr(white, c) != NULL)
                return 1;

        return 0;
}
435
/* Encodes str into str_enc, a buffer of len bytes.
 *
 * Valid multi-byte UTF-8 sequences are copied unchanged. A backslash and
 * every other byte that is_utf8_encoding_whitelisted() does not accept is
 * written as the four characters "\xNN" (NN = two lowercase hex digits). All
 * remaining bytes are copied as they are. The result is NUL-terminated.
 *
 * Returns 0 on success, -1 if str or str_enc is NULL or the buffer is too
 * small. On failure the contents of str_enc are unspecified, but no byte
 * outside str_enc[0..len-1] is ever written. */
int udev_encode_string(const char *str, char *str_enc, size_t len) {
        static const char hex_digits[] = "0123456789abcdef";
        size_t i, j;

        if (str == NULL || str_enc == NULL)
                return -1;

        for (i = 0, j = 0; str[i] != '\0'; i++) {
                int seqlen;

                seqlen = utf8_encoded_valid_unichar(&str[i]);
                if (seqlen > 1) {
                        if (len-j < (size_t)seqlen)
                                return -1;
                        memcpy(&str_enc[j], &str[i], seqlen);
                        j += seqlen;
                        i += (seqlen-1);
                } else if (str[i] == '\\' || !is_utf8_encoding_whitelisted(str[i], NULL)) {
                        const unsigned char byte = (unsigned char) str[i];

                        if (len-j < 4)
                                return -1;

                        /* Exactly four bytes are stored here. A printf-style
                         * call would append a NUL as a fifth byte, which does
                         * not fit when only four bytes are left. */
                        str_enc[j++] = '\\';
                        str_enc[j++] = 'x';
                        str_enc[j++] = hex_digits[byte >> 4];
                        str_enc[j++] = hex_digits[byte & 0x0f];
                } else {
                        if (len-j < 1)
                                return -1;
                        str_enc[j] = str[i];
                        j++;
                }
        }

        /* Room for the terminator is required as well. */
        if (len-j < 1)
                return -1;

        str_enc[j] = '\0';
        return 0;
}