]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/shared/utf8.c
utf8: when escaping unprintable unichars, escape the whole unichar, not just the...
[thirdparty/systemd.git] / src / shared / utf8.c
CommitLineData
7f110ff9
LP
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
036ae95a 6 Copyright 2008-2011 Kay Sievers
7f110ff9
LP
7 Copyright 2012 Lennart Poettering
8
9 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
10 under the terms of the GNU Lesser General Public License as published by
11 the Free Software Foundation; either version 2.1 of the License, or
7f110ff9
LP
12 (at your option) any later version.
13
14 systemd is distributed in the hope that it will be useful, but
15 WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 17 Lesser General Public License for more details.
7f110ff9 18
5430f7f2 19 You should have received a copy of the GNU Lesser General Public License
7f110ff9
LP
20 along with systemd; If not, see <http://www.gnu.org/licenses/>.
21***/
22
036ae95a 23/* Parts of this file are based on the GLIB utf8 validation functions. The
7f110ff9
LP
24 * original license text follows. */
25
26/* gutf8.c - Operations on UTF-8 strings.
27 *
28 * Copyright (C) 1999 Tom Tromey
29 * Copyright (C) 2000 Red Hat, Inc.
30 *
31 * This library is free software; you can redistribute it and/or
23757887 32 * modify it under the terms of the GNU Library General Public
7f110ff9
LP
33 * License as published by the Free Software Foundation; either
34 * version 2 of the License, or (at your option) any later version.
35 *
36 * This library is distributed in the hope that it will be useful,
37 * but WITHOUT ANY WARRANTY; without even the implied warranty of
23757887
SK
38 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
39 * Library General Public License for more details.
7f110ff9 40 *
23757887
SK
41 * You should have received a copy of the GNU Library General Public
42 * License along with this library; if not, write to the Free Software
43 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
7f110ff9
LP
44 */
45
46#include <errno.h>
47#include <stdlib.h>
48#include <inttypes.h>
49#include <string.h>
50#include <stdbool.h>
51
52#include "utf8.h"
2e3d0692 53#include "util.h"
7f110ff9 54
7f110ff9
LP
/* Reports whether ch is a code point this file accepts: it must lie inside
 * the Unicode code space and outside the ranges rejected below. */
static inline bool is_unicode_valid(uint32_t ch) {
        bool past_code_space, utf16_reserved, fdd0_block, fffe_or_ffff;

        /* End of unicode space */
        past_code_space = ch >= 0x110000;

        /* Reserved area for UTF-16 */
        utf16_reserved = (ch & 0xFFFFF800) == 0xD800;

        /* Reserved */
        fdd0_block = ch >= 0xFDD0 && ch <= 0xFDEF;

        /* Low 16 bits are 0xFFFE or 0xFFFF (covers the byte-swapped BOM value) */
        fffe_or_ffff = (ch & 0xFFFE) == 0xFFFE;

        return !(past_code_space || utf16_reserved || fdd0_block || fffe_or_ffff);
}
68
ba961854
ZJS
/* Reports whether ch is a control character.
 *
 * Everything below ' ' is the C0 range; DEL (0x7F) together with 0x80..0x9F
 * (the C1 range) counts as well. '\t' and '\n' sit in the C0 range but are
 * deliberately not treated as control characters here. */
static bool is_unicode_control(uint32_t ch) {

        if (ch == '\t' || ch == '\n')
                return false;

        if (ch < ' ')
                return true;

        return ch >= 0x7F && ch <= 0x9F;
}
80
7991ac34
DR
/* Number of bytes the sequence starting at str claims to occupy, judged from
 * its first byte alone. Returns 1..6, or 0 if the first byte cannot start a
 * sequence (e.g. a continuation byte). */
static int utf8_encoded_expected_len(const char *str) {
        /* Entry n describes the lead byte of an (n+1)-byte sequence. */
        static const struct {
                unsigned char mask;
                unsigned char pattern;
        } lead_forms[] = {
                { 0x80, 0x00 },
                { 0xe0, 0xc0 },
                { 0xf0, 0xe0 },
                { 0xf8, 0xf0 },
                { 0xfc, 0xf8 },
                { 0xfe, 0xfc },
        };
        unsigned char lead;
        size_t n;

        assert(str);

        lead = (unsigned char) str[0];

        for (n = 0; n < sizeof(lead_forms) / sizeof(lead_forms[0]); n++)
                if ((lead & lead_forms[n].mask) == lead_forms[n].pattern)
                        return (int) n + 1;

        return 0;
}
ba961854 103
/* Decode the single encoded character at the start of str.
 *
 * Returns the decoded value, or -EINVAL if the first byte cannot start a
 * sequence or one of the following bytes is not a continuation byte
 * (10xxxxxx). No range or overlong-encoding checks are done here. */
int utf8_encoded_to_unichar(const char *str) {
        /* Payload bits carried by the first byte, indexed by sequence length. */
        static const int lead_payload_mask[] = { 0x00, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
        int seq_len, value, pos;

        assert(str);

        seq_len = utf8_encoded_expected_len(str);
        if (seq_len < 1 || seq_len > 6)
                return -EINVAL;

        value = (int) str[0] & lead_payload_mask[seq_len];

        for (pos = 1; pos < seq_len; pos++) {
                int byte = (int) str[pos];

                if ((byte & 0xc0) != 0x80)
                        return -EINVAL;

                /* Each continuation byte contributes six more bits. */
                value = (value << 6) | (byte & 0x3f);
        }

        return value;
}
143
/* Checks that the first length bytes of str are valid UTF-8 and contain no
 * control characters.
 *
 * str:     buffer to examine; it need not be NUL-terminated.
 * length:  number of bytes of str to examine.
 * newline: if false, a '\n' in the input also makes the check fail.
 *
 * Returns true if every character passes, false otherwise (including when a
 * multi-byte sequence is cut off by length). */
bool utf8_is_printable_newline(const char* str, size_t length, bool newline) {
        const uint8_t *p;

        assert(str);

        for (p = (const uint8_t*) str; length;) {
                int expected_len, encoded_len, val;

                /* Look only at the lead byte first: the validator below
                 * inspects as many bytes as the sequence claims to have, so
                 * make sure they all lie within the caller's buffer before
                 * calling it. */
                expected_len = utf8_encoded_expected_len((const char*) p);
                if (expected_len <= 0 ||
                    (size_t) expected_len > length)
                        return false;

                encoded_len = utf8_encoded_valid_unichar((const char *) p);
                if (encoded_len < 0 ||
                    (size_t) encoded_len > length)
                        return false;

                val = utf8_encoded_to_unichar((const char*) p);
                if (val < 0 ||
                    is_unicode_control(val) ||
                    (!newline && val == '\n'))
                        return false;

                length -= encoded_len;
                p += encoded_len;
        }

        return true;
}
169
7991ac34
DR
/* Walks the NUL-terminated string one encoded character at a time.
 * Returns str itself if every character validates, NULL otherwise. */
const char *utf8_is_valid(const char *str) {
        const char *cursor;

        assert(str);

        cursor = str;
        while (*cursor != '\0') {
                int step = utf8_encoded_valid_unichar(cursor);

                if (step < 0)
                        return NULL;

                cursor += step;
        }

        return str;
}
187
550a40ec
ZJS
188char *utf8_escape_invalid(const char *str) {
189 char *p, *s;
190
191 assert(str);
192
193 p = s = malloc(strlen(str) * 4 + 1);
194 if (!p)
195 return NULL;
196
197 while (*str) {
198 int len;
199
200 len = utf8_encoded_valid_unichar(str);
201 if (len > 0) {
202 s = mempcpy(s, str, len);
203 str += len;
204 } else {
3c6d3052 205 s = stpcpy(s, UTF8_REPLACEMENT_CHARACTER);
550a40ec
ZJS
206 str += 1;
207 }
208 }
7e8185ef 209
550a40ec
ZJS
210 *s = '\0';
211
212 return p;
213}
214
fec84576
WC
215char *utf8_escape_non_printable(const char *str) {
216 char *p, *s;
217
218 assert(str);
219
220 p = s = malloc(strlen(str) * 4 + 1);
221 if (!p)
222 return NULL;
223
224 while (*str) {
225 int len;
226
227 len = utf8_encoded_valid_unichar(str);
228 if (len > 0) {
229 if (utf8_is_printable(str, len)) {
230 s = mempcpy(s, str, len);
231 str += len;
232 } else {
3c6d3052 233 while (len > 0) {
fec84576
WC
234 *(s++) = '\\';
235 *(s++) = 'x';
236 *(s++) = hexchar((int) *str >> 4);
237 *(s++) = hexchar((int) *str);
fec84576 238
3c6d3052
LP
239 str += 1;
240 len --;
241 }
fec84576
WC
242 }
243 } else {
3c6d3052 244 s = stpcpy(s, UTF8_REPLACEMENT_CHARACTER);
fec84576
WC
245 str += 1;
246 }
247 }
248
249 *s = '\0';
250
251 return p;
252}
253
7f110ff9
LP
/* Returns str if the NUL-terminated string consists solely of 7-bit bytes,
 * NULL as soon as a byte with a value of 128 or above is found. */
char *ascii_is_valid(const char *str) {
        size_t i;

        assert(str);

        for (i = 0; str[i] != '\0'; i++)
                if ((unsigned char) str[i] & 0x80)
                        return NULL;

        return (char*) str;
}
265
2e3d0692
LP
/* Converts a little-endian UTF-16 byte buffer into a newly allocated,
 * NUL-terminated UTF-8 string.
 *
 * s:      input buffer, read byte-wise (no alignment requirement).
 * length: size of the input in BYTES, not in 16-bit units. A trailing odd
 *         byte is ignored.
 *
 * Conversion stops early at the first 16-bit zero unit. Surrogate pairs are
 * combined into a single code point and written as a 4-byte sequence;
 * surrogates that are not part of a well-formed pair are skipped, so the
 * result does not contain encoded surrogates.
 *
 * Returns NULL if allocation fails or the required size cannot be
 * represented; the caller frees the result. */
char *utf16_to_utf8(const void *s, size_t length) {
        const uint8_t *f, *end;
        uint8_t *t;
        char *r;

        /* Keep the size computation below from wrapping around. */
        if (length > (SIZE_MAX - 1) / 3)
                return NULL;

        /* Worst case is three output bytes per two input bytes, plus NUL. */
        r = malloc((length * 3 + 1) / 2 + 1);
        if (!r)
                return NULL;

        t = (uint8_t*) r;
        f = s;
        end = f + length;

        /* Only consume complete 16-bit units, never read past the buffer. */
        while (end - f >= 2) {
                uint16_t w1, w2;
                uint32_t c;

                w1 = (uint16_t) ((f[1] << 8) | f[0]);
                f += 2;

                if (w1 == 0)
                        break;

                if ((w1 & 0xfc00) == 0xdc00)
                        continue; /* trailing surrogate without a leading one */

                if ((w1 & 0xfc00) == 0xd800) {
                        if (end - f < 2)
                                break; /* leading surrogate at end of input */

                        w2 = (uint16_t) ((f[1] << 8) | f[0]);
                        if ((w2 & 0xfc00) != 0xdc00)
                                continue; /* unpaired; w2 is looked at again as a fresh unit */

                        f += 2;
                        c = 0x10000 + (((uint32_t) (w1 & 0x3ff) << 10) | (uint32_t) (w2 & 0x3ff));
                } else
                        c = w1;

                if (c < 0x80) {
                        *(t++) = (uint8_t) c;
                } else if (c < 0x800) {
                        *(t++) = (uint8_t) (0xc0 | (c >> 6));
                        *(t++) = (uint8_t) (0x80 | (c & 0x3f));
                } else if (c < 0x10000) {
                        *(t++) = (uint8_t) (0xe0 | (c >> 12));
                        *(t++) = (uint8_t) (0x80 | ((c >> 6) & 0x3f));
                        *(t++) = (uint8_t) (0x80 | (c & 0x3f));
                } else {
                        *(t++) = (uint8_t) (0xf0 | (c >> 18));
                        *(t++) = (uint8_t) (0x80 | ((c >> 12) & 0x3f));
                        *(t++) = (uint8_t) (0x80 | ((c >> 6) & 0x3f));
                        *(t++) = (uint8_t) (0x80 | (c & 0x3f));
                }
        }

        *t = 0;

        return r;
}
02a36bc9 301
02a36bc9
DR
/* Number of bytes needed to encode unichar: the shortest sequence length
 * (1..6) whose payload can hold the value. */
static int utf8_unichar_to_encoded_len(int unichar) {
        /* exclusive_limit[n-1] is the first value too large for n bytes. */
        static const int exclusive_limit[] = { 0x80, 0x800, 0x10000, 0x200000, 0x4000000 };
        int len;

        for (len = 1; len <= 5; len++)
                if (unichar < exclusive_limit[len - 1])
                        return len;

        return 6;
}
318
/* Validate the single encoded character at the start of str.
 *
 * Returns the number of bytes it occupies, or -EINVAL if the lead byte is
 * not a valid start, a byte of the sequence is missing, the value was
 * encoded with more bytes than necessary, or the value is rejected by
 * is_unicode_valid(). */
int utf8_encoded_valid_unichar(const char *str) {
        int seq_len, codepoint, pos;

        assert(str);

        seq_len = utf8_encoded_expected_len(str);
        switch (seq_len) {
        case 0:
                return -EINVAL;
        case 1:
                /* ascii is valid */
                return 1;
        default:
                break;
        }

        /* Every byte of a multi-byte sequence has its top bit set; a byte
         * without it (including the terminating NUL) means the sequence is
         * cut short. */
        for (pos = 0; pos < seq_len; pos++)
                if (!(str[pos] & 0x80))
                        return -EINVAL;

        codepoint = utf8_encoded_to_unichar(str);

        /* A decode failure yields a negative value, which maps to length 1
         * and is therefore caught by the length comparison as well. */
        if (utf8_unichar_to_encoded_len(codepoint) != seq_len ||
            !is_unicode_valid(codepoint))
                return -EINVAL;

        return seq_len;
}