]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/shared/utf8.c
man: document ARM root partition types
[thirdparty/systemd.git] / src / shared / utf8.c
CommitLineData
7f110ff9
LP
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
036ae95a 6 Copyright 2008-2011 Kay Sievers
7f110ff9
LP
7 Copyright 2012 Lennart Poettering
8
9 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
10 under the terms of the GNU Lesser General Public License as published by
11 the Free Software Foundation; either version 2.1 of the License, or
7f110ff9
LP
12 (at your option) any later version.
13
14 systemd is distributed in the hope that it will be useful, but
15 WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 17 Lesser General Public License for more details.
7f110ff9 18
5430f7f2 19 You should have received a copy of the GNU Lesser General Public License
7f110ff9
LP
20 along with systemd; If not, see <http://www.gnu.org/licenses/>.
21***/
22
036ae95a 23/* Parts of this file are based on the GLIB utf8 validation functions. The
7f110ff9
LP
24 * original license text follows. */
25
26/* gutf8.c - Operations on UTF-8 strings.
27 *
28 * Copyright (C) 1999 Tom Tromey
29 * Copyright (C) 2000 Red Hat, Inc.
30 *
31 * This library is free software; you can redistribute it and/or
23757887 32 * modify it under the terms of the GNU Library General Public
7f110ff9
LP
33 * License as published by the Free Software Foundation; either
34 * version 2 of the License, or (at your option) any later version.
35 *
36 * This library is distributed in the hope that it will be useful,
37 * but WITHOUT ANY WARRANTY; without even the implied warranty of
23757887
SK
38 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
39 * Library General Public License for more details.
7f110ff9 40 *
23757887
SK
41 * You should have received a copy of the GNU Library General Public
42 * License along with this library; if not, write to the Free Software
43 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
7f110ff9
LP
44 */
45
46#include <errno.h>
47#include <stdlib.h>
48#include <inttypes.h>
49#include <string.h>
50#include <stdbool.h>
51
52#include "utf8.h"
2e3d0692 53#include "util.h"
7f110ff9 54
7f110ff9
LP
/* Tells whether 'ch' is an acceptable code point: true unless it lies
 * beyond the Unicode code space, inside the UTF-16 surrogate area, inside
 * the reserved 0xFDD0..0xFDEF block, or has 0xFFFE/0xFFFF as its low
 * 16 bits. */
static inline bool is_unicode_valid(uint32_t ch) {
        /* End of the Unicode code space */
        const bool beyond_unicode = ch >= 0x110000;
        /* 0xD800..0xDFFF: area reserved for UTF-16 surrogates */
        const bool surrogate = (ch & 0xFFFFF800) == 0xD800;
        /* Reserved block */
        const bool reserved_block = ch >= 0xFDD0 && ch <= 0xFDEF;
        /* Only the low 16 bits are inspected, so this matches 0xFFFE and
         * 0xFFFF in every plane (0x1FFFE, 0x1FFFF, ...). 0xFEFF itself is
         * not matched. */
        const bool plane_tail = (ch & 0xFFFE) == 0xFFFE;

        return !(beyond_unicode || surrogate || reserved_block || plane_tail);
}
68
ba961854
ZJS
/* Tells whether 'ch' is a control character we refuse to treat as
 * printable. */
static bool is_unicode_control(uint32_t ch) {

        /*
          0 to ' '-1 is the C0 range.
          DEL=0x7F, and DEL+1 to 0x9F is C1 range.
          '\t' and '\n' are in the C0 range, but are exempted here.
         */

        if (ch == '\t' || ch == '\n')
                return false;

        if (ch < ' ')
                return true;

        return ch >= 0x7F && ch <= 0x9F;
}
80
7991ac34
DR
/* Number of bytes the encoded character starting at str[0] is expected to
 * occupy, judged from its first byte alone: 1..6, or 0 if the byte matches
 * none of the known lead-byte patterns. */
static int utf8_encoded_expected_len(const char *str) {
        /* Entry k describes the lead byte of a (k+1)-byte sequence. */
        static const struct {
                unsigned char mask;
                unsigned char lead;
        } patterns[] = {
                { 0x80, 0x00 },
                { 0xe0, 0xc0 },
                { 0xf0, 0xe0 },
                { 0xf8, 0xf0 },
                { 0xfc, 0xf8 },
                { 0xfe, 0xfc },
        };
        const unsigned char first = (unsigned char) str[0];
        unsigned k;

        for (k = 0; k < sizeof(patterns) / sizeof(patterns[0]); k++)
                if ((first & patterns[k].mask) == patterns[k].lead)
                        return (int) k + 1;

        return 0;
}
ba961854 99
/* Decode the single encoded character that starts at 'str'.
 *
 * Returns the decoded value, or -1 if the first byte is not a recognised
 * lead byte or one of the following bytes is not of the form 10xxxxxx.
 * Reads as many bytes as the lead byte announces, stopping early at the
 * first byte that is not a continuation byte. */
int utf8_encoded_to_unichar(const char *str) {
        /* Payload bits carried by the lead byte, indexed by sequence length. */
        static const int lead_mask[] = { 0, 0, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
        const int n = utf8_encoded_expected_len(str);
        int value;
        int k;

        if (n < 1 || n > 6)
                return -1;

        /* A one-byte sequence is the character itself */
        if (n == 1)
                return (int) str[0];

        value = (int) str[0] & lead_mask[n];

        for (k = 1; k < n; k++) {
                const int b = (int) str[k];

                if ((b & 0xc0) != 0x80)
                        return -1;

                /* each continuation byte contributes six more bits */
                value = (value << 6) | (b & 0x3f);
        }

        return value;
}
138
7991ac34
DR
/* Checks that the first 'length' bytes of 'str' consist solely of valid
 * encoded characters, none of which is a control character (as judged by
 * is_unicode_control()).
 *
 * Returns false as soon as an invalid sequence, a control character, or a
 * character that would extend past 'length' is found; true otherwise
 * (including for length == 0). */
bool utf8_is_printable(const char* str, size_t length) {
        const uint8_t *p;

        assert(str);

        for (p = (const uint8_t*) str; length > 0;) {
                int expected_len, encoded_len, val;

                /* Look at the lead byte only, and refuse a character that
                 * claims more bytes than are left. Without this check
                 * 'length' (a size_t) would wrap around on the subtraction
                 * below and the loop would run past the buffer. */
                expected_len = utf8_encoded_expected_len((const char*) p);
                if (expected_len <= 0 || (size_t) expected_len > length)
                        return false;

                /* On success this returns exactly the expected length, so
                 * encoded_len <= length holds from here on. */
                encoded_len = utf8_encoded_valid_unichar((const char*) p);
                if (encoded_len < 0)
                        return false;

                val = utf8_encoded_to_unichar((const char*) p);
                if (val < 0 || is_unicode_control(val))
                        return false;

                length -= encoded_len;
                p += encoded_len;
        }

        return true;
}
157
7991ac34
DR
/* Walks the NUL-terminated string 'str' one encoded character at a time.
 * Returns 'str' itself if every character passes
 * utf8_encoded_valid_unichar(), NULL on the first one that does not. */
const char *utf8_is_valid(const char *str) {
        const char *cursor;

        assert(str);

        cursor = str;
        while (*cursor != '\0') {
                const int step = utf8_encoded_valid_unichar(cursor);

                if (step < 0)
                        return NULL;

                cursor += step;
        }

        return str;
}
176
550a40ec
ZJS
177char *utf8_escape_invalid(const char *str) {
178 char *p, *s;
179
180 assert(str);
181
182 p = s = malloc(strlen(str) * 4 + 1);
183 if (!p)
184 return NULL;
185
186 while (*str) {
187 int len;
188
189 len = utf8_encoded_valid_unichar(str);
190 if (len > 0) {
191 s = mempcpy(s, str, len);
192 str += len;
193 } else {
194 s = mempcpy(s, UTF8_REPLACEMENT_CHARACTER, strlen(UTF8_REPLACEMENT_CHARACTER));
195 str += 1;
196 }
197 }
198 *s = '\0';
199
200 return p;
201}
202
7f110ff9
LP
/* Returns 'str' if the NUL-terminated string contains only bytes below
 * 128, NULL otherwise. */
char *ascii_is_valid(const char *str) {
        size_t i;

        assert(str);

        for (i = 0; str[i] != '\0'; i++)
                /* a set high bit means the byte is >= 128 */
                if (((unsigned char) str[i] & 0x80) != 0)
                        return NULL;

        return (char*) str;
}
214
2e3d0692
LP
/* Converts 'length' bytes of little-endian UTF-16 at 's' into a newly
 * allocated, NUL-terminated UTF-8 string.
 *
 * - Conversion stops at the first 16-bit unit that is zero.
 * - Only complete 16-bit units are read; a trailing odd byte is ignored.
 * - A leading surrogate followed by a trailing surrogate is combined into
 *   one code point and written as a four-byte sequence.
 * - Surrogates that are not part of such a pair are dropped.
 *
 * Returns NULL if the allocation fails or the required size cannot be
 * represented; the caller owns the result. */
char *utf16_to_utf8(const void *s, size_t length) {
        const uint8_t *f = s;
        uint8_t *t;
        char *r;
        size_t i;

        /* The size computation below must not wrap around */
        if (length > (SIZE_MAX - 1) / 3)
                return NULL;

        /* Two input bytes yield at most three output bytes (a surrogate
         * pair turns four input bytes into four output bytes), plus NUL. */
        r = malloc((length * 3 + 1) / 2 + 1);
        if (!r)
                return NULL;

        t = (uint8_t*) r;

        /* "i + 1 < length" guarantees both bytes of the unit are in bounds */
        for (i = 0; i + 1 < length; i += 2) {
                uint32_t c;

                c = ((uint32_t) f[i + 1] << 8) | f[i];

                if (c == 0)
                        break;

                /* Trailing surrogate without a preceding leading one */
                if (c >= 0xdc00 && c <= 0xdfff)
                        continue;

                if (c >= 0xd800 && c <= 0xdbff) {
                        uint32_t w2;

                        /* Leading surrogate at the very end of the input */
                        if (length - i < 4)
                                break;

                        w2 = ((uint32_t) f[i + 3] << 8) | f[i + 2];

                        /* Not followed by a trailing surrogate: drop it and
                         * let the next iteration look at that unit afresh */
                        if (w2 < 0xdc00 || w2 > 0xdfff)
                                continue;

                        c = 0x10000 + (((c & 0x3ff) << 10) | (w2 & 0x3ff));
                        i += 2;
                }

                if (c < 0x80) {
                        *(t++) = (uint8_t) c;
                } else if (c < 0x800) {
                        *(t++) = (uint8_t) (0xc0 | (c >> 6));
                        *(t++) = (uint8_t) (0x80 | (c & 0x3f));
                } else if (c < 0x10000) {
                        *(t++) = (uint8_t) (0xe0 | (c >> 12));
                        *(t++) = (uint8_t) (0x80 | ((c >> 6) & 0x3f));
                        *(t++) = (uint8_t) (0x80 | (c & 0x3f));
                } else {
                        *(t++) = (uint8_t) (0xf0 | (c >> 18));
                        *(t++) = (uint8_t) (0x80 | ((c >> 12) & 0x3f));
                        *(t++) = (uint8_t) (0x80 | ((c >> 6) & 0x3f));
                        *(t++) = (uint8_t) (0x80 | (c & 0x3f));
                }
        }

        *t = 0;

        return r;
}
02a36bc9 250
02a36bc9
DR
/* Number of bytes needed to encode the value 'unichar': 1..6. Any value
 * below 0x80, including a negative one, yields 1. */
static int utf8_unichar_to_encoded_len(int unichar) {
        /* limit[k] is the first value that no longer fits in k+1 bytes */
        static const int limit[] = { 0x80, 0x800, 0x10000, 0x200000, 0x4000000 };
        unsigned k;

        for (k = 0; k < sizeof(limit) / sizeof(limit[0]); k++)
                if (unichar < limit[k])
                        return (int) k + 1;

        return 6;
}
265
/* Validate the single encoded character starting at 'str'.
 *
 * Returns its length in bytes (1 for plain ASCII), or -1 if the lead byte
 * is not recognised, a following byte lacks the high bit, the value was
 * encoded with a length other than the one utf8_unichar_to_encoded_len()
 * gives for it, or is_unicode_valid() rejects the value. */
int utf8_encoded_valid_unichar(const char *str) {
        const int expected = utf8_encoded_expected_len(str);
        int decoded;
        int k;

        switch (expected) {
        case 0:
                /* unrecognised lead byte */
                return -1;
        case 1:
                /* ascii is valid */
                return 1;
        default:
                break;
        }

        /* every byte of a multi-byte sequence has its high bit set; a NUL
         * terminator inside the sequence fails this test, which stops the
         * scan before it runs past the end of the string */
        for (k = 0; k < expected; k++)
                if (!(str[k] & 0x80))
                        return -1;

        decoded = utf8_encoded_to_unichar(str);

        /* A decoding failure (-1) is caught here too: its computed length
         * is 1, which never equals a multi-byte 'expected'. */
        if (utf8_unichar_to_encoded_len(decoded) != expected ||
            !is_unicode_valid(decoded))
                return -1;

        return expected;
}