]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/shared/utf8.c
fix a couple of issues found with llvm-analyze
[thirdparty/systemd.git] / src / shared / utf8.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2012 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 /* This file is based on the GLIB utf8 validation functions. The
23 * original license text follows. */
24
25 /* gutf8.c - Operations on UTF-8 strings.
26 *
27 * Copyright (C) 1999 Tom Tromey
28 * Copyright (C) 2000 Red Hat, Inc.
29 *
30 * This library is free software; you can redistribute it and/or
31 * modify it under the terms of the GNU Lesser General Public
32 * License as published by the Free Software Foundation; either
33 * version 2 of the License, or (at your option) any later version.
34 *
35 * This library is distributed in the hope that it will be useful,
36 * but WITHOUT ANY WARRANTY; without even the implied warranty of
37 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
38 * Lesser General Public License for more details.
39 *
40 * You should have received a copy of the GNU Lesser General Public
41 * License along with this library; if not, write to the
42 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
43 * Boston, MA 02111-1307, USA.
44 */
45
46 #include <errno.h>
47 #include <stdlib.h>
48 #include <inttypes.h>
49 #include <string.h>
50 #include <stdbool.h>
51
52 #include "utf8.h"
53
54 #define FILTER_CHAR '_'
55
/*
 * Tells whether ch is a code point that this file accepts inside a UTF-8
 * string. Four ranges are refused:
 *   - everything from 0x110000 upwards (beyond the Unicode code space),
 *   - 0xD800..0xDFFF (reserved for UTF-16 surrogates),
 *   - 0xFDD0..0xFDEF (reserved block),
 *   - any value whose low 16 bits are 0xFFFE or 0xFFFF (this includes the
 *     byte-swapped BOM 0xFFFE).
 */
static inline bool is_unicode_valid(uint32_t ch) {
        const bool beyond_unicode = ch >= 0x110000;
        const bool utf16_surrogate = (ch & 0xFFFFF800) == 0xD800;
        const bool reserved_block = ch >= 0xFDD0 && ch <= 0xFDEF;
        const bool plane_tail = (ch & 0xFFFE) == 0xFFFE;

        return !(beyond_unicode || utf16_surrogate || reserved_block || plane_tail);
}
69
/* True when ch has the bit pattern 10xxxxxx, i.e. is a UTF-8 continuation byte. */
static inline bool is_continuation_char(uint8_t ch) {
        return (ch & 0xc0) == 0x80;
}
75
/*
 * Appends the six payload bits of ch to the code point accumulated in *u_ch:
 * the accumulator is shifted left by six and the low six bits of ch are
 * or-ed in. The top two bits of ch are ignored.
 */
static inline void merge_continuation_char(uint32_t *u_ch, uint8_t ch) {
        *u_ch = (*u_ch << 6) | (uint32_t) (ch & 0x3f);
}
80
/*
 * Tells whether ch is a control character that should be rejected:
 * the C0 range (everything below ' ') except '\t', which is tolerated as
 * harmless and commonly used, plus DEL (0x7F) and the C1 range up to 0x9F.
 */
static bool is_unicode_control(uint32_t ch) {
        if (ch == '\t')
                return false;

        if (ch < ' ')
                return true;

        return ch >= 0x7F && ch <= 0x9F;
}
92
/*
 * Checks that the first `length` bytes of str are well-formed UTF-8 and
 * contain no control characters other than '\t'.
 *
 * Returns str (with the const qualifier cast away) when every byte passes,
 * and NULL otherwise. NULL is returned for:
 *   - a byte that is neither ASCII nor a 2-, 3- or 4-byte lead byte,
 *   - a multi-byte sequence that would extend past `length`,
 *   - a non-continuation byte inside a multi-byte sequence,
 *   - an overlong encoding (decoded value below the minimum for its size),
 *   - a decoded value for which is_unicode_control() is true; this includes
 *     a NUL byte inside the checked range.
 *
 * The decoded code points are not checked against the Unicode range
 * (surrogates, values above 0x10FFFF); only the checks above are made.
 */
char* utf8_is_printable_n(const char* str, size_t length) {
        const uint8_t *p;

        assert(str);

        p = (const uint8_t*) str;

        while (length > 0) {
                uint32_t val, min;
                size_t size, i;

                if (*p < 128) {
                        size = 1;
                        min = 0;
                        val = *p;
                } else if ((*p & 0xe0) == 0xc0) {        /* 110xxxxx two-char seq. */
                        size = 2;
                        min = (1 << 7);
                        /* Keep all five payload bits; the overlong leads
                         * 0xC0/0xC1 are rejected by the val < min check. */
                        val = (uint32_t) (*p & 0x1f);
                } else if ((*p & 0xf0) == 0xe0) {        /* 1110xxxx three-char seq. */
                        size = 3;
                        min = (1 << 11);
                        val = (uint32_t) (*p & 0x0f);
                } else if ((*p & 0xf8) == 0xf0) {        /* 11110xxx four-char seq. */
                        size = 4;
                        min = (1 << 16);
                        val = (uint32_t) (*p & 0x07);
                } else
                        return NULL;

                /* Never look at bytes beyond the caller-supplied length. */
                if (size > length)
                        return NULL;

                for (i = 1; i < size; i++) {
                        if (!is_continuation_char(p[i]))
                                return NULL;
                        merge_continuation_char(&val, p[i]);
                }

                if (val < min)
                        return NULL;

                if (is_unicode_control(val))
                        return NULL;

                p += size;
                length -= size;
        }

        return (char*) str;
}
151
152 static char* utf8_validate(const char *str, char *output) {
153 uint32_t val = 0;
154 uint32_t min = 0;
155 const uint8_t *p, *last;
156 int size;
157 uint8_t *o;
158
159 assert(str);
160
161 o = (uint8_t*) output;
162 for (p = (const uint8_t*) str; *p; p++) {
163 if (*p < 128) {
164 if (o)
165 *o = *p;
166 } else {
167 last = p;
168
169 if ((*p & 0xe0) == 0xc0) { /* 110xxxxx two-char seq. */
170 size = 2;
171 min = 128;
172 val = (uint32_t) (*p & 0x1e);
173 goto ONE_REMAINING;
174 } else if ((*p & 0xf0) == 0xe0) { /* 1110xxxx three-char seq.*/
175 size = 3;
176 min = (1 << 11);
177 val = (uint32_t) (*p & 0x0f);
178 goto TWO_REMAINING;
179 } else if ((*p & 0xf8) == 0xf0) { /* 11110xxx four-char seq */
180 size = 4;
181 min = (1 << 16);
182 val = (uint32_t) (*p & 0x07);
183 } else
184 goto error;
185
186 p++;
187 if (!is_continuation_char(*p))
188 goto error;
189 merge_continuation_char(&val, *p);
190
191 TWO_REMAINING:
192 p++;
193 if (!is_continuation_char(*p))
194 goto error;
195 merge_continuation_char(&val, *p);
196
197 ONE_REMAINING:
198 p++;
199 if (!is_continuation_char(*p))
200 goto error;
201 merge_continuation_char(&val, *p);
202
203 if (val < min)
204 goto error;
205
206 if (!is_unicode_valid(val))
207 goto error;
208
209 if (o) {
210 memcpy(o, last, (size_t) size);
211 o += size;
212 }
213
214 continue;
215
216 error:
217 if (o) {
218 *o = FILTER_CHAR;
219 p = last; /* We retry at the next character */
220 } else
221 goto failure;
222 }
223
224 if (o)
225 o++;
226 }
227
228 if (o) {
229 *o = '\0';
230 return output;
231 }
232
233 return (char*) str;
234
235 failure:
236 return NULL;
237 }
238
/*
 * Validates str as UTF-8 by running utf8_validate() without an output
 * buffer, and hands back whatever that call returns.
 */
char* utf8_is_valid (const char *str) {
        char *verdict;

        verdict = utf8_validate(str, NULL);
        return verdict;
}
242
/*
 * Produces a filtered copy of str: a buffer of strlen(str) + 1 bytes is
 * allocated with malloc() and passed to utf8_validate() as the output
 * buffer. Returns the result of that call, or NULL if the allocation fails.
 * The buffer is heap-allocated here, so the caller is responsible for
 * free()ing the returned string.
 */
char* utf8_filter (const char *str) {
        size_t capacity;
        char *buffer;

        assert(str);

        capacity = strlen(str) + 1;
        buffer = malloc(capacity);
        if (buffer == NULL)
                return NULL;

        return utf8_validate(str, buffer);
}
254
/*
 * Checks that the NUL-terminated string str consists solely of 7-bit
 * bytes. Returns str (const cast away) if so, NULL as soon as a byte with
 * value 128 or above is found.
 */
char *ascii_is_valid(const char *str) {
        const unsigned char *cursor;

        assert(str);

        cursor = (const unsigned char*) str;
        while (*cursor != 0) {
                if (*cursor >= 128)
                        return NULL;
                cursor++;
        }

        return (char*) str;
}
266
/*
 * Returns a newly malloc()ed copy of str with every byte of value 128 or
 * above dropped (not replaced). The copy is NUL-terminated and never longer
 * than str. Returns NULL if the allocation fails. The caller owns the
 * returned buffer and must free() it.
 */
char *ascii_filter(const char *str) {
        const char *s;  /* read cursor; const so the qualifier of str is kept */
        char *r, *d;
        size_t l;

        assert(str);

        l = strlen(str);
        r = malloc(l + 1);
        if (!r)
                return NULL;

        for (s = str, d = r; *s; s++)
                if ((unsigned char) *s < 128)
                        *(d++) = *s;

        *d = 0;

        return r;
}