]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/shared/utf8.c
utf8: ascii_filter() is unused, let's remove it
[thirdparty/systemd.git] / src / shared / utf8.c
CommitLineData
7f110ff9
LP
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
036ae95a 6 Copyright 2008-2011 Kay Sievers
7f110ff9
LP
7 Copyright 2012 Lennart Poettering
8
9 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
10 under the terms of the GNU Lesser General Public License as published by
11 the Free Software Foundation; either version 2.1 of the License, or
7f110ff9
LP
12 (at your option) any later version.
13
14 systemd is distributed in the hope that it will be useful, but
15 WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 17 Lesser General Public License for more details.
7f110ff9 18
5430f7f2 19 You should have received a copy of the GNU Lesser General Public License
7f110ff9
LP
20 along with systemd; If not, see <http://www.gnu.org/licenses/>.
21***/
22
036ae95a 23/* Parts of this file are based on the GLIB utf8 validation functions. The
7f110ff9
LP
24 * original license text follows. */
25
26/* gutf8.c - Operations on UTF-8 strings.
27 *
28 * Copyright (C) 1999 Tom Tromey
29 * Copyright (C) 2000 Red Hat, Inc.
30 *
31 * This library is free software; you can redistribute it and/or
23757887 32 * modify it under the terms of the GNU Library General Public
7f110ff9
LP
33 * License as published by the Free Software Foundation; either
34 * version 2 of the License, or (at your option) any later version.
35 *
36 * This library is distributed in the hope that it will be useful,
37 * but WITHOUT ANY WARRANTY; without even the implied warranty of
23757887
SK
38 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
39 * Library General Public License for more details.
7f110ff9 40 *
23757887
SK
41 * You should have received a copy of the GNU Library General Public
42 * License along with this library; if not, write to the Free Software
43 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
7f110ff9
LP
44 */
45
46#include <errno.h>
47#include <stdlib.h>
48#include <inttypes.h>
49#include <string.h>
50#include <stdbool.h>
51
52#include "utf8.h"
2e3d0692 53#include "util.h"
7f110ff9 54
7f110ff9
LP
/* Tells whether ch is a code point this file accepts: it must lie inside the
 * Unicode range and outside each of the excluded areas tested below. */
static inline bool is_unicode_valid(uint32_t ch) {
        /* At or past 0x110000: beyond the end of the Unicode space. */
        const bool beyond_unicode = ch >= 0x110000;
        /* 0xD800..0xDFFF: area reserved for UTF-16. */
        const bool utf16_reserved = (ch & 0xFFFFF800) == 0xD800;
        /* 0xFDD0..0xFDEF: reserved. */
        const bool reserved_block = ch >= 0xFDD0 && ch <= 0xFDEF;
        /* Low 16 bits equal to 0xFFFE or 0xFFFF (the mask ignores the upper bits). */
        const bool low_bits_fffe_ffff = (ch & 0xFFFE) == 0xFFFE;

        return !(beyond_unicode || utf16_reserved || reserved_block || low_bits_fffe_ffff);
}
68
ba961854
ZJS
/* Tells whether ch counts as a control character here.
 *
 * The C0 range is everything below ' ', and the range from DEL (0x7F) up to
 * 0x9F covers DEL plus C1. Tab and newline sit in C0 but are deliberately
 * not reported as control characters. */
static bool is_unicode_control(uint32_t ch) {
        /* DEL and the C1 range. */
        if (ch >= 0x7F && ch <= 0x9F)
                return true;

        /* Anything else at or above ' ' is not a control character. */
        if (ch >= ' ')
                return false;

        /* Remaining values are in C0; only '\t' and '\n' are let through. */
        return !(ch == '\t' || ch == '\n');
}
80
7991ac34
DR
/* Number of bytes the encoded character starting at str[0] is expected to
 * occupy, judged from its first byte only. Returns 1 for a byte below 0x80,
 * 2..6 for a recognised multi-byte lead byte, and 0 for any other byte
 * (e.g. a continuation byte). */
static int utf8_encoded_expected_len(const char *str) {
        /* Lead-byte prefixes for 2- to 6-byte sequences, in order. */
        static const struct {
                unsigned char mask;
                unsigned char pattern;
        } prefixes[] = {
                { 0xe0, 0xc0 },
                { 0xf0, 0xe0 },
                { 0xf8, 0xf0 },
                { 0xfc, 0xf8 },
                { 0xfe, 0xfc },
        };
        const unsigned char lead = (unsigned char) str[0];
        int k;

        if (lead < 0x80)
                return 1;

        for (k = 0; k < (int) (sizeof(prefixes) / sizeof(prefixes[0])); k++)
                if ((lead & prefixes[k].mask) == prefixes[k].pattern)
                        return k + 2;

        return 0;
}
ba961854 99
/* Decode the single encoded character that starts at str.
 *
 * Returns the decoded value, or -1 when the first byte is not a recognised
 * lead byte or when one of the following bytes is not a continuation byte
 * (10xxxxxx). Only the bit layout is checked here; range and overlong checks
 * are not performed by this function. */
int utf8_encoded_to_unichar(const char *str) {
        const int len = utf8_encoded_expected_len(str);
        int unichar;
        int i;

        if (len < 1 || len > 6)
                return -1;

        if (len == 1)
                return (int) str[0];

        /* The lead byte of an N-byte sequence carries 7 - N payload bits:
         * 0x1f, 0x0f, 0x07, 0x03, 0x01 for N = 2..6. */
        unichar = (int) str[0] & (0xff >> (len + 1));

        for (i = 1; i < len; i++) {
                const int byte = (int) str[i];

                if ((byte & 0xc0) != 0x80)
                        return -1;

                /* Each continuation byte contributes its low six bits. */
                unichar = (unichar << 6) | (byte & 0x3f);
        }

        return unichar;
}
138
7991ac34
DR
/* Check that the first length bytes of str consist only of validly encoded
 * characters, none of which is_unicode_control() classifies as a control
 * character.
 *
 * Returns true if every character passes (trivially true for length == 0),
 * false at the first invalid or control character, and false if the final
 * character would extend past the given length. */
bool utf8_is_printable(const char* str, size_t length) {
        const uint8_t *p;

        assert(str);

        for (p = (const uint8_t*) str; length;) {
                int encoded_len, val;

                encoded_len = utf8_encoded_valid_unichar((const char *)p);

                /* A character claiming more bytes than remain must be
                 * rejected: subtracting it from the unsigned length would
                 * wrap around and the loop would continue past the buffer. */
                if (encoded_len < 0 || (size_t) encoded_len > length)
                        return false;

                val = utf8_encoded_to_unichar((const char*)p);
                if (val < 0 || is_unicode_control(val))
                        return false;

                length -= encoded_len;
                p += encoded_len;
        }

        return true;
}
157
7991ac34
DR
/* Walk the NUL-terminated string str one encoded character at a time.
 *
 * Returns str itself if utf8_encoded_valid_unichar() accepts every character
 * up to the terminating NUL, or NULL as soon as one is rejected. */
const char *utf8_is_valid(const char *str) {
        const char *cursor;

        assert(str);

        cursor = str;
        while (*cursor != '\0') {
                const int step = utf8_encoded_valid_unichar(cursor);

                if (step < 0)
                        return NULL;

                cursor += step;
        }

        return str;
}
174
/* Check that the NUL-terminated string str contains only bytes below 128.
 *
 * Returns str (with const cast away) if so, NULL as soon as a byte with the
 * high bit set is found. */
char *ascii_is_valid(const char *str) {
        const unsigned char *byte;

        assert(str);

        byte = (const unsigned char *) str;
        while (*byte != 0) {
                /* High bit set means the byte is 128 or above. */
                if (*byte & 0x80)
                        return NULL;
                byte++;
        }

        return (char *) str;
}
186
2e3d0692
LP
187char *utf16_to_utf8(const void *s, size_t length) {
188 char *r;
189 const uint8_t *f;
190 uint8_t *t;
191
192 r = new(char, (length*3+1)/2 + 1);
193 if (!r)
194 return NULL;
195
196 t = (uint8_t*) r;
197
198 for (f = s; f < (const uint8_t*) s + length; f += 2) {
199 uint16_t c;
200
201 c = (f[1] << 8) | f[0];
202
203 if (c == 0) {
204 *t = 0;
205 return r;
206 } else if (c < 0x80) {
207 *(t++) = (uint8_t) c;
208 } else if (c < 0x800) {
209 *(t++) = (uint8_t) (0xc0 | (c >> 6));
210 *(t++) = (uint8_t) (0x80 | (c & 0x3f));
211 } else {
212 *(t++) = (uint8_t) (0xe0 | (c >> 12));
213 *(t++) = (uint8_t) (0x80 | ((c >> 6) & 0x3f));
214 *(t++) = (uint8_t) (0x80 | (c & 0x3f));
215 }
216 }
217
218 *t = 0;
2e3d0692 219
7b4d7cc0 220 return r;
2e3d0692 221}
02a36bc9 222
02a36bc9
DR
/* Number of bytes (1..6) needed to encode unichar: the position of the first
 * threshold the value stays below, or 6 if it stays below none of them. */
static int utf8_unichar_to_encoded_len(int unichar) {
        /* Exclusive upper bounds for 1-, 2-, 3-, 4- and 5-byte encodings. */
        static const int upper_bounds[] = {
                0x80, 0x800, 0x10000, 0x200000, 0x4000000,
        };
        int n;

        for (n = 0; n < (int) (sizeof(upper_bounds) / sizeof(upper_bounds[0])); n++)
                if (unichar < upper_bounds[n])
                        return n + 1;

        return 6;
}
237
/* Validate the single encoded character that starts at str.
 *
 * Returns its length in bytes when it is acceptable, -1 otherwise. A
 * character is rejected when its first byte is not a recognised lead byte,
 * when any of its bytes lacks the high bit (which includes hitting the
 * terminating NUL early), when the decoded value would normally be encoded
 * in a different number of bytes, or when is_unicode_valid() refuses the
 * value. */
int utf8_encoded_valid_unichar(const char *str) {
        const int len = utf8_encoded_expected_len(str);
        int unichar;
        int i;

        switch (len) {
        case 0:
                /* not a lead byte */
                return -1;
        case 1:
                /* plain ASCII is always fine */
                return 1;
        default:
                break;
        }

        /* every byte of a multi-byte sequence must have the high bit set */
        for (i = 0; i < len; i++)
                if (!(str[i] & 0x80))
                        return -1;

        unichar = utf8_encoded_to_unichar(str);

        /* the length check runs first, so a failed decode (-1) never
         * reaches the range check */
        if (utf8_unichar_to_encoded_len(unichar) != len ||
            !is_unicode_valid(unichar))
                return -1;

        return len;
}