]> git.ipfire.org Git - thirdparty/binutils-gdb.git/blame - gnulib/import/mbrtowc.c
Move gnulib to top level
[thirdparty/binutils-gdb.git] / gnulib / import / mbrtowc.c
CommitLineData
/* Convert multibyte character to wide character.
   Copyright (C) 1999-2002, 2005-2016 Free Software Foundation, Inc.
   Written by Bruno Haible <bruno@clisp.org>, 2008.

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
17
18#include <config.h>
19
20/* Specification. */
21#include <wchar.h>
22
49e4877c
PA
23#if C_LOCALE_MAYBE_EILSEQ
24# include "hard-locale.h"
25# include <locale.h>
26#endif
27
8690e634
JK
28#if GNULIB_defined_mbstate_t
29/* Implement mbrtowc() on top of mbtowc(). */
30
31# include <errno.h>
32# include <stdlib.h>
33
34# include "localcharset.h"
35# include "streq.h"
36# include "verify.h"
37
38
39verify (sizeof (mbstate_t) >= 4);
40
41static char internal_state[4];
42
43size_t
44mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
45{
46 char *pstate = (char *)ps;
47
48 if (s == NULL)
49 {
50 pwc = NULL;
51 s = "";
52 n = 1;
53 }
54
55 if (n == 0)
56 return (size_t)(-2);
57
58 /* Here n > 0. */
59
60 if (pstate == NULL)
61 pstate = internal_state;
62
63 {
64 size_t nstate = pstate[0];
65 char buf[4];
66 const char *p;
67 size_t m;
68
69 switch (nstate)
70 {
71 case 0:
72 p = s;
73 m = n;
74 break;
75 case 3:
76 buf[2] = pstate[3];
5e8754f9 77 /*FALLTHROUGH*/
8690e634
JK
78 case 2:
79 buf[1] = pstate[2];
5e8754f9 80 /*FALLTHROUGH*/
8690e634
JK
81 case 1:
82 buf[0] = pstate[1];
83 p = buf;
84 m = nstate;
85 buf[m++] = s[0];
86 if (n >= 2 && m < 4)
87 {
88 buf[m++] = s[1];
89 if (n >= 3 && m < 4)
90 buf[m++] = s[2];
91 }
92 break;
93 default:
94 errno = EINVAL;
95 return (size_t)(-1);
96 }
97
98 /* Here m > 0. */
99
100# if __GLIBC__ || defined __UCLIBC__
5e8754f9 101 /* Work around bug <http://sourceware.org/bugzilla/show_bug.cgi?id=9674> */
8690e634
JK
102 mbtowc (NULL, NULL, 0);
103# endif
104 {
105 int res = mbtowc (pwc, p, m);
106
107 if (res >= 0)
108 {
109 if (pwc != NULL && ((*pwc == 0) != (res == 0)))
110 abort ();
111 if (nstate >= (res > 0 ? res : 1))
112 abort ();
113 res -= nstate;
114 pstate[0] = 0;
115 return res;
116 }
117
118 /* mbtowc does not distinguish between invalid and incomplete multibyte
119 sequences. But mbrtowc needs to make this distinction.
120 There are two possible approaches:
121 - Use iconv() and its return value.
122 - Use built-in knowledge about the possible encodings.
123 Given the low quality of implementation of iconv() on the systems that
124 lack mbrtowc(), we use the second approach.
125 The possible encodings are:
126 - 8-bit encodings,
127 - EUC-JP, EUC-KR, GB2312, EUC-TW, BIG5, GB18030, SJIS,
128 - UTF-8.
129 Use specialized code for each. */
130 if (m >= 4 || m >= MB_CUR_MAX)
131 goto invalid;
132 /* Here MB_CUR_MAX > 1 and 0 < m < 4. */
5e8754f9
SDJ
133 {
134 const char *encoding = locale_charset ();
135
136 if (STREQ_OPT (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0))
8690e634
JK
137 {
138 /* Cf. unistr/u8-mblen.c. */
139 unsigned char c = (unsigned char) p[0];
140
141 if (c >= 0xc2)
142 {
143 if (c < 0xe0)
144 {
145 if (m == 1)
146 goto incomplete;
147 }
148 else if (c < 0xf0)
149 {
150 if (m == 1)
151 goto incomplete;
152 if (m == 2)
153 {
154 unsigned char c2 = (unsigned char) p[1];
155
156 if ((c2 ^ 0x80) < 0x40
157 && (c >= 0xe1 || c2 >= 0xa0)
158 && (c != 0xed || c2 < 0xa0))
159 goto incomplete;
160 }
161 }
162 else if (c <= 0xf4)
163 {
164 if (m == 1)
165 goto incomplete;
166 else /* m == 2 || m == 3 */
167 {
168 unsigned char c2 = (unsigned char) p[1];
169
170 if ((c2 ^ 0x80) < 0x40
171 && (c >= 0xf1 || c2 >= 0x90)
172 && (c < 0xf4 || (c == 0xf4 && c2 < 0x90)))
173 {
174 if (m == 2)
175 goto incomplete;
176 else /* m == 3 */
177 {
178 unsigned char c3 = (unsigned char) p[2];
179
180 if ((c3 ^ 0x80) < 0x40)
181 goto incomplete;
182 }
183 }
184 }
185 }
186 }
187 goto invalid;
188 }
189
190 /* As a reference for this code, you can use the GNU libiconv
191 implementation. Look for uses of the RET_TOOFEW macro. */
192
5e8754f9
SDJ
193 if (STREQ_OPT (encoding,
194 "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0))
8690e634
JK
195 {
196 if (m == 1)
197 {
198 unsigned char c = (unsigned char) p[0];
199
200 if ((c >= 0xa1 && c < 0xff) || c == 0x8e || c == 0x8f)
201 goto incomplete;
202 }
203 if (m == 2)
204 {
205 unsigned char c = (unsigned char) p[0];
206
207 if (c == 0x8f)
208 {
209 unsigned char c2 = (unsigned char) p[1];
210
211 if (c2 >= 0xa1 && c2 < 0xff)
212 goto incomplete;
213 }
214 }
215 goto invalid;
216 }
5e8754f9
SDJ
217 if (STREQ_OPT (encoding,
218 "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)
219 || STREQ_OPT (encoding,
220 "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
221 || STREQ_OPT (encoding,
222 "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0))
8690e634
JK
223 {
224 if (m == 1)
225 {
226 unsigned char c = (unsigned char) p[0];
227
228 if (c >= 0xa1 && c < 0xff)
229 goto incomplete;
230 }
231 goto invalid;
232 }
5e8754f9
SDJ
233 if (STREQ_OPT (encoding,
234 "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0))
8690e634
JK
235 {
236 if (m == 1)
237 {
238 unsigned char c = (unsigned char) p[0];
239
240 if ((c >= 0xa1 && c < 0xff) || c == 0x8e)
241 goto incomplete;
242 }
243 else /* m == 2 || m == 3 */
244 {
245 unsigned char c = (unsigned char) p[0];
246
247 if (c == 0x8e)
248 goto incomplete;
249 }
250 goto invalid;
251 }
5e8754f9
SDJ
252 if (STREQ_OPT (encoding,
253 "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
8690e634
JK
254 {
255 if (m == 1)
256 {
257 unsigned char c = (unsigned char) p[0];
258
259 if ((c >= 0x90 && c <= 0xe3) || (c >= 0xf8 && c <= 0xfe))
260 goto incomplete;
261 }
262 else /* m == 2 || m == 3 */
263 {
264 unsigned char c = (unsigned char) p[0];
265
266 if (c >= 0x90 && c <= 0xe3)
267 {
268 unsigned char c2 = (unsigned char) p[1];
269
270 if (c2 >= 0x30 && c2 <= 0x39)
271 {
272 if (m == 2)
273 goto incomplete;
274 else /* m == 3 */
275 {
276 unsigned char c3 = (unsigned char) p[2];
277
278 if (c3 >= 0x81 && c3 <= 0xfe)
279 goto incomplete;
280 }
281 }
282 }
283 }
284 goto invalid;
285 }
5e8754f9 286 if (STREQ_OPT (encoding, "SJIS", 'S', 'J', 'I', 'S', 0, 0, 0, 0, 0))
8690e634
JK
287 {
288 if (m == 1)
289 {
290 unsigned char c = (unsigned char) p[0];
291
292 if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)
293 || (c >= 0xf0 && c <= 0xf9))
294 goto incomplete;
295 }
296 goto invalid;
297 }
298
5e8754f9
SDJ
299 /* An unknown multibyte encoding. */
300 goto incomplete;
301 }
8690e634
JK
302
303 incomplete:
304 {
305 size_t k = nstate;
306 /* Here 0 <= k < m < 4. */
307 pstate[++k] = s[0];
308 if (k < m)
309 {
310 pstate[++k] = s[1];
311 if (k < m)
312 pstate[++k] = s[2];
313 }
314 if (k != m)
315 abort ();
316 }
317 pstate[0] = m;
318 return (size_t)(-2);
319
320 invalid:
321 errno = EILSEQ;
322 /* The conversion state is undefined, says POSIX. */
323 return (size_t)(-1);
324 }
325 }
326}
327
328#else
329/* Override the system's mbrtowc() function. */
330
331# undef mbrtowc
332
/* Wrapper around the system's mbrtowc() that applies the workarounds selected
   by the MBRTOWC_*_BUG and C_LOCALE_MAYBE_EILSEQ macros (presumably set by
   the configure-time checks).

   PWC  receives the wide character; may be NULL, in which case a local
        dummy is used so the code below can always inspect the result.
   S    points to the input bytes; may be NULL.
   N    is the number of bytes available at S.
   PS   is the conversion state; may be NULL.

   Returns what mbrtowc() is specified to return: the number of bytes
   consumed, 0 for the null wide character, (size_t)(-2) for an incomplete
   character, or (size_t)(-1) on error.  */
size_t
rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
{
  size_t ret;
  wchar_t wc;

# if MBRTOWC_NULL_ARG2_BUG || MBRTOWC_RETVAL_BUG || MBRTOWC_EMPTY_INPUT_BUG
  /* Handle S == NULL ourselves: it is equivalent to
     mbrtowc (NULL, "", 1, ps).  */
  if (s == NULL)
    {
      pwc = NULL;
      s = "";
      n = 1;
    }
# endif

# if MBRTOWC_EMPTY_INPUT_BUG
  if (n == 0)
    return (size_t) -2;
# endif

  if (! pwc)
    pwc = &wc;

# if MBRTOWC_RETVAL_BUG
  {
    static mbstate_t internal_state;

    /* Override mbrtowc's internal state.  We cannot call mbsinit() on the
       hidden internal state, but we can call it on our variable.  */
    if (ps == NULL)
      ps = &internal_state;

    if (!mbsinit (ps))
      {
        /* Parse the rest of the multibyte character byte for byte.  */
        size_t count = 0;
        for (; n > 0; s++, n--)
          {
            ret = mbrtowc (&wc, s, 1, ps);

            if (ret == (size_t)(-1))
              return (size_t)(-1);
            count++;
            if (ret != (size_t)(-2))
              {
                /* The multibyte character has been completed.  */
                *pwc = wc;
                return (wc == 0 ? 0 : count);
              }
          }
        return (size_t)(-2);
      }
  }
# endif

  ret = mbrtowc (pwc, s, n, ps);

# if MBRTOWC_NUL_RETVAL_BUG
  /* A successfully converted null wide character must yield 0.  */
  if (ret < (size_t) -2 && !*pwc)
    return 0;
# endif

# if C_LOCALE_MAYBE_EILSEQ
  /* When the LC_CTYPE locale is not "hard" (presumably the "C"/"POSIX"
     locale -- confirm against hard-locale.h), map a failed conversion to
     the single byte at S.  S may still be NULL here when the normalization
     at the top of the function was compiled out, so it must be checked
     before it is dereferenced.  */
  if ((size_t) -2 <= ret && s != NULL && n != 0 && ! hard_locale (LC_CTYPE))
    {
      unsigned char uc = *s;
      *pwc = uc;
      return 1;
    }
# endif

  return ret;
}
406
407#endif