]> git.ipfire.org Git - thirdparty/bash.git/blame - lib/sh/unicode.c
Bash-4.3 patch 35
[thirdparty/bash.git] / lib / sh / unicode.c
CommitLineData
495aee44
CR
1/* unicode.c - functions to convert unicode characters */
2
ac50fbac 3/* Copyright (C) 2010-2012 Free Software Foundation, Inc.
495aee44
CR
4
5 This file is part of GNU Bash, the Bourne Again SHell.
6
7 Bash is free software: you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation, either version 3 of the License, or
10 (at your option) any later version.
11
12 Bash is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with Bash. If not, see <http://www.gnu.org/licenses/>.
19*/
20
21#include <config.h>
22
23#if defined (HANDLE_MULTIBYTE)
24
25#include <stdc.h>
26#include <wchar.h>
27#include <bashansi.h>
28#ifdef HAVE_UNISTD_H
29#include <unistd.h>
30#endif
ac50fbac 31#include <stdio.h>
495aee44
CR
32#include <limits.h>
33
34#if HAVE_ICONV
35# include <iconv.h>
36#endif
37
38#include <xmalloc.h>
39
40#ifndef USHORT_MAX
41# ifdef USHRT_MAX
42# define USHORT_MAX USHRT_MAX
43# else
44# define USHORT_MAX ((unsigned short) ~(unsigned short)0)
45# endif
46#endif
47
48#if !defined (STREQ)
49# define STREQ(a, b) ((a)[0] == (b)[0] && strcmp ((a), (b)) == 0)
50#endif /* !STREQ */
51
52#if defined (HAVE_LOCALE_CHARSET)
53extern const char *locale_charset __P((void));
54#else
55extern char *get_locale_var __P((char *));
56#endif
57
/* Nonzero once u32cconv has done its one-time charset lookup / iconv setup;
   cleared again by u32reset. */
58static int u32init = 0;
/* Set to 1 by u32cconv when the charset name it looked up is "UTF-8";
   cleared by u32reset. */
59static int utf8locale = 0;
60#if defined (HAVE_ICONV)
/* iconv descriptor opened by u32cconv for converting from UTF-8 to the
   locale's charset (or to ASCII when that charset is not supported);
   (iconv_t)-1 means no usable converter. */
61static iconv_t localconv;
62#endif
63
64#ifndef HAVE_LOCALE_CHARSET
ac50fbac
CR
65static char charsetbuf[40];
66
495aee44
CR
67static char *
68stub_charset ()
69{
70 char *locale, *s, *t;
71
72 locale = get_locale_var ("LC_CTYPE");
73 if (locale == 0 || *locale == 0)
ac50fbac
CR
74 {
75 strcpy (charsetbuf, "ASCII");
76 return charsetbuf;
77 }
495aee44
CR
78 s = strrchr (locale, '.');
79 if (s)
80 {
e57a256f
CR
81 strncpy (charsetbuf, s+1, sizeof (charsetbuf) - 1);
82 charsetbuf[sizeof (charsetbuf) - 1] = '\0';
ac50fbac 83 t = strchr (charsetbuf, '@');
495aee44
CR
84 if (t)
85 *t = 0;
ac50fbac 86 return charsetbuf;
495aee44 87 }
e57a256f
CR
88 strncpy (charsetbuf, locale, sizeof (charsetbuf) - 1);
89 charsetbuf[sizeof (charsetbuf) - 1] = '\0';
ac50fbac 90 return charsetbuf;
495aee44
CR
91}
92#endif
93
ac50fbac
CR
94void
95u32reset ()
96{
97#if defined (HAVE_ICONV)
98 if (u32init && localconv != (iconv_t)-1)
99 {
100 iconv_close (localconv);
101 localconv = (iconv_t)-1;
102 }
103#endif
104 u32init = 0;
105 utf8locale = 0;
106}
107
495aee44
CR
108/* u32toascii ? */
/* Store the value X in S as 1, 2, or 4 raw bytes, most significant byte
   first, followed by a NUL byte.  One byte is used for values up to 0xFF,
   two for values up to 0xFFFF, and four otherwise (only the low 32 bits
   of X are stored).  Returns the number of bytes stored, not counting the
   terminating NUL; S must have room for at least 5 bytes.  Note that the
   stored bytes can themselves be zero (e.g. 0x0100 is stored as 01 00). */
int
u32tochar (x, s)
     unsigned long x;
     char *s;
{
  int l;

  /* The limits are written as literal 8- and 16-bit maxima because the
     stores below are hard-wired to those widths; they must not vary with
     the platform's idea of unsigned char/unsigned short. */
  if (x <= 0xFF)
    {
      s[0] = (char) (x & 0xFF);
      l = 1;
    }
  else if (x <= 0xFFFF)
    {
      s[0] = (char) ((x >> 8) & 0xFF);
      s[1] = (char) (x & 0xFF);
      l = 2;
    }
  else
    {
      s[0] = (char) ((x >> 24) & 0xFF);
      s[1] = (char) ((x >> 16) & 0xFF);
      s[2] = (char) ((x >> 8) & 0xFF);
      s[3] = (char) (x & 0xFF);
      l = 4;
    }
  s[l] = '\0';
  return l;
}
135
136int
ac50fbac
CR
137u32tocesc (wc, s)
138 u_bits32_t wc;
495aee44
CR
139 char *s;
140{
141 int l;
142
ac50fbac
CR
143 if (wc < 0x10000)
144 l = sprintf (s, "\\u%04X", wc);
145 else
146 l = sprintf (s, "\\u%08X", wc);
147 return l;
148}
149
150/* Convert unsigned 32-bit int to utf-8 character string */
151int
152u32toutf8 (wc, s)
153 u_bits32_t wc;
154 char *s;
155{
156 int l;
495aee44
CR
157
158 if (wc < 0x0080)
ac50fbac
CR
159 {
160 s[0] = (char)wc;
161 l = 1;
162 }
495aee44
CR
163 else if (wc < 0x0800)
164 {
165 s[0] = (wc >> 6) | 0xc0;
166 s[1] = (wc & 0x3f) | 0x80;
ac50fbac 167 l = 2;
495aee44 168 }
ac50fbac 169 else if (wc < 0x10000)
495aee44 170 {
ac50fbac 171 /* Technically, we could return 0 here if 0xd800 <= wc <= 0x0dfff */
495aee44
CR
172 s[0] = (wc >> 12) | 0xe0;
173 s[1] = ((wc >> 6) & 0x3f) | 0x80;
174 s[2] = (wc & 0x3f) | 0x80;
ac50fbac
CR
175 l = 3;
176 }
177 else if (wc < 0x200000)
178 {
179 s[0] = (wc >> 18) | 0xf0;
180 s[1] = ((wc >> 12) & 0x3f) | 0x80;
181 s[2] = ((wc >> 6) & 0x3f) | 0x80;
182 s[3] = (wc & 0x3f) | 0x80;
183 l = 4;
495aee44 184 }
ac50fbac
CR
185 /* Strictly speaking, UTF-8 doesn't have characters longer than 4 bytes */
186 else if (wc < 0x04000000)
187 {
188 s[0] = (wc >> 24) | 0xf8;
189 s[1] = ((wc >> 18) & 0x3f) | 0x80;
190 s[2] = ((wc >> 12) & 0x3f) | 0x80;
191 s[3] = ((wc >> 6) & 0x3f) | 0x80;
192 s[4] = (wc & 0x3f) | 0x80;
193 l = 5;
194 }
195 else if (wc < 0x080000000)
196 {
197 s[0] = (wc >> 30) | 0xf8;
198 s[1] = ((wc >> 24) & 0x3f) | 0x80;
199 s[2] = ((wc >> 18) & 0x3f) | 0x80;
200 s[3] = ((wc >> 12) & 0x3f) | 0x80;
201 s[4] = ((wc >> 6) & 0x3f) | 0x80;
202 s[5] = (wc & 0x3f) | 0x80;
203 l = 6;
204 }
205 else
206 l = 0;
207
495aee44
CR
208 s[l] = '\0';
209 return l;
210}
211
ac50fbac
CR
212/* Convert a 32-bit unsigned int (unicode) to a UTF-16 string. Rarely used,
213 only if sizeof(wchar_t) == 2. */
214int
215u32toutf16 (c, s)
216 u_bits32_t c;
217 unsigned short *s;
218{
219 int l;
220
221 l = 0;
222 if (c < 0x0d800)
223 {
224 s[0] = (unsigned short) (c & 0xFFFF);
225 l = 1;
226 }
227 else if (c >= 0x0e000 && c <= 0x010ffff)
228 {
229 c -= 0x010000;
230 s[0] = (unsigned short)((c >> 10) + 0xd800);
231 s[1] = (unsigned short)((c & 0x3ff) + 0xdc00);
232 l = 2;
233 }
234 s[l] = 0;
235 return l;
236}
237
495aee44
CR
238/* convert a single unicode-32 character into a multibyte string and put the
239 result in S, which must be large enough (at least MB_LEN_MAX bytes) */
240int
241u32cconv (c, s)
242 unsigned long c;
243 char *s;
244{
245 wchar_t wc;
ac50fbac 246 wchar_t ws[3];
495aee44
CR
247 int n;
248#if HAVE_ICONV
249 const char *charset;
250 char obuf[25], *optr;
251 size_t obytesleft;
252 const char *iptr;
253 size_t sn;
254#endif
255
495aee44 256#if __STDC_ISO_10646__
ac50fbac
CR
257 wc = c;
258 if (sizeof (wchar_t) == 4 && c <= 0x7fffffff)
259 n = wctomb (s, wc);
260 else if (sizeof (wchar_t) == 2 && c <= 0x10ffff && u32toutf16 (c, ws))
261 n = wcstombs (s, ws, MB_LEN_MAX);
262 else
263 n = -1;
264 if (n != -1)
265 return n;
495aee44
CR
266#endif
267
268#if HAVE_NL_LANGINFO
269 codeset = nl_langinfo (CODESET);
270 if (STREQ (codeset, "UTF-8"))
271 {
ac50fbac 272 n = u32toutf8 (c, s);
495aee44
CR
273 return n;
274 }
275#endif
276
277#if HAVE_ICONV
278 /* this is mostly from coreutils-8.5/lib/unicodeio.c */
279 if (u32init == 0)
280 {
281# if HAVE_LOCALE_CHARSET
282 charset = locale_charset (); /* XXX - fix later */
283# else
284 charset = stub_charset ();
285# endif
286 if (STREQ (charset, "UTF-8"))
287 utf8locale = 1;
288 else
289 {
290 localconv = iconv_open (charset, "UTF-8");
291 if (localconv == (iconv_t)-1)
ac50fbac
CR
292 /* We assume ASCII when presented with an unknown encoding. */
293 localconv = iconv_open ("ASCII", "UTF-8");
495aee44
CR
294 }
295 u32init = 1;
296 }
297
ac50fbac
CR
298 /* If we have a UTF-8 locale, convert to UTF-8 and return converted value. */
299 n = u32toutf8 (c, s);
495aee44 300 if (utf8locale)
ac50fbac 301 return n;
495aee44 302
ac50fbac
CR
303 /* If the conversion is not supported, even the ASCII requested above, we
304 bail now. Currently we return the UTF-8 conversion. We could return
305 u32tocesc(). */
495aee44 306 if (localconv == (iconv_t)-1)
ac50fbac
CR
307 return n;
308
495aee44
CR
309 optr = obuf;
310 obytesleft = sizeof (obuf);
311 iptr = s;
312 sn = n;
313
314 iconv (localconv, NULL, NULL, NULL, NULL);
315
316 if (iconv (localconv, (ICONV_CONST char **)&iptr, &sn, &optr, &obytesleft) == (size_t)-1)
ac50fbac
CR
317 {
318#if 1
319 /* You get ISO C99 escape sequences if iconv fails */
320 n = u32tocesc (c, s);
321#else
322 /* You get UTF-8 if iconv fails */
323#endif
324 return n;
325 }
495aee44
CR
326
327 *optr = '\0';
328
329 /* number of chars to be copied is optr - obuf if we want to do bounds
330 checking */
331 strcpy (s, obuf);
332 return (optr - obuf);
ac50fbac 333#endif /* HAVE_ICONV */
495aee44 334
ac50fbac 335 n = u32tocesc (c, s); /* fallback is ISO C99 escape sequences */
495aee44
CR
336 return n;
337}
ac50fbac
CR
338#else
/* Version of u32reset for builds without HANDLE_MULTIBYTE: no conversion
   state exists in that configuration, so there is nothing to reset. */
void
u32reset ()
{
  return;
}
495aee44 343#endif /* HANDLE_MULTIBYTE */