]>
Commit | Line | Data |
---|---|---|
495aee44 CR |
1 | /* unicode.c - functions to convert unicode characters */ |
2 | ||
ac50fbac | 3 | /* Copyright (C) 2010-2012 Free Software Foundation, Inc. |
495aee44 CR |
4 | |
5 | This file is part of GNU Bash, the Bourne Again SHell. | |
6 | ||
7 | Bash is free software: you can redistribute it and/or modify | |
8 | it under the terms of the GNU General Public License as published by | |
9 | the Free Software Foundation, either version 3 of the License, or | |
10 | (at your option) any later version. | |
11 | ||
12 | Bash is distributed in the hope that it will be useful, | |
13 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
15 | GNU General Public License for more details. | |
16 | ||
17 | You should have received a copy of the GNU General Public License | |
18 | along with Bash. If not, see <http://www.gnu.org/licenses/>. | |
19 | */ | |
20 | ||
21 | #include <config.h> | |
22 | ||
23 | #if defined (HANDLE_MULTIBYTE) | |
24 | ||
#include <stdc.h>
#include <wchar.h>
#include <bashansi.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#include <stdio.h>
#include <limits.h>

#if HAVE_ICONV
#  include <iconv.h>
#endif

#if HAVE_NL_LANGINFO
#  include <langinfo.h>
#endif

#include <xmalloc.h>
39 | ||
40 | #ifndef USHORT_MAX | |
41 | # ifdef USHRT_MAX | |
42 | # define USHORT_MAX USHRT_MAX | |
43 | # else | |
44 | # define USHORT_MAX ((unsigned short) ~(unsigned short)0) | |
45 | # endif | |
46 | #endif | |
47 | ||
48 | #if !defined (STREQ) | |
49 | # define STREQ(a, b) ((a)[0] == (b)[0] && strcmp ((a), (b)) == 0) | |
50 | #endif /* !STREQ */ | |
51 | ||
52 | #if defined (HAVE_LOCALE_CHARSET) | |
53 | extern const char *locale_charset __P((void)); | |
54 | #else | |
55 | extern char *get_locale_var __P((char *)); | |
56 | #endif | |
57 | ||
58 | static int u32init = 0; | |
59 | static int utf8locale = 0; | |
60 | #if defined (HAVE_ICONV) | |
61 | static iconv_t localconv; | |
62 | #endif | |
63 | ||
64 | #ifndef HAVE_LOCALE_CHARSET | |
ac50fbac CR |
65 | static char charsetbuf[40]; |
66 | ||
495aee44 CR |
67 | static char * |
68 | stub_charset () | |
69 | { | |
70 | char *locale, *s, *t; | |
71 | ||
72 | locale = get_locale_var ("LC_CTYPE"); | |
73 | if (locale == 0 || *locale == 0) | |
ac50fbac CR |
74 | { |
75 | strcpy (charsetbuf, "ASCII"); | |
76 | return charsetbuf; | |
77 | } | |
495aee44 CR |
78 | s = strrchr (locale, '.'); |
79 | if (s) | |
80 | { | |
e57a256f CR |
81 | strncpy (charsetbuf, s+1, sizeof (charsetbuf) - 1); |
82 | charsetbuf[sizeof (charsetbuf) - 1] = '\0'; | |
ac50fbac | 83 | t = strchr (charsetbuf, '@'); |
495aee44 CR |
84 | if (t) |
85 | *t = 0; | |
ac50fbac | 86 | return charsetbuf; |
495aee44 | 87 | } |
e57a256f CR |
88 | strncpy (charsetbuf, locale, sizeof (charsetbuf) - 1); |
89 | charsetbuf[sizeof (charsetbuf) - 1] = '\0'; | |
ac50fbac | 90 | return charsetbuf; |
495aee44 CR |
91 | } |
92 | #endif | |
93 | ||
ac50fbac CR |
94 | void |
95 | u32reset () | |
96 | { | |
97 | #if defined (HAVE_ICONV) | |
98 | if (u32init && localconv != (iconv_t)-1) | |
99 | { | |
100 | iconv_close (localconv); | |
101 | localconv = (iconv_t)-1; | |
102 | } | |
103 | #endif | |
104 | u32init = 0; | |
105 | utf8locale = 0; | |
106 | } | |
107 | ||
495aee44 CR |
108 | /* u32toascii ? */ |
109 | int | |
ac50fbac CR |
110 | u32tochar (x, s) |
111 | unsigned long x; | |
495aee44 CR |
112 | char *s; |
113 | { | |
495aee44 CR |
114 | int l; |
115 | ||
495aee44 CR |
116 | l = (x <= UCHAR_MAX) ? 1 : ((x <= USHORT_MAX) ? 2 : 4); |
117 | ||
118 | if (x <= UCHAR_MAX) | |
119 | s[0] = x & 0xFF; | |
120 | else if (x <= USHORT_MAX) /* assume unsigned short = 16 bits */ | |
121 | { | |
122 | s[0] = (x >> 8) & 0xFF; | |
123 | s[1] = x & 0xFF; | |
124 | } | |
125 | else | |
126 | { | |
127 | s[0] = (x >> 24) & 0xFF; | |
128 | s[1] = (x >> 16) & 0xFF; | |
129 | s[2] = (x >> 8) & 0xFF; | |
130 | s[3] = x & 0xFF; | |
131 | } | |
132 | s[l] = '\0'; | |
133 | return l; | |
134 | } | |
135 | ||
136 | int | |
ac50fbac CR |
137 | u32tocesc (wc, s) |
138 | u_bits32_t wc; | |
495aee44 CR |
139 | char *s; |
140 | { | |
141 | int l; | |
142 | ||
ac50fbac CR |
143 | if (wc < 0x10000) |
144 | l = sprintf (s, "\\u%04X", wc); | |
145 | else | |
146 | l = sprintf (s, "\\u%08X", wc); | |
147 | return l; | |
148 | } | |
149 | ||
150 | /* Convert unsigned 32-bit int to utf-8 character string */ | |
151 | int | |
152 | u32toutf8 (wc, s) | |
153 | u_bits32_t wc; | |
154 | char *s; | |
155 | { | |
156 | int l; | |
495aee44 CR |
157 | |
158 | if (wc < 0x0080) | |
ac50fbac CR |
159 | { |
160 | s[0] = (char)wc; | |
161 | l = 1; | |
162 | } | |
495aee44 CR |
163 | else if (wc < 0x0800) |
164 | { | |
165 | s[0] = (wc >> 6) | 0xc0; | |
166 | s[1] = (wc & 0x3f) | 0x80; | |
ac50fbac | 167 | l = 2; |
495aee44 | 168 | } |
ac50fbac | 169 | else if (wc < 0x10000) |
495aee44 | 170 | { |
ac50fbac | 171 | /* Technically, we could return 0 here if 0xd800 <= wc <= 0x0dfff */ |
495aee44 CR |
172 | s[0] = (wc >> 12) | 0xe0; |
173 | s[1] = ((wc >> 6) & 0x3f) | 0x80; | |
174 | s[2] = (wc & 0x3f) | 0x80; | |
ac50fbac CR |
175 | l = 3; |
176 | } | |
177 | else if (wc < 0x200000) | |
178 | { | |
179 | s[0] = (wc >> 18) | 0xf0; | |
180 | s[1] = ((wc >> 12) & 0x3f) | 0x80; | |
181 | s[2] = ((wc >> 6) & 0x3f) | 0x80; | |
182 | s[3] = (wc & 0x3f) | 0x80; | |
183 | l = 4; | |
495aee44 | 184 | } |
ac50fbac CR |
185 | /* Strictly speaking, UTF-8 doesn't have characters longer than 4 bytes */ |
186 | else if (wc < 0x04000000) | |
187 | { | |
188 | s[0] = (wc >> 24) | 0xf8; | |
189 | s[1] = ((wc >> 18) & 0x3f) | 0x80; | |
190 | s[2] = ((wc >> 12) & 0x3f) | 0x80; | |
191 | s[3] = ((wc >> 6) & 0x3f) | 0x80; | |
192 | s[4] = (wc & 0x3f) | 0x80; | |
193 | l = 5; | |
194 | } | |
195 | else if (wc < 0x080000000) | |
196 | { | |
197 | s[0] = (wc >> 30) | 0xf8; | |
198 | s[1] = ((wc >> 24) & 0x3f) | 0x80; | |
199 | s[2] = ((wc >> 18) & 0x3f) | 0x80; | |
200 | s[3] = ((wc >> 12) & 0x3f) | 0x80; | |
201 | s[4] = ((wc >> 6) & 0x3f) | 0x80; | |
202 | s[5] = (wc & 0x3f) | 0x80; | |
203 | l = 6; | |
204 | } | |
205 | else | |
206 | l = 0; | |
207 | ||
495aee44 CR |
208 | s[l] = '\0'; |
209 | return l; | |
210 | } | |
211 | ||
ac50fbac CR |
212 | /* Convert a 32-bit unsigned int (unicode) to a UTF-16 string. Rarely used, |
213 | only if sizeof(wchar_t) == 2. */ | |
214 | int | |
215 | u32toutf16 (c, s) | |
216 | u_bits32_t c; | |
217 | unsigned short *s; | |
218 | { | |
219 | int l; | |
220 | ||
221 | l = 0; | |
222 | if (c < 0x0d800) | |
223 | { | |
224 | s[0] = (unsigned short) (c & 0xFFFF); | |
225 | l = 1; | |
226 | } | |
227 | else if (c >= 0x0e000 && c <= 0x010ffff) | |
228 | { | |
229 | c -= 0x010000; | |
230 | s[0] = (unsigned short)((c >> 10) + 0xd800); | |
231 | s[1] = (unsigned short)((c & 0x3ff) + 0xdc00); | |
232 | l = 2; | |
233 | } | |
234 | s[l] = 0; | |
235 | return l; | |
236 | } | |
237 | ||
495aee44 CR |
/* convert a single unicode-32 character into a multibyte string and put the
   result in S, which must be large enough (at least MB_LEN_MAX bytes).
   Returns the number of bytes placed in S.

   The conversion strategies are tried in this order, depending on what
   the build provides:
     1. wctomb()/wcstombs(), when wchar_t holds ISO 10646 values;
     2. direct UTF-8 encoding, when nl_langinfo() reports a UTF-8 codeset;
     3. iconv() from UTF-8 to the locale's character set;
     4. an ISO C99 escape sequence from u32tocesc().

   NOTE(review): the wctomb() path returns without storing a NUL after the
   converted bytes, and the escape-sequence fallback can produce more than
   a handful of bytes -- confirm callers size S and use the return value
   accordingly. */
int
u32cconv (c, s)
     unsigned long c;
     char *s;
{
  wchar_t wc;
  wchar_t ws[3];
  int n;
#if HAVE_NL_LANGINFO
  const char *codeset;
#endif
#if HAVE_ICONV
  const char *charset;
  char obuf[25], *optr;
  size_t obytesleft;
  const char *iptr;
  size_t sn;
#endif

#if __STDC_ISO_10646__
  wc = c;
  if (sizeof (wchar_t) == 4 && c <= 0x7fffffff)
    n = wctomb (s, wc);
  else if (sizeof (wchar_t) == 2 && c <= 0x10ffff && u32toutf16 (c, ws))
    n = wcstombs (s, ws, MB_LEN_MAX);
  else
    n = -1;
  if (n != -1)
    return n;
#endif

#if HAVE_NL_LANGINFO
  codeset = nl_langinfo (CODESET);
  if (STREQ (codeset, "UTF-8"))
    {
      n = u32toutf8 (c, s);
      return n;
    }
#endif

#if HAVE_ICONV
  /* this is mostly from coreutils-8.5/lib/unicodeio.c */
  if (u32init == 0)
    {
      /* Start from "no descriptor open" so that the UTF-8 case below does
	 not leave LOCALCONV holding a value that looks like a live
	 descriptor to the (iconv_t)-1 checks here and in u32reset(). */
      localconv = (iconv_t)-1;
#  if HAVE_LOCALE_CHARSET
      charset = locale_charset ();	/* XXX - fix later */
#  else
      charset = stub_charset ();
#  endif
      if (STREQ (charset, "UTF-8"))
	utf8locale = 1;
      else
	{
	  localconv = iconv_open (charset, "UTF-8");
	  if (localconv == (iconv_t)-1)
	    /* We assume ASCII when presented with an unknown encoding. */
	    localconv = iconv_open ("ASCII", "UTF-8");
	}
      u32init = 1;
    }

  /* If we have a UTF-8 locale, convert to UTF-8 and return converted value. */
  n = u32toutf8 (c, s);
  if (utf8locale)
    return n;

  /* If the conversion is not supported, even the ASCII requested above, we
     bail now.  Currently we return the UTF-8 conversion.  We could return
     u32tocesc(). */
  if (localconv == (iconv_t)-1)
    return n;

  optr = obuf;
  /* Hold back one byte so the NUL stored at *optr after a successful
     conversion always lands inside OBUF. */
  obytesleft = sizeof (obuf) - 1;
  iptr = s;
  sn = n;

  /* Reset the descriptor to its initial shift state. */
  iconv (localconv, NULL, NULL, NULL, NULL);

  if (iconv (localconv, (ICONV_CONST char **)&iptr, &sn, &optr, &obytesleft) == (size_t)-1)
    {
#if 1
      /* You get ISO C99 escape sequences if iconv fails */
      n = u32tocesc (c, s);
#else
      /* You get UTF-8 if iconv fails */
#endif
      return n;
    }

  *optr = '\0';

  /* number of chars to be copied is optr - obuf if we want to do bounds
     checking */
  strcpy (s, obuf);
  return (optr - obuf);
#endif /* HAVE_ICONV */

  n = u32tocesc (c, s);	/* fallback is ISO C99 escape sequences */
  return n;
}
ac50fbac CR |
338 | #else |
/* Version compiled when HANDLE_MULTIBYTE is not defined: there is no
   conversion state in this configuration, so the body is empty. */
void
u32reset ()
{
}
495aee44 | 343 | #endif /* HANDLE_MULTIBYTE */ |