]>
Commit | Line | Data |
---|---|---|
f957edde AK |
1 | /* Conversion between UTF-8 and UTF-32 BE/internal. |
2 | ||
3 | This module uses the Z9-109 variants of the Convert Unicode | |
4 | instructions. | |
d4697bc9 | 5 | Copyright (C) 1997-2014 Free Software Foundation, Inc. |
f957edde AK |
6 | |
7 | Author: Andreas Krebbel <Andreas.Krebbel@de.ibm.com> | |
8 | Based on the work by Ulrich Drepper <drepper@cygnus.com>, 1997. | |
9 | ||
10 | Thanks to Daniel Appich who covered the relevant performance work | |
11 | in his diploma thesis. | |
12 | ||
13 | This is free software; you can redistribute it and/or | |
14 | modify it under the terms of the GNU Lesser General Public | |
15 | License as published by the Free Software Foundation; either | |
16 | version 2.1 of the License, or (at your option) any later version. | |
17 | ||
18 | This is distributed in the hope that it will be useful, | |
19 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
20 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
21 | Lesser General Public License for more details. | |
22 | ||
23 | You should have received a copy of the GNU Lesser General Public | |
59ba27a6 PE |
24 | License along with the GNU C Library; if not, see |
25 | <http://www.gnu.org/licenses/>. */ | |
f957edde AK |
26 | |
27 | #include <dlfcn.h> | |
28 | #include <stdint.h> | |
29 | #include <unistd.h> | |
30 | #include <dl-procinfo.h> | |
31 | #include <gconv.h> | |
32 | ||
/* The byte order mark, written as one UTF-32 big endian code unit.  */
#define BOM 0x0000feffu

/* gconv_init and gconv_end are written out by hand further down in
   this file, so both switches are 0 (presumably this keeps skeleton.c
   from generating its own versions -- confirm against skeleton.c).  */
#define DEFINE_INIT 0
#define DEFINE_FINI 0
/* The FROM direction is UTF-8 to UTF-32.  The software implementation
   still accepts UTF-8 sequences of up to 6 bytes, whereas the hardware
   variant does not.  */
#define MIN_NEEDED_FROM 1
#define MAX_NEEDED_FROM 6
#define MIN_NEEDED_TO 4
#define FROM_LOOP from_utf8_loop
#define TO_LOOP to_utf8_loop
#define FROM_DIRECTION (dir == from_utf8)
/* Read back the settings gconv_init stored in step->__data.  If a BOM
   was requested, the conversion is not for internal use and this is
   the first invocation, write the BOM to the output buffer; return
   __GCONV_FULL_OUTPUT when fewer than 4 bytes are free.  */
#define PREPARE_LOOP \
  enum direction dir = ((struct utf8_data *) step->__data)->dir; \
  int emit_bom = ((struct utf8_data *) step->__data)->emit_bom; \
  \
  if (data->__invocation_counter == 0 && !data->__internal_use \
      && emit_bom) \
    { \
      if (__glibc_unlikely (outend - outbuf < 4)) \
        return __GCONV_FULL_OUTPUT; \
      \
      /* Emit the Byte Order Mark.  */ \
      put32u (outbuf, BOM); \
      outbuf += 4; \
    }
61 | ||
/* Which way a conversion step translates.  illegal_dir marks a
   from/to pair this module does not handle.  */
enum direction
{
  illegal_dir = 0,
  to_utf8 = 1,
  from_utf8 = 2
};
69 | ||
70 | struct utf8_data | |
71 | { | |
72 | enum direction dir; | |
73 | int emit_bom; | |
74 | }; | |
75 | ||
76 | ||
77 | extern int gconv_init (struct __gconv_step *step); | |
78 | int | |
79 | gconv_init (struct __gconv_step *step) | |
80 | { | |
81 | /* Determine which direction. */ | |
82 | struct utf8_data *new_data; | |
83 | enum direction dir = illegal_dir; | |
84 | int emit_bom; | |
85 | int result; | |
86 | ||
87 | emit_bom = (__strcasecmp (step->__to_name, "UTF-32//") == 0); | |
88 | ||
89 | if (__strcasecmp (step->__from_name, "ISO-10646/UTF8/") == 0 | |
90 | && (__strcasecmp (step->__to_name, "UTF-32//") == 0 | |
91 | || __strcasecmp (step->__to_name, "UTF-32BE//") == 0 | |
7c36ced0 | 92 | || __strcasecmp (step->__to_name, "INTERNAL") == 0)) |
f957edde AK |
93 | { |
94 | dir = from_utf8; | |
95 | } | |
96 | else if (__strcasecmp (step->__to_name, "ISO-10646/UTF8/") == 0 | |
97 | && (__strcasecmp (step->__from_name, "UTF-32BE//") == 0 | |
98 | || __strcasecmp (step->__from_name, "INTERNAL") == 0)) | |
99 | { | |
100 | dir = to_utf8; | |
101 | } | |
102 | ||
103 | result = __GCONV_NOCONV; | |
104 | if (dir != illegal_dir) | |
105 | { | |
106 | new_data = (struct utf8_data *) malloc (sizeof (struct utf8_data)); | |
107 | ||
108 | result = __GCONV_NOMEM; | |
109 | if (new_data != NULL) | |
110 | { | |
111 | new_data->dir = dir; | |
112 | new_data->emit_bom = emit_bom; | |
113 | step->__data = new_data; | |
114 | ||
115 | if (dir == from_utf8) | |
116 | { | |
117 | step->__min_needed_from = MIN_NEEDED_FROM; | |
118 | step->__max_needed_from = MIN_NEEDED_FROM; | |
119 | step->__min_needed_to = MIN_NEEDED_TO; | |
120 | step->__max_needed_to = MIN_NEEDED_TO; | |
121 | } | |
122 | else | |
123 | { | |
124 | step->__min_needed_from = MIN_NEEDED_TO; | |
125 | step->__max_needed_from = MIN_NEEDED_TO; | |
126 | step->__min_needed_to = MIN_NEEDED_FROM; | |
127 | step->__max_needed_to = MIN_NEEDED_FROM; | |
128 | } | |
129 | ||
130 | step->__stateful = 0; | |
131 | ||
132 | result = __GCONV_OK; | |
133 | } | |
134 | } | |
135 | ||
136 | return result; | |
137 | } | |
138 | ||
139 | ||
140 | extern void gconv_end (struct __gconv_step *data); | |
141 | void | |
142 | gconv_end (struct __gconv_step *data) | |
143 | { | |
144 | free (data->__data); | |
145 | } | |
146 | ||
/* Hardware conversion step shared by both directions.  INSTRUCTION is
   the assembler text of the convert instruction; its operand %0 is the
   output pointer and %1 the input pointer.

   The macro expects inptr, inend, outptr, outend and result in scope
   and must be expanded inside a loop, because it leaves via `break':
     condition code 1 -> result = __GCONV_FULL_OUTPUT
     condition code 2 -> result = __GCONV_ILLEGAL_INPUT
   Afterwards inptr and outptr are set to the positions the
   instruction stopped at.  */
#define HARDWARE_CONVERT(INSTRUCTION) \
  { \
    /* Pointer and remaining length of each buffer are pinned to the \
       register pairs 8/9 (input) and 10/11 (output).  */ \
    register const unsigned char *src asm ("8") = inptr; \
    register unsigned long long src_left asm ("9") = inend - inptr; \
    register unsigned char *dst asm ("10") = outptr; \
    register unsigned long long dst_left asm ("11") = outend - outptr; \
    uint64_t cc = 0; \
    \
    /* `jo 0b' repeats the instruction for as long as it ends with \
       condition code 3; `ipm' then copies the final condition code \
       into cc.  */ \
    asm volatile (".machine push \n\t" \
                  ".machine \"z9-109\" \n\t" \
                  "0: " INSTRUCTION " \n\t" \
                  ".machine pop \n\t" \
                  " jo 0b \n\t" \
                  " ipm %2 \n" \
                  : "+a" (dst), "+a" (src), "+d" (cc), \
                    "+d" (dst_left), "+d" (src_left) \
                  : \
                  : "cc", "memory"); \
    \
    inptr = src; \
    outptr = dst; \
    /* Move the condition code inserted by ipm down to the low bits.  */ \
    cc >>= 28; \
    \
    if (cc == 1) \
      { \
        result = __GCONV_FULL_OUTPUT; \
        break; \
      } \
    if (cc == 2) \
      { \
        result = __GCONV_ILLEGAL_INPUT; \
        break; \
      } \
  }
183 | ||
/* UTF-8 to UTF-32 internal/BE: loop parameters and loop body.  */

#define MIN_NEEDED_INPUT MIN_NEEDED_FROM
#define MAX_NEEDED_INPUT MAX_NEEDED_FROM
#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
#define LOOPFCT FROM_LOOP
/* One iteration of the conversion loop.  If HWCAP_S390_ETF3EH is set
   in dl_hwcap the cu14 instruction does the conversion; otherwise one
   character is decoded by the software routine, which is copied from
   gconv_simple.c.  */
#define BODY \
  { \
    if (GLRO (dl_hwcap) & HWCAP_S390_ETF3EH) \
      { \
        HARDWARE_CONVERT ("cu14 %0, %1, 1"); \
        \
        if (inptr != inend) \
          { \
            /* Input is left over.  Skip the continuation bytes that \
               follow the byte at inptr.  */ \
            int tail = 1; \
            while (inptr + tail < inend \
                   && (inptr[tail] & 0xc0) == 0x80) \
              ++tail; \
            \
            if (__glibc_likely (inptr + tail == inend)) \
              { \
                /* Nothing but a truncated character remains.  */ \
                result = __GCONV_INCOMPLETE_INPUT; \
                break; \
              } \
            STANDARD_FROM_LOOP_ERR_HANDLER (tail); \
          } \
        continue; \
      } \
    \
    /* Software path: fetch the lead byte.  */ \
    uint32_t ch = *inptr; \
    \
    if (__glibc_likely (ch < 0x80)) \
      { \
        /* A single byte stands for itself.  */ \
        ++inptr; \
      } \
    else \
      { \
        uint_fast32_t cnt; \
        uint_fast32_t i; \
        \
        if (ch >= 0xc2 && ch < 0xe0) \
          { \
            /* Two byte sequence.  Lead bytes 0xc0 and 0xc1 are not \
               accepted because such a character would fit into a \
               single byte.  */ \
            cnt = 2; \
            ch &= 0x1f; \
          } \
        else if (__glibc_likely ((ch & 0xf0) == 0xe0)) \
          { \
            /* Three byte sequence.  */ \
            cnt = 3; \
            ch &= 0x0f; \
          } \
        else if (__glibc_likely ((ch & 0xf8) == 0xf0)) \
          { \
            /* Four byte sequence.  */ \
            cnt = 4; \
            ch &= 0x07; \
          } \
        else if (__glibc_likely ((ch & 0xfc) == 0xf8)) \
          { \
            /* Five byte sequence.  */ \
            cnt = 5; \
            ch &= 0x03; \
          } \
        else if (__glibc_likely ((ch & 0xfe) == 0xfc)) \
          { \
            /* Six byte sequence.  */ \
            cnt = 6; \
            ch &= 0x01; \
          } \
        else \
          { \
            /* Not a valid lead byte.  Find the end of the ill-formed \
               sequence: the next byte with (x & 0xc0) != 0x80, looking \
               at no more than 5 bytes in total.  */ \
            i = 0; \
            do \
              ++i; \
            while (inptr + i < inend \
                   && i < 5 \
                   && (inptr[i] & 0xc0) == 0x80); \
            \
          errout: \
            /* Also reached by goto from below, with I holding the \
               number of bytes to skip.  */ \
            STANDARD_FROM_LOOP_ERR_HANDLER (i); \
          } \
        \
        if (__glibc_unlikely (inptr + cnt > inend)) \
          { \
            /* The sequence is cut off by the end of the input.  Before \
               reporting that, make sure the bytes present are all \
               continuation bytes.  */ \
            i = 1; \
            while (inptr + i < inend && (inptr[i] & 0xc0) == 0x80) \
              ++i; \
            \
            if (__glibc_likely (inptr + i == inend)) \
              { \
                result = __GCONV_INCOMPLETE_INPUT; \
                break; \
              } \
            \
            goto errout; \
          } \
        \
        /* Fold the continuation bytes into the value.  */ \
        for (i = 1; i < cnt; ++i) \
          { \
            uint32_t trail = inptr[i]; \
            \
            if ((trail & 0xc0) != 0x80) \
              /* This is an illegal encoding.  */ \
              break; \
            \
            ch = (ch << 6) | (trail & 0x3f); \
          } \
        \
        /* i < cnt means some trail byte was not in 0x80..0xbf.  If \
           cnt > 2 and ch < 2^(5*cnt-4), the character could have been \
           written with fewer than cnt bytes.  Both cases are illegal \
           encodings.  */ \
        if (i < cnt || (cnt > 2 && (ch >> (5 * cnt - 4)) == 0)) \
          { \
            goto errout; \
          } \
        \
        inptr += cnt; \
      } \
    \
    /* Store the character and advance the output pointer.  */ \
    *((uint32_t *) outptr) = ch; \
    outptr += sizeof (uint32_t); \
  }
#define LOOP_NEED_FLAGS
321 | ||
/* Save an incomplete multibyte character in STATE.  The bytes from
   *inptrp up to inend are consumed and converted into the UCS4 value
   as far as possible.  The first byte is assumed to be correct and to
   require more bytes than the input buffer holds.

   Afterwards state->__count holds the number of bytes seen in its low
   byte and the length of the whole sequence above bit 8, and
   state->__value.__wch holds the partial value, already shifted as if
   the missing bytes contributed zero bits.  */
#define STORE_REST \
  { \
    wint_t ch = **inptrp; \
    size_t total; \
    size_t missing; \
    \
    /* Number of bytes available; must be taken before *inptrp is \
       advanced below.  */ \
    state->__count = inend - *inptrp; \
    \
    if (ch >= 0xc2 && ch < 0xe0) \
      { \
        /* Two byte sequence.  Lead bytes 0xc0 and 0xc1 are not \
           accepted because such a character would fit into a single \
           byte.  */ \
        total = 2; \
        ch &= 0x1f; \
      } \
    else if (__glibc_likely ((ch & 0xf0) == 0xe0)) \
      { \
        /* Three byte sequence.  */ \
        total = 3; \
        ch &= 0x0f; \
      } \
    else if (__glibc_likely ((ch & 0xf8) == 0xf0)) \
      { \
        /* Four byte sequence.  */ \
        total = 4; \
        ch &= 0x07; \
      } \
    else if (__glibc_likely ((ch & 0xfc) == 0xf8)) \
      { \
        /* Five byte sequence.  */ \
        total = 5; \
        ch &= 0x03; \
      } \
    else \
      { \
        /* Six byte sequence.  */ \
        total = 6; \
        ch &= 0x01; \
      } \
    \
    /* The lead byte is accounted for; add the payload of every \
       further byte that is present.  */ \
    missing = total - 1; \
    while (++(*inptrp) < inend) \
      { \
        ch = (ch << 6) | (**inptrp & 0x3f); \
        --missing; \
      } \
    \
    /* Make room for the bytes that have not arrived yet.  */ \
    ch <<= missing * 6; \
    \
    /* Record the length of the entire sequence.  */ \
    state->__count |= total << 8; \
    \
    /* Record the partial value.  */ \
    state->__value.__wch = ch; \
  }
384 | ||
/* Inverse of STORE_REST: rebuild in bytebuf the bytes of the partial
   character saved in STATE and set inlen to the number of bytes that
   had been seen.  The low byte of state->__count is that number, the
   bits above it are the length of the whole sequence.  */
#define UNPACK_BYTES \
  { \
    /* Lead byte markers for sequences of 2 to 6 bytes.  */ \
    static const unsigned char inmask[5] = { 0xc0, 0xe0, 0xf0, 0xf8, 0xfc }; \
    wint_t value = state->__value.__wch; \
    size_t pos = state->__count >> 8; \
    \
    inlen = state->__count & 255; \
    \
    bytebuf[0] = inmask[pos - 2]; \
    \
    /* Walk from the last position of the sequence down to position 1, \
       dropping 6 bits per position; only positions that were really \
       read get a continuation byte.  */ \
    do \
      { \
        --pos; \
        if (pos < inlen) \
          bytebuf[pos] = 0x80 | (value & 0x3f); \
        value >>= 6; \
      } \
    while (pos > 1); \
    \
    /* What is left of the value belongs into the lead byte.  */ \
    bytebuf[0] |= value; \
  }
405 | ||
/* Forget a partial character saved in STATE.  */
#define CLEAR_STATE state->__count = 0
408 | ||
409 | #include <iconv/loop.c> | |
410 | ||
/* UTF-32 internal/BE to UTF-8: loop parameters and loop body.  */

#define MIN_NEEDED_INPUT MIN_NEEDED_TO
#define MIN_NEEDED_OUTPUT MIN_NEEDED_FROM
#define MAX_NEEDED_OUTPUT MAX_NEEDED_FROM
#define LOOPFCT TO_LOOP
/* One iteration of the conversion loop.  If HWCAP_S390_ETF3EH is set
   in dl_hwcap the cu41 instruction does the conversion; input left
   over after it is reported as __GCONV_INCOMPLETE_INPUT.

   Otherwise one 32-bit character is read from inptr and encoded in
   software as 1 to 4 bytes.  UTF-16 surrogates (0xd800..0xdfff) and
   values above 0x10ffff have no well-formed UTF-8 encoding and are
   handed to STANDARD_TO_LOOP_ERR_HANDLER.  When the output buffer is
   too small for a sequence, result is set to __GCONV_FULL_OUTPUT and
   the loop is left without consuming the character.  */
#define BODY \
  { \
    if (GLRO (dl_hwcap) & HWCAP_S390_ETF3EH) \
      { \
        HARDWARE_CONVERT ("cu41 %0, %1"); \
        \
        if (inptr != inend) \
          { \
            result = __GCONV_INCOMPLETE_INPUT; \
            break; \
          } \
        continue; \
      } \
    \
    uint32_t wc = *((const uint32_t *) inptr); \
    \
    if (__glibc_likely (wc <= 0x7f)) \
      { \
        /* Single UTF-8 char.  No space check here: MIN_NEEDED_OUTPUT \
           is 1.  */ \
        *outptr = (uint8_t) wc; \
        outptr++; \
      } \
    else if (wc <= 0x7ff) \
      { \
        /* Two UTF-8 chars.  */ \
        if (__glibc_unlikely (outptr + 2 > outend)) \
          { \
            /* Overflow in the output buffer.  */ \
            result = __GCONV_FULL_OUTPUT; \
            break; \
          } \
        \
        outptr[0] = 0xc0; \
        outptr[0] |= wc >> 6; \
        \
        outptr[1] = 0x80; \
        outptr[1] |= wc & 0x3f; \
        \
        outptr += 2; \
      } \
    else if (__glibc_unlikely (wc >= 0xd800 && wc <= 0xdfff)) \
      { \
        /* UTF-16 surrogates are not characters and must not be \
           encoded as UTF-8.  The handler either leaves the loop or \
           skips the 4 input bytes and continues, so the increment at \
           the bottom is never applied a second time.  */ \
        STANDARD_TO_LOOP_ERR_HANDLER (4); \
      } \
    else if (wc <= 0xffff) \
      { \
        /* Three UTF-8 chars.  */ \
        if (__glibc_unlikely (outptr + 3 > outend)) \
          { \
            /* Overflow in the output buffer.  */ \
            result = __GCONV_FULL_OUTPUT; \
            break; \
          } \
        outptr[0] = 0xe0; \
        outptr[0] |= wc >> 12; \
        \
        outptr[1] = 0x80; \
        outptr[1] |= (wc >> 6) & 0x3f; \
        \
        outptr[2] = 0x80; \
        outptr[2] |= wc & 0x3f; \
        \
        outptr += 3; \
      } \
    else if (wc <= 0x10ffff) \
      { \
        /* Four UTF-8 chars.  */ \
        if (__glibc_unlikely (outptr + 4 > outend)) \
          { \
            /* Overflow in the output buffer.  */ \
            result = __GCONV_FULL_OUTPUT; \
            break; \
          } \
        outptr[0] = 0xf0; \
        outptr[0] |= wc >> 18; \
        \
        outptr[1] = 0x80; \
        outptr[1] |= (wc >> 12) & 0x3f; \
        \
        outptr[2] = 0x80; \
        outptr[2] |= (wc >> 6) & 0x3f; \
        \
        outptr[3] = 0x80; \
        outptr[3] |= wc & 0x3f; \
        \
        outptr += 4; \
      } \
    else \
      { \
        /* Beyond the Unicode range.  */ \
        STANDARD_TO_LOOP_ERR_HANDLER (4); \
      } \
    inptr += 4; \
  }
#define LOOP_NEED_FLAGS
508 | #include <iconv/loop.c> | |
509 | ||
510 | #include <iconv/skeleton.c> |