]>
Commit | Line | Data |
---|---|---|
f957edde AK |
1 | /* Conversion between UTF-16 and UTF-32 BE/internal. |
2 | ||
3 | This module uses the Z9-109 variants of the Convert Unicode | |
4 | instructions. | |
d4697bc9 | 5 | Copyright (C) 1997-2014 Free Software Foundation, Inc. |
f957edde AK |
6 | |
7 | Author: Andreas Krebbel <Andreas.Krebbel@de.ibm.com> | |
8 | Based on the work by Ulrich Drepper <drepper@cygnus.com>, 1997. | |
9 | ||
10 | Thanks to Daniel Appich who covered the relevant performance work | |
11 | in his diploma thesis. | |
12 | ||
13 | This is free software; you can redistribute it and/or | |
14 | modify it under the terms of the GNU Lesser General Public | |
15 | License as published by the Free Software Foundation; either | |
16 | version 2.1 of the License, or (at your option) any later version. | |
17 | ||
18 | This is distributed in the hope that it will be useful, | |
19 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
20 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
21 | Lesser General Public License for more details. | |
22 | ||
23 | You should have received a copy of the GNU Lesser General Public | |
59ba27a6 PE |
24 | License along with the GNU C Library; if not, see |
25 | <http://www.gnu.org/licenses/>. */ | |
f957edde AK |
26 | |
27 | #include <dlfcn.h> | |
28 | #include <stdint.h> | |
29 | #include <unistd.h> | |
30 | #include <dl-procinfo.h> | |
31 | #include <gconv.h> | |
32 | ||
33 | /* UTF-32 big endian byte order mark. */ | |
34 | #define BOM_UTF32 0x0000feffu | |
35 | ||
36 | /* UTF-16 big endian byte order mark. */ | |
37 | #define BOM_UTF16 0xfeff | |
38 | ||
39 | #define DEFINE_INIT 0 | |
40 | #define DEFINE_FINI 0 | |
41 | #define MIN_NEEDED_FROM 2 | |
42 | #define MAX_NEEDED_FROM 4 | |
43 | #define MIN_NEEDED_TO 4 | |
44 | #define FROM_LOOP from_utf16_loop | |
45 | #define TO_LOOP to_utf16_loop | |
46 | #define FROM_DIRECTION (dir == from_utf16) | |
f349489e | 47 | #define ONE_DIRECTION 0 |
f957edde AK |
/* Executed once per conversion call before the loops run.  Fetches the
   per-step settings stored by gconv_init and, on the very first
   invocation of an external (non-internal) conversion that asked for
   it, writes the byte order mark of the target encoding.  Returns
   __GCONV_FULL_OUTPUT from the enclosing function if the mark does not
   fit into the output buffer.  */
#define PREPARE_LOOP \
  enum direction dir = ((struct utf16_data *) step->__data)->dir;	\
  int emit_bom = ((struct utf16_data *) step->__data)->emit_bom;	\
									\
  if (data->__invocation_counter == 0					\
      && !data->__internal_use						\
      && emit_bom)							\
    {									\
      if (dir != to_utf16)						\
	{								\
	  /* Target is UTF-32: the mark occupies four bytes.  */	\
	  if (__glibc_unlikely (outbuf + 4 > outend))			\
	    return __GCONV_FULL_OUTPUT;					\
									\
	  put32u (outbuf, BOM_UTF32);					\
	  outbuf += 4;							\
	}								\
      else								\
	{								\
	  /* Target is UTF-16: the mark occupies two bytes.  */		\
	  if (__glibc_unlikely (outbuf + 2 > outend))			\
	    return __GCONV_FULL_OUTPUT;					\
									\
	  put16u (outbuf, BOM_UTF16);					\
	  outbuf += 2;							\
	}								\
    }
74 | ||
/* Which way a conversion step transforms its input.  */
enum direction
{
  /* No supported from/to pair was recognised; gconv_init starts with
     this value.  */
  illegal_dir,
  /* UTF-32BE or INTERNAL input, UTF-16 output.  */
  to_utf16,
  /* UTF-16BE input, UTF-32 or INTERNAL output.  */
  from_utf16
};
82 | ||
83 | struct utf16_data | |
84 | { | |
85 | enum direction dir; | |
86 | int emit_bom; | |
87 | }; | |
88 | ||
89 | ||
90 | extern int gconv_init (struct __gconv_step *step); | |
91 | int | |
92 | gconv_init (struct __gconv_step *step) | |
93 | { | |
94 | /* Determine which direction. */ | |
95 | struct utf16_data *new_data; | |
96 | enum direction dir = illegal_dir; | |
97 | int emit_bom; | |
98 | int result; | |
99 | ||
100 | emit_bom = (__strcasecmp (step->__to_name, "UTF-32//") == 0 | |
101 | || __strcasecmp (step->__to_name, "UTF-16//") == 0); | |
102 | ||
103 | if (__strcasecmp (step->__from_name, "UTF-16BE//") == 0 | |
104 | && (__strcasecmp (step->__to_name, "UTF-32//") == 0 | |
105 | || __strcasecmp (step->__to_name, "UTF-32BE//") == 0 | |
89749d19 | 106 | || __strcasecmp (step->__to_name, "INTERNAL") == 0)) |
f957edde AK |
107 | { |
108 | dir = from_utf16; | |
109 | } | |
110 | else if ((__strcasecmp (step->__to_name, "UTF-16//") == 0 | |
111 | || __strcasecmp (step->__to_name, "UTF-16BE//") == 0) | |
112 | && (__strcasecmp (step->__from_name, "UTF-32BE//") == 0 | |
113 | || __strcasecmp (step->__from_name, "INTERNAL") == 0)) | |
114 | { | |
115 | dir = to_utf16; | |
116 | } | |
117 | ||
118 | result = __GCONV_NOCONV; | |
119 | if (dir != illegal_dir) | |
120 | { | |
121 | new_data = (struct utf16_data *) malloc (sizeof (struct utf16_data)); | |
122 | ||
123 | result = __GCONV_NOMEM; | |
124 | if (new_data != NULL) | |
125 | { | |
126 | new_data->dir = dir; | |
127 | new_data->emit_bom = emit_bom; | |
128 | step->__data = new_data; | |
129 | ||
130 | if (dir == from_utf16) | |
131 | { | |
132 | step->__min_needed_from = MIN_NEEDED_FROM; | |
133 | step->__max_needed_from = MIN_NEEDED_FROM; | |
134 | step->__min_needed_to = MIN_NEEDED_TO; | |
135 | step->__max_needed_to = MIN_NEEDED_TO; | |
136 | } | |
137 | else | |
138 | { | |
139 | step->__min_needed_from = MIN_NEEDED_TO; | |
140 | step->__max_needed_from = MIN_NEEDED_TO; | |
141 | step->__min_needed_to = MIN_NEEDED_FROM; | |
142 | step->__max_needed_to = MIN_NEEDED_FROM; | |
143 | } | |
144 | ||
145 | step->__stateful = 0; | |
146 | ||
147 | result = __GCONV_OK; | |
148 | } | |
149 | } | |
150 | ||
151 | return result; | |
152 | } | |
153 | ||
154 | ||
155 | extern void gconv_end (struct __gconv_step *data); | |
156 | void | |
157 | gconv_end (struct __gconv_step *data) | |
158 | { | |
159 | free (data->__data); | |
160 | } | |
161 | ||
/* Hardware conversion loop shared by both directions.  INSTRUCTION is
   the assembler text of the Convert Unicode instruction to run; it
   refers to the output address as %0 and the input address as %1.

   The macro expects inptr, inend, outptr, outend and result to be in
   scope and must be expanded inside a loop: when the instruction
   reports a full output buffer or invalid input, result is set and the
   enclosing loop is left with `break'.  In every case inptr and outptr
   are advanced to where the instruction stopped.  */
#define HARDWARE_CONVERT(INSTRUCTION)					\
  {									\
    /* Addresses and remaining lengths are pinned to the register	\
       pairs 8/9 (input) and 10/11 (output).  */			\
    register const unsigned char* src_addr asm ("8") = inptr;		\
    register unsigned long long src_left asm ("9") = inend - inptr;	\
    register unsigned char* dst_addr asm ("10") = outptr;		\
    register unsigned long long dst_left asm("11") = outend - outptr;	\
    uint64_t cond = 0;							\
									\
    /* `jo 0b' re-runs the instruction while it reports condition	\
       code 3; `ipm' then copies the final condition code into		\
       COND.  */							\
    asm volatile (".machine push       \n\t"				\
		  ".machine \"z9-109\" \n\t"				\
		  "0: " INSTRUCTION "  \n\t"				\
		  ".machine pop        \n\t"				\
		  "   jo     0b        \n\t"				\
		  "   ipm    %2        \n"				\
		  : "+a" (dst_addr), "+a" (src_addr), "+d" (cond),	\
		    "+d" (dst_left), "+d" (src_left)			\
		  :							\
		  : "cc", "memory");					\
									\
    outptr = dst_addr;							\
    inptr = src_addr;							\
    /* Move the condition code down into the low bits.  */		\
    cond >>= 28;							\
									\
    if (cond == 1)							\
      {									\
	/* The output buffer is exhausted.  */				\
	result = __GCONV_FULL_OUTPUT;					\
	break;								\
      }									\
    if (cond == 2)							\
      {									\
	/* The instruction rejected the input.  */			\
	result = __GCONV_ILLEGAL_INPUT;					\
	break;								\
      }									\
  }
198 | ||
199 | /* Conversion function from UTF-16 to UTF-32 internal/BE. */ | |
200 | ||
201 | #define MIN_NEEDED_INPUT MIN_NEEDED_FROM | |
202 | #define MAX_NEEDED_INPUT MAX_NEEDED_FROM | |
203 | #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO | |
204 | #define LOOPFCT FROM_LOOP | |
205 | /* The software routine is copied from utf-16.c (minus bytes | |
206 | swapping). */ | |
207 | #define BODY \ | |
208 | { \ | |
a3dc4658 AK |
209 | /* The hardware instruction currently fails to report an error for \ |
210 | isolated low surrogates so we have to disable the instruction \ | |
211 | until this gets resolved. */ \ | |
212 | if (0) /* (GLRO (dl_hwcap) & HWCAP_S390_ETF3EH) */ \ | |
f957edde AK |
213 | { \ |
214 | HARDWARE_CONVERT ("cu24 %0, %1, 1"); \ | |
215 | if (inptr != inend) \ | |
216 | { \ | |
217 | /* Check if the third byte is \ | |
218 | a valid start of a UTF-16 surrogate. */ \ | |
219 | if (inend - inptr == 3 && (inptr[3] & 0xfc) != 0xdc) \ | |
220 | STANDARD_FROM_LOOP_ERR_HANDLER (3); \ | |
221 | \ | |
222 | result = __GCONV_INCOMPLETE_INPUT; \ | |
223 | break; \ | |
224 | } \ | |
225 | continue; \ | |
226 | } \ | |
227 | \ | |
228 | uint16_t u1 = get16 (inptr); \ | |
229 | \ | |
230 | if (__builtin_expect (u1 < 0xd800, 1) || u1 > 0xdfff) \ | |
231 | { \ | |
232 | /* No surrogate. */ \ | |
233 | put32 (outptr, u1); \ | |
234 | inptr += 2; \ | |
235 | } \ | |
236 | else \ | |
237 | { \ | |
a3dc4658 AK |
238 | /* An isolated low-surrogate was found. This has to be \ |
239 | considered ill-formed. */ \ | |
a1ffb40e | 240 | if (__glibc_unlikely (u1 >= 0xdc00)) \ |
a3dc4658 AK |
241 | { \ |
242 | STANDARD_FROM_LOOP_ERR_HANDLER (2); \ | |
243 | } \ | |
f957edde AK |
244 | /* It's a surrogate character. At least the first word says \ |
245 | it is. */ \ | |
a1ffb40e | 246 | if (__glibc_unlikely (inptr + 4 > inend)) \ |
f957edde AK |
247 | { \ |
248 | /* We don't have enough input for another complete input \ | |
249 | character. */ \ | |
250 | result = __GCONV_INCOMPLETE_INPUT; \ | |
251 | break; \ | |
252 | } \ | |
253 | \ | |
254 | inptr += 2; \ | |
255 | uint16_t u2 = get16 (inptr); \ | |
256 | if (__builtin_expect (u2 < 0xdc00, 0) \ | |
257 | || __builtin_expect (u2 > 0xdfff, 0)) \ | |
258 | { \ | |
259 | /* This is no valid second word for a surrogate. */ \ | |
260 | inptr -= 2; \ | |
261 | STANDARD_FROM_LOOP_ERR_HANDLER (2); \ | |
262 | } \ | |
263 | \ | |
264 | put32 (outptr, ((u1 - 0xd7c0) << 10) + (u2 - 0xdc00)); \ | |
265 | inptr += 2; \ | |
266 | } \ | |
267 | outptr += 4; \ | |
268 | } | |
269 | #define LOOP_NEED_FLAGS | |
270 | #include <iconv/loop.c> | |
271 | ||
272 | /* Conversion from UTF-32 internal/BE to UTF-16. */ | |
273 | ||
274 | #define MIN_NEEDED_INPUT MIN_NEEDED_TO | |
275 | #define MIN_NEEDED_OUTPUT MIN_NEEDED_FROM | |
276 | #define MAX_NEEDED_OUTPUT MAX_NEEDED_FROM | |
277 | #define LOOPFCT TO_LOOP | |
278 | /* The software routine is copied from utf-16.c (minus bytes | |
279 | swapping). */ | |
280 | #define BODY \ | |
281 | { \ | |
282 | if (GLRO (dl_hwcap) & HWCAP_S390_ETF3EH) \ | |
283 | { \ | |
284 | HARDWARE_CONVERT ("cu42 %0, %1"); \ | |
285 | \ | |
286 | if (inptr != inend) \ | |
287 | { \ | |
288 | result = __GCONV_INCOMPLETE_INPUT; \ | |
289 | break; \ | |
290 | } \ | |
291 | continue; \ | |
292 | } \ | |
293 | \ | |
294 | uint32_t c = get32 (inptr); \ | |
295 | \ | |
296 | if (__builtin_expect (c <= 0xd7ff, 1) \ | |
297 | || (c >=0xdc00 && c <= 0xffff)) \ | |
298 | { \ | |
299 | /* Two UTF-16 chars. */ \ | |
300 | put16 (outptr, c); \ | |
301 | } \ | |
302 | else if (__builtin_expect (c >= 0x10000, 1) \ | |
303 | && __builtin_expect (c <= 0x10ffff, 1)) \ | |
304 | { \ | |
305 | /* Four UTF-16 chars. */ \ | |
306 | uint16_t zabcd = ((c & 0x1f0000) >> 16) - 1; \ | |
307 | uint16_t out; \ | |
308 | \ | |
309 | /* Generate a surrogate character. */ \ | |
a1ffb40e | 310 | if (__glibc_unlikely (outptr + 4 > outend)) \ |
f957edde AK |
311 | { \ |
312 | /* Overflow in the output buffer. */ \ | |
313 | result = __GCONV_FULL_OUTPUT; \ | |
314 | break; \ | |
315 | } \ | |
316 | \ | |
317 | out = 0xd800; \ | |
318 | out |= (zabcd & 0xff) << 6; \ | |
319 | out |= (c >> 10) & 0x3f; \ | |
320 | put16 (outptr, out); \ | |
321 | outptr += 2; \ | |
322 | \ | |
323 | out = 0xdc00; \ | |
324 | out |= c & 0x3ff; \ | |
325 | put16 (outptr, out); \ | |
326 | } \ | |
327 | else \ | |
328 | { \ | |
329 | STANDARD_TO_LOOP_ERR_HANDLER (4); \ | |
330 | } \ | |
331 | outptr += 2; \ | |
332 | inptr += 4; \ | |
333 | } | |
334 | #define LOOP_NEED_FLAGS | |
335 | #include <iconv/loop.c> | |
336 | ||
337 | #include <iconv/skeleton.c> |