]>
Commit | Line | Data |
---|---|---|
8d617a71 | 1 | /* Conversion module for Unicode |
d4697bc9 | 2 | Copyright (C) 1999-2014 Free Software Foundation, Inc. |
8d617a71 UD |
3 | This file is part of the GNU C Library. |
4 | Contributed by Ulrich Drepper <drepper@cygnus.com>, 1999. | |
5 | ||
6 | The GNU C Library is free software; you can redistribute it and/or | |
41bdb6e2 AJ |
7 | modify it under the terms of the GNU Lesser General Public |
8 | License as published by the Free Software Foundation; either | |
9 | version 2.1 of the License, or (at your option) any later version. | |
8d617a71 UD |
10 | |
11 | The GNU C Library is distributed in the hope that it will be useful, | |
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
41bdb6e2 | 14 | Lesser General Public License for more details. |
8d617a71 | 15 | |
41bdb6e2 | 16 | You should have received a copy of the GNU Lesser General Public |
59ba27a6 PE |
17 | License along with the GNU C Library; if not, see |
18 | <http://www.gnu.org/licenses/>. */ | |
8d617a71 UD |
19 | |
20 | #include <byteswap.h> | |
55985355 | 21 | #include <dlfcn.h> |
8d617a71 UD |
22 | #include <gconv.h> |
23 | #include <stddef.h> | |
24 | #include <stdint.h> | |
25 | #include <stdlib.h> | |
26 | #include <string.h> | |
27 | ||
28 | /* This is the Byte Order Mark character (BOM). */ | |
29 | #define BOM 0xfeff | |
30 | /* And in the other endian format. */ | |
31 | #define BOM_OE 0xfffe | |
32 | ||
33 | ||
34 | /* Definitions used in the body of the `gconv' function. */ | |
35 | #define FROM_LOOP from_unicode_loop | |
36 | #define TO_LOOP to_unicode_loop | |
37 | #define DEFINE_INIT 0 | |
38 | #define DEFINE_FINI 0 | |
39 | #define MIN_NEEDED_FROM 2 /* Bytes per character on the UCS2 side; the loop bodies step that pointer by 2. */ | |
40 | #define MIN_NEEDED_TO 4 /* Bytes per character on the internal side; the loop bodies step that pointer by 4. */ | |
41 | #define FROM_DIRECTION (dir == from_unicode) | |
42 | #define PREPARE_LOOP /* Handles the byte order mark and computes swap; presumably expanded by skeleton.c ahead of the loops (TODO confirm). */ \ | |
43 | enum direction dir = ((struct unicode_data *) step->__data)->dir; \ | |
44 | int swap; \ | |
45 | if (FROM_DIRECTION) \ | |
46 | { \ | |
47 | if (data->__invocation_counter == 0) \ | |
48 | { \ | |
49 | /* We have to find out which byte order the file is encoded in. */ \ | |
fd1b5c0f | 50 | if (inptr + 2 > inend) /* Fewer than two bytes left: a BOM cannot be tested yet. */ \ |
eb9dc2a2 UD |
51 | return (inptr == inend \ |
52 | ? __GCONV_EMPTY_INPUT : __GCONV_INCOMPLETE_INPUT); \ | |
8d617a71 | 53 | \ |
77e1d15a | 54 | if (get16u (inptr) == BOM) \ |
8d617a71 | 55 | /* Simply ignore the BOM character. */ \ |
b721a2c0 | 56 | *inptrp = inptr += 2; /* Advance the local pointer and the one behind inptrp together. */ \ |
77e1d15a | 57 | else if (get16u (inptr) == BOM_OE) /* BOM seen in the other byte order. */ \ |
8d617a71 | 58 | { \ |
ee190f67 | 59 | data->__flags |= __GCONV_SWAP; /* Recorded in the flags, so swap below becomes nonzero. */ \ |
b721a2c0 | 60 | *inptrp = inptr += 2; \ |
8d617a71 UD |
61 | } \ |
62 | } /* With neither mark present no input is skipped and the flags are left unchanged. */ \ | |
63 | } \ | |
64 | else if (!data->__internal_use && data->__invocation_counter == 0) /* Other direction: only when not in internal use and the counter is zero. */ \ | |
65 | { \ | |
66 | /* Emit the Byte Order Mark. */ \ | |
ee4ce289 | 67 | if (__builtin_expect (outbuf + 2 > outend, 0)) /* No room for the two BOM bytes. */ \ |
8d617a71 UD |
68 | return __GCONV_FULL_OUTPUT; \ |
69 | \ | |
77e1d15a | 70 | put16u (outbuf, BOM); \ |
8d617a71 UD |
71 | outbuf += 2; \ |
72 | } \ | |
ee190f67 | 73 | swap = data->__flags & __GCONV_SWAP; /* Nonzero exactly when the swap flag is set in the flags word. */ |
55985355 | 74 | #define EXTRA_LOOP_ARGS , swap /* Argument fragment that pairs with the EXTRA_LOOP_DECLS parameter. */ |
8d617a71 UD |
75 | |
76 | ||
77 | /* Direction of the transformation. */ | |
78 | enum direction | |
79 | { | |
80 | illegal_dir, /* Initial value of dir in gconv_init before a direction is chosen. */ | |
81 | to_unicode, /* Chosen by gconv_init when step->__from_name is not the UNICODE name. */ | |
82 | from_unicode /* Chosen by gconv_init when step->__from_name is the UNICODE name. */ | |
83 | }; | |
84 | ||
85 | struct unicode_data /* State that gconv_init allocates and stores in step->__data. */ | |
86 | { | |
87 | enum direction dir; /* Read back by PREPARE_LOOP to select the conversion direction. */ | |
8d617a71 UD |
88 | }; |
89 | ||
90 | ||
8c0b7170 | 91 | /* Prepare STEP: pick the direction from STEP->__from_name, allocate a struct unicode_data holding it, store that in STEP->__data and fill in the min/max byte counts. Returns __GCONV_OK, or __GCONV_NOMEM if malloc fails. */ extern int gconv_init (struct __gconv_step *step); |
8d617a71 UD |
92 | int |
93 | gconv_init (struct __gconv_step *step) | |
94 | { | |
95 | /* Determine which direction. */ | |
96 | struct unicode_data *new_data; | |
97 | enum direction dir = illegal_dir; | |
98 | int result; | |
99 | ||
a8f6cd90 | 100 | if (strcmp (step->__from_name, "UNICODE//") == 0) /* Only an exact match selects from_unicode. */ |
8d617a71 UD |
101 | dir = from_unicode; |
102 | else /* Every other source name is treated as to_unicode; illegal_dir is never left in place. */ | |
103 | dir = to_unicode; | |
104 | ||
105 | new_data = (struct unicode_data *) malloc (sizeof (struct unicode_data)); | |
106 | ||
107 | result = __GCONV_NOMEM; /* Remains the return value when the allocation failed. */ | |
108 | if (new_data != NULL) | |
109 | { | |
110 | new_data->dir = dir; | |
8d617a71 UD |
111 | step->__data = new_data; /* The pointer that gconv_end later passes to free. */ |
112 | ||
113 | if (dir == from_unicode) /* The from side uses MIN_NEEDED_FROM and the to side MIN_NEEDED_TO. */ | |
114 | { | |
115 | step->__min_needed_from = MIN_NEEDED_FROM; | |
116 | step->__max_needed_from = MIN_NEEDED_FROM; | |
117 | step->__min_needed_to = MIN_NEEDED_TO; | |
118 | step->__max_needed_to = MIN_NEEDED_TO; | |
119 | } | |
120 | else /* Opposite direction: the two sizes are exchanged. */ | |
121 | { | |
122 | step->__min_needed_from = MIN_NEEDED_TO; | |
123 | step->__max_needed_from = MIN_NEEDED_TO; | |
124 | step->__min_needed_to = MIN_NEEDED_FROM; | |
125 | step->__max_needed_to = MIN_NEEDED_FROM; | |
126 | } | |
127 | ||
128 | step->__stateful = 0; | |
129 | ||
130 | result = __GCONV_OK; | |
131 | } | |
132 | ||
133 | return result; | |
134 | } | |
135 | ||
136 | ||
8c0b7170 | 137 | /* Free the block referenced by DATA->__data; gconv_init stores its malloc result in that field. */ extern void gconv_end (struct __gconv_step *data); |
8d617a71 UD |
138 | void |
139 | gconv_end (struct __gconv_step *data) | |
140 | { | |
141 | free (data->__data); | |
142 | } | |
143 | ||
144 | ||
145 | /* Convert from the internal (UCS4-like) format to UCS2. */ | |
146 | #define MIN_NEEDED_INPUT MIN_NEEDED_TO | |
147 | #define MIN_NEEDED_OUTPUT MIN_NEEDED_FROM | |
148 | #define LOOPFCT TO_LOOP | |
149 | #define BODY /* One iteration: take a 4-byte input value and emit it as 2 bytes, or report an error. */ \ | |
150 | { \ | |
77e1d15a | 151 | uint32_t c = get32 (inptr); \ |
8d617a71 | 152 | \ |
db2d05f9 | 153 | if (__builtin_expect (c >= 0x10000, 0)) /* Too large for the 16-bit output; nothing is written in this branch. */ \ |
8d617a71 | 154 | { \ |
601d2942 | 155 | UNICODE_TAG_HANDLER (c, 4); /* Handler macros are defined outside this file; control flow after them is not visible here. */ \ |
e438a468 | 156 | STANDARD_TO_LOOP_ERR_HANDLER (4); \ |
85830c4c | 157 | } \ |
755104ed UD |
158 | else if (__builtin_expect (c >= 0xd800 && c < 0xe000, 0)) \ |
159 | { \ | |
160 | /* Surrogate characters in UCS-4 input are not valid. \ | |
161 | We must catch this, because the UCS-2 output might be \ | |
162 | interpreted as UTF-16 by other programs. If we let \ | |
163 | surrogates pass through, attackers could make a security \ | |
164 | hole exploit by synthesizing any desired plane 1-16 \ | |
165 | character. */ \ | |
e438a468 | 166 | result = __GCONV_ILLEGAL_INPUT; \ |
755104ed | 167 | if (! ignore_errors_p ()) \ |
e438a468 | 168 | break; /* Stop with the illegal input result; inptr is not advanced. */ \ |
755104ed UD |
169 | inptr += 4; /* Errors are being ignored: skip the surrogate and count it below. */ \ |
170 | ++*irreversible; \ | |
171 | continue; \ | |
172 | } \ | |
85830c4c UD |
173 | else \ |
174 | { \ | |
175 | put16 (outptr, c); /* Here c is below 0x10000 and not a surrogate; swap is not consulted in this body. */ \ | |
176 | outptr += 2; \ | |
177 | } \ | |
8d617a71 | 178 | \ |
8d617a71 UD |
179 | inptr += 4; \ |
180 | } | |
55985355 | 181 | #define LOOP_NEED_FLAGS |
8d617a71 | 182 | #define EXTRA_LOOP_DECLS \ |
55985355 | 183 | , int swap /* Extra loop parameter; matches the EXTRA_LOOP_ARGS fragment. */ |
8d617a71 UD |
184 | #include <iconv/loop.c> |
185 | ||
186 | ||
187 | /* Convert from UCS2 to the internal (UCS4-like) format. */ | |
188 | #define MIN_NEEDED_INPUT MIN_NEEDED_FROM | |
189 | #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO | |
190 | #define LOOPFCT FROM_LOOP | |
191 | #define BODY /* One iteration: take a 2-byte input value, byte-swap it if requested, and emit it as 4 bytes. */ \ | |
192 | { \ | |
77e1d15a | 193 | uint16_t u1 = get16 (inptr); \ |
8d617a71 UD |
194 | \ |
195 | if (swap) /* PREPARE_LOOP derives swap from the swap flag in the flags word. */ \ | |
196 | u1 = bswap_16 (u1); \ | |
197 | \ | |
755104ed UD |
198 | if (__builtin_expect (u1 >= 0xd800 && u1 < 0xe000, 0)) /* Tested after the optional byte swap. */ \ |
199 | { \ | |
200 | /* Surrogate characters in UCS-2 input are not valid. Reject \ | |
201 | them. (Catching this here is not security relevant.) */ \ | |
e438a468 | 202 | STANDARD_FROM_LOOP_ERR_HANDLER (2); /* Defined outside this file; presumably leaves or restarts the iteration (TODO confirm). */ \ |
755104ed UD |
203 | } \ |
204 | \ | |
77e1d15a | 205 | put32 (outptr, u1); \ |
8d617a71 UD |
206 | \ |
207 | inptr += 2; /* Two input bytes consumed, four output bytes produced. */ \ | |
208 | outptr += 4; \ | |
209 | } | |
755104ed | 210 | #define LOOP_NEED_FLAGS |
8d617a71 | 211 | #define EXTRA_LOOP_DECLS \ |
55985355 | 212 | , int swap /* Extra loop parameter; matches the EXTRA_LOOP_ARGS fragment. */ |
8d617a71 UD |
213 | #include <iconv/loop.c> |
214 | ||
215 | ||
216 | /* Now define the toplevel functions. */ | |
217 | #include <iconv/skeleton.c> |