]>
Commit | Line | Data |
---|---|---|
8d617a71 | 1 | /* Conversion module for Unicode |
d614a753 | 2 | Copyright (C) 1999-2020 Free Software Foundation, Inc. |
8d617a71 UD |
3 | This file is part of the GNU C Library. |
4 | Contributed by Ulrich Drepper <drepper@cygnus.com>, 1999. | |
5 | ||
6 | The GNU C Library is free software; you can redistribute it and/or | |
41bdb6e2 AJ |
7 | modify it under the terms of the GNU Lesser General Public |
8 | License as published by the Free Software Foundation; either | |
9 | version 2.1 of the License, or (at your option) any later version. | |
8d617a71 UD |
10 | |
11 | The GNU C Library is distributed in the hope that it will be useful, | |
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
41bdb6e2 | 14 | Lesser General Public License for more details. |
8d617a71 | 15 | |
41bdb6e2 | 16 | You should have received a copy of the GNU Lesser General Public |
59ba27a6 | 17 | License along with the GNU C Library; if not, see |
5a82c748 | 18 | <https://www.gnu.org/licenses/>. */ |
8d617a71 UD |
19 | |
20 | #include <byteswap.h> | |
55985355 | 21 | #include <dlfcn.h> |
8d617a71 UD |
22 | #include <gconv.h> |
23 | #include <stddef.h> | |
24 | #include <stdint.h> | |
25 | #include <stdlib.h> | |
26 | #include <string.h> | |
27 | ||
/* Code unit value of the Byte Order Mark (BOM) when the two bytes are
   read in the expected order.  */
#define BOM	0xfeff
/* The same mark as it appears when the two bytes are read in the
   opposite order.  */
#define BOM_OE	0xfffe
32 | ||
33 | ||
/* Parameters for the generic `gconv' skeleton that is included at the
   end of this file.  DEFINE_INIT and DEFINE_FINI are 0; gconv_init and
   gconv_end are written out by hand further down.  The UCS-2 side uses
   two bytes per character, the internal side four.  */
#define FROM_LOOP		from_unicode_loop
#define TO_LOOP			to_unicode_loop
#define DEFINE_INIT		0
#define DEFINE_FINI		0
#define MIN_NEEDED_FROM		2
#define MIN_NEEDED_TO		4
#define ONE_DIRECTION		0
#define FROM_DIRECTION		(dir == from_unicode)
/* Code run before the conversion loop is entered.  It fetches the
   direction stored by gconv_init, deals with the byte order mark on
   the first invocation, and finally derives `swap' from the
   __GCONV_SWAP flag.  */
#define PREPARE_LOOP \
  enum direction dir = ((struct unicode_data *) step->__data)->dir;	      \
  int swap;								      \
  if (FROM_DIRECTION)							      \
    {									      \
      if (data->__invocation_counter == 0)				      \
	{								      \
	  /* First call: the leading code unit tells us the byte order	      \
	     of the input, so we need at least two bytes.  */		      \
	  if (inptr + 2 > inend)					      \
	    return (inptr == inend					      \
		    ? __GCONV_EMPTY_INPUT : __GCONV_INCOMPLETE_INPUT);	      \
									      \
	  const uint16_t first_unit = get16u (inptr);			      \
									      \
	  if (first_unit == BOM || first_unit == BOM_OE)		      \
	    {								      \
	      /* A byte-swapped BOM means all following input has to be	      \
		 swapped as well; remember that in the flags.  */	      \
	      if (first_unit == BOM_OE)					      \
		data->__flags |= __GCONV_SWAP;				      \
									      \
	      /* The mark itself is consumed and never converted.  */	      \
	      inptr += 2;						      \
	      *inptrp = inptr;						      \
	    }								      \
	}								      \
    }									      \
  else if (!data->__internal_use && data->__invocation_counter == 0)	      \
    {									      \
      /* First call when producing UCS-2 for external use: start the	      \
	 output with a Byte Order Mark.  */				      \
      if (__glibc_unlikely (outbuf + 2 > outend))			      \
	return __GCONV_FULL_OUTPUT;					      \
									      \
      put16u (outbuf, BOM);						      \
      outbuf += 2;							      \
    }									      \
  swap = data->__flags & __GCONV_SWAP;
/* Pass `swap' on to the conversion loops, which declare a matching
   extra `int swap' parameter.  */
#define EXTRA_LOOP_ARGS		, swap
8d617a71 UD |
76 | |
77 | ||
/* Which way a conversion step transforms its data.  */
enum direction
{
  illegal_dir = 0,	/* No valid direction.  */
  to_unicode = 1,	/* Internal format -> UCS-2.  */
  from_unicode = 2	/* UCS-2 -> internal format.  */
};
85 | ||
86 | struct unicode_data | |
87 | { | |
88 | enum direction dir; | |
8d617a71 UD |
89 | }; |
90 | ||
91 | ||
8c0b7170 | 92 | extern int gconv_init (struct __gconv_step *step); |
8d617a71 UD |
93 | int |
94 | gconv_init (struct __gconv_step *step) | |
95 | { | |
96 | /* Determine which direction. */ | |
97 | struct unicode_data *new_data; | |
98 | enum direction dir = illegal_dir; | |
99 | int result; | |
100 | ||
a8f6cd90 | 101 | if (strcmp (step->__from_name, "UNICODE//") == 0) |
8d617a71 UD |
102 | dir = from_unicode; |
103 | else | |
104 | dir = to_unicode; | |
105 | ||
106 | new_data = (struct unicode_data *) malloc (sizeof (struct unicode_data)); | |
107 | ||
108 | result = __GCONV_NOMEM; | |
109 | if (new_data != NULL) | |
110 | { | |
111 | new_data->dir = dir; | |
8d617a71 UD |
112 | step->__data = new_data; |
113 | ||
114 | if (dir == from_unicode) | |
115 | { | |
116 | step->__min_needed_from = MIN_NEEDED_FROM; | |
117 | step->__max_needed_from = MIN_NEEDED_FROM; | |
118 | step->__min_needed_to = MIN_NEEDED_TO; | |
119 | step->__max_needed_to = MIN_NEEDED_TO; | |
120 | } | |
121 | else | |
122 | { | |
123 | step->__min_needed_from = MIN_NEEDED_TO; | |
124 | step->__max_needed_from = MIN_NEEDED_TO; | |
125 | step->__min_needed_to = MIN_NEEDED_FROM; | |
126 | step->__max_needed_to = MIN_NEEDED_FROM; | |
127 | } | |
128 | ||
129 | step->__stateful = 0; | |
130 | ||
131 | result = __GCONV_OK; | |
132 | } | |
133 | ||
134 | return result; | |
135 | } | |
136 | ||
137 | ||
8c0b7170 | 138 | extern void gconv_end (struct __gconv_step *data); |
8d617a71 UD |
139 | void |
140 | gconv_end (struct __gconv_step *data) | |
141 | { | |
142 | free (data->__data); | |
143 | } | |
144 | ||
145 | ||
146 | /* Convert from the internal (UCS4-like) format to UCS2. */ | |
147 | #define MIN_NEEDED_INPUT MIN_NEEDED_TO | |
148 | #define MIN_NEEDED_OUTPUT MIN_NEEDED_FROM | |
149 | #define LOOPFCT TO_LOOP | |
150 | #define BODY \ | |
151 | { \ | |
77e1d15a | 152 | uint32_t c = get32 (inptr); \ |
8d617a71 | 153 | \ |
a1ffb40e | 154 | if (__glibc_unlikely (c >= 0x10000)) \ |
8d617a71 | 155 | { \ |
601d2942 | 156 | UNICODE_TAG_HANDLER (c, 4); \ |
e438a468 | 157 | STANDARD_TO_LOOP_ERR_HANDLER (4); \ |
85830c4c | 158 | } \ |
a1ffb40e | 159 | else if (__glibc_unlikely (c >= 0xd800 && c < 0xe000)) \ |
755104ed UD |
160 | { \ |
161 | /* Surrogate characters in UCS-4 input are not valid. \ | |
162 | We must catch this, because the UCS-2 output might be \ | |
163 | interpreted as UTF-16 by other programs. If we let \ | |
164 | surrogates pass through, attackers could make a security \ | |
165 | hole exploit by synthesizing any desired plane 1-16 \ | |
166 | character. */ \ | |
e438a468 | 167 | result = __GCONV_ILLEGAL_INPUT; \ |
755104ed | 168 | if (! ignore_errors_p ()) \ |
e438a468 | 169 | break; \ |
755104ed UD |
170 | inptr += 4; \ |
171 | ++*irreversible; \ | |
172 | continue; \ | |
173 | } \ | |
85830c4c UD |
174 | else \ |
175 | { \ | |
176 | put16 (outptr, c); \ | |
177 | outptr += 2; \ | |
178 | } \ | |
8d617a71 | 179 | \ |
8d617a71 UD |
180 | inptr += 4; \ |
181 | } | |
55985355 | 182 | #define LOOP_NEED_FLAGS |
8d617a71 | 183 | #define EXTRA_LOOP_DECLS \ |
55985355 | 184 | , int swap |
8d617a71 UD |
185 | #include <iconv/loop.c> |
186 | ||
187 | ||
188 | /* Convert from UCS2 to the internal (UCS4-like) format. */ | |
189 | #define MIN_NEEDED_INPUT MIN_NEEDED_FROM | |
190 | #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO | |
191 | #define LOOPFCT FROM_LOOP | |
192 | #define BODY \ | |
193 | { \ | |
77e1d15a | 194 | uint16_t u1 = get16 (inptr); \ |
8d617a71 UD |
195 | \ |
196 | if (swap) \ | |
197 | u1 = bswap_16 (u1); \ | |
198 | \ | |
a1ffb40e | 199 | if (__glibc_unlikely (u1 >= 0xd800 && u1 < 0xe000)) \ |
755104ed UD |
200 | { \ |
201 | /* Surrogate characters in UCS-2 input are not valid. Reject \ | |
202 | them. (Catching this here is not security relevant.) */ \ | |
e438a468 | 203 | STANDARD_FROM_LOOP_ERR_HANDLER (2); \ |
755104ed UD |
204 | } \ |
205 | \ | |
77e1d15a | 206 | put32 (outptr, u1); \ |
8d617a71 UD |
207 | \ |
208 | inptr += 2; \ | |
209 | outptr += 4; \ | |
210 | } | |
755104ed | 211 | #define LOOP_NEED_FLAGS |
8d617a71 | 212 | #define EXTRA_LOOP_DECLS \ |
55985355 | 213 | , int swap |
8d617a71 UD |
214 | #include <iconv/loop.c> |
215 | ||
216 | ||
217 | /* Now define the toplevel functions. */ | |
218 | #include <iconv/skeleton.c> |