]>
Commit | Line | Data |
---|---|---|
d4697bc9 | 1 | /* Copyright (C) 1995-2014 Free Software Foundation, Inc. |
c84142e8 | 2 | This file is part of the GNU C Library. |
1ab62b32 | 3 | Written by Ulrich Drepper <drepper@gnu.org>, 1995. |
c84142e8 UD |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or | |
41bdb6e2 AJ |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either | |
8 | version 2.1 of the License, or (at your option) any later version. | |
c84142e8 UD |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, | |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
41bdb6e2 | 13 | Lesser General Public License for more details. |
c84142e8 | 14 | |
41bdb6e2 | 15 | You should have received a copy of the GNU Lesser General Public |
59ba27a6 PE |
16 | License along with the GNU C Library; if not, see |
17 | <http://www.gnu.org/licenses/>. */ | |
c84142e8 | 18 | |
ccadf7b5 UD |
19 | #include <assert.h> |
20 | #include <langinfo.h> | |
21 | #include <locale.h> | |
22 | #include <stddef.h> | |
23 | #include <stdint.h> | |
24 | #include <stdlib.h> | |
25 | #include <string.h> | |
26 | #include <sys/param.h> | |
1ab62b32 | 27 | |
ccadf7b5 UD |
28 | #ifndef STRING_TYPE |
29 | # define STRING_TYPE char | |
30 | # define USTRING_TYPE unsigned char | |
31 | # define STRXFRM __strxfrm_l | |
32 | # define STRCMP strcmp | |
33 | # define STRLEN strlen | |
34 | # define STPNCPY __stpncpy | |
35 | # define WEIGHT_H "../locale/weight.h" | |
36 | # define SUFFIX MB | |
37 | # define L(arg) arg | |
38 | #endif | |
39 | ||
40 | #define CONCAT(a,b) CONCAT1(a,b) | |
41 | #define CONCAT1(a,b) a##b | |
42 | ||
43 | #include "../locale/localeinfo.h" | |
44 | ||
45 | ||
46 | #ifndef WIDE_CHAR_VERSION | |
47 | ||
48 | /* We need UTF-8 encoding of numbers. */ | |
49 | static int | |
50 | utf8_encode (char *buf, int val) | |
51 | { | |
52 | int retval; | |
53 | ||
54 | if (val < 0x80) | |
55 | { | |
56 | *buf++ = (char) val; | |
57 | retval = 1; | |
58 | } | |
59 | else | |
60 | { | |
61 | int step; | |
62 | ||
63 | for (step = 2; step < 6; ++step) | |
64 | if ((val & (~(uint32_t)0 << (5 * step + 1))) == 0) | |
65 | break; | |
66 | retval = step; | |
67 | ||
68 | *buf = (unsigned char) (~0xff >> step); | |
69 | --step; | |
70 | do | |
71 | { | |
72 | buf[step] = 0x80 | (val & 0x3f); | |
73 | val >>= 6; | |
74 | } | |
75 | while (--step > 0); | |
76 | *buf |= val; | |
77 | } | |
78 | ||
79 | return retval; | |
80 | } | |
81 | #endif | |
82 | ||
83 | ||
84 | size_t | |
85 | STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l) | |
86 | { | |
f095bb72 | 87 | struct __locale_data *current = l->__locales[LC_COLLATE]; |
ccadf7b5 UD |
88 | uint_fast32_t nrules = current->values[_NL_ITEM_INDEX (_NL_COLLATE_NRULES)].word; |
89 | /* We don't assign the following values right away since it might be | |
90 | unnecessary in case there are no rules. */ | |
91 | const unsigned char *rulesets; | |
92 | const int32_t *table; | |
93 | const USTRING_TYPE *weights; | |
94 | const USTRING_TYPE *extra; | |
95 | const int32_t *indirect; | |
96 | uint_fast32_t pass; | |
97 | size_t needed; | |
2f334ad5 | 98 | size_t last_needed; |
ccadf7b5 UD |
99 | const USTRING_TYPE *usrc; |
100 | size_t srclen = STRLEN (src); | |
101 | int32_t *idxarr; | |
102 | unsigned char *rulearr; | |
103 | size_t idxmax; | |
104 | size_t idxcnt; | |
105 | int use_malloc; | |
106 | ||
107 | #include WEIGHT_H | |
108 | ||
109 | if (nrules == 0) | |
110 | { | |
111 | if (n != 0) | |
112 | STPNCPY (dest, src, MIN (srclen + 1, n)); | |
113 | ||
114 | return srclen; | |
115 | } | |
116 | ||
117 | rulesets = (const unsigned char *) | |
118 | current->values[_NL_ITEM_INDEX (_NL_COLLATE_RULESETS)].string; | |
119 | table = (const int32_t *) | |
120 | current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_TABLE,SUFFIX))].string; | |
121 | weights = (const USTRING_TYPE *) | |
122 | current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_WEIGHT,SUFFIX))].string; | |
123 | extra = (const USTRING_TYPE *) | |
124 | current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_EXTRA,SUFFIX))].string; | |
125 | indirect = (const int32_t *) | |
126 | current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_INDIRECT,SUFFIX))].string; | |
127 | use_malloc = 0; | |
128 | ||
129 | assert (((uintptr_t) table) % __alignof__ (table[0]) == 0); | |
130 | assert (((uintptr_t) weights) % __alignof__ (weights[0]) == 0); | |
131 | assert (((uintptr_t) extra) % __alignof__ (extra[0]) == 0); | |
132 | assert (((uintptr_t) indirect) % __alignof__ (indirect[0]) == 0); | |
133 | ||
134 | /* Handle an empty string as a special case. */ | |
135 | if (srclen == 0) | |
136 | { | |
137 | if (n != 0) | |
f095bb72 | 138 | *dest = L('\0'); |
ccadf7b5 UD |
139 | return 0; |
140 | } | |
141 | ||
142 | /* We need the elements of the string as unsigned values since they | |
143 | are used as indeces. */ | |
144 | usrc = (const USTRING_TYPE *) src; | |
145 | ||
146 | /* Perform the first pass over the string and while doing this find | |
147 | and store the weights for each character. Since we want this to | |
148 | be as fast as possible we are using `alloca' to store the temporary | |
149 | values. But since there is no limit on the length of the string | |
150 | we have to use `malloc' if the string is too long. We should be | |
151 | very conservative here. */ | |
e0e72284 | 152 | if (! __libc_use_alloca ((srclen + 1) * (sizeof (int32_t) + 1))) |
ccadf7b5 UD |
153 | { |
154 | idxarr = (int32_t *) malloc ((srclen + 1) * (sizeof (int32_t) + 1)); | |
155 | rulearr = (unsigned char *) &idxarr[srclen]; | |
156 | ||
157 | if (idxarr == NULL) | |
158 | /* No memory. Well, go with the stack then. | |
159 | ||
160 | XXX Once this implementation is stable we will handle this | |
161 | differently. Instead of precomputing the indeces we will | |
162 | do this in time. This means, though, that this happens for | |
163 | every pass again. */ | |
164 | goto try_stack; | |
165 | use_malloc = 1; | |
166 | } | |
167 | else | |
168 | { | |
169 | try_stack: | |
170 | idxarr = (int32_t *) alloca (srclen * sizeof (int32_t)); | |
171 | rulearr = (unsigned char *) alloca (srclen + 1); | |
172 | } | |
173 | ||
174 | idxmax = 0; | |
175 | do | |
176 | { | |
f3a6cc0a | 177 | int32_t tmp = findidx (&usrc, -1); |
ccadf7b5 UD |
178 | rulearr[idxmax] = tmp >> 24; |
179 | idxarr[idxmax] = tmp & 0xffffff; | |
180 | ||
181 | ++idxmax; | |
182 | } | |
183 | while (*usrc != L('\0')); | |
184 | ||
185 | /* This element is only read, the value never used but to determine | |
186 | another value which then is ignored. */ | |
187 | rulearr[idxmax] = '\0'; | |
188 | ||
189 | /* Now the passes over the weights. We now use the indeces we found | |
190 | before. */ | |
191 | needed = 0; | |
192 | for (pass = 0; pass < nrules; ++pass) | |
193 | { | |
194 | size_t backw_stop = ~0ul; | |
195 | int rule = rulesets[rulearr[0] * nrules + pass]; | |
196 | /* We assume that if a rule has defined `position' in one section | |
197 | this is true for all of them. */ | |
198 | int position = rule & sort_position; | |
199 | ||
2f334ad5 | 200 | last_needed = needed; |
ccadf7b5 UD |
201 | if (position == 0) |
202 | { | |
203 | for (idxcnt = 0; idxcnt < idxmax; ++idxcnt) | |
204 | { | |
205 | if ((rule & sort_forward) != 0) | |
206 | { | |
207 | size_t len; | |
208 | ||
209 | if (backw_stop != ~0ul) | |
210 | { | |
211 | /* Handle the pushed elements now. */ | |
212 | size_t backw; | |
213 | ||
bc3a45ce | 214 | for (backw = idxcnt; backw > backw_stop; ) |
ccadf7b5 | 215 | { |
bc3a45ce | 216 | --backw; |
ccadf7b5 UD |
217 | len = weights[idxarr[backw]++]; |
218 | ||
219 | if (needed + len < n) | |
220 | while (len-- > 0) | |
221 | dest[needed++] = weights[idxarr[backw]++]; | |
222 | else | |
223 | { | |
224 | /* No more characters fit into the buffer. */ | |
225 | needed += len; | |
226 | idxarr[backw] += len; | |
227 | } | |
228 | } | |
229 | ||
230 | backw_stop = ~0ul; | |
231 | } | |
232 | ||
233 | /* Now handle the forward element. */ | |
234 | len = weights[idxarr[idxcnt]++]; | |
235 | if (needed + len < n) | |
236 | while (len-- > 0) | |
237 | dest[needed++] = weights[idxarr[idxcnt]++]; | |
238 | else | |
239 | { | |
240 | /* No more characters fit into the buffer. */ | |
241 | needed += len; | |
242 | idxarr[idxcnt] += len; | |
243 | } | |
244 | } | |
245 | else | |
246 | { | |
247 | /* Remember where the backwards series started. */ | |
248 | if (backw_stop == ~0ul) | |
249 | backw_stop = idxcnt; | |
250 | } | |
251 | ||
252 | rule = rulesets[rulearr[idxcnt + 1] * nrules + pass]; | |
253 | } | |
254 | ||
255 | ||
256 | if (backw_stop != ~0ul) | |
257 | { | |
258 | /* Handle the pushed elements now. */ | |
259 | size_t backw; | |
260 | ||
261 | backw = idxcnt; | |
262 | while (backw > backw_stop) | |
263 | { | |
264 | size_t len = weights[idxarr[--backw]++]; | |
265 | ||
266 | if (needed + len < n) | |
267 | while (len-- > 0) | |
268 | dest[needed++] = weights[idxarr[backw]++]; | |
269 | else | |
270 | { | |
271 | /* No more characters fit into the buffer. */ | |
272 | needed += len; | |
273 | idxarr[backw] += len; | |
274 | } | |
275 | } | |
276 | } | |
277 | } | |
278 | else | |
279 | { | |
280 | int val = 1; | |
281 | #ifndef WIDE_CHAR_VERSION | |
282 | char buf[7]; | |
283 | size_t buflen; | |
284 | #endif | |
285 | size_t i; | |
286 | ||
287 | for (idxcnt = 0; idxcnt < idxmax; ++idxcnt) | |
288 | { | |
289 | if ((rule & sort_forward) != 0) | |
290 | { | |
291 | size_t len; | |
292 | ||
293 | if (backw_stop != ~0ul) | |
294 | { | |
295 | /* Handle the pushed elements now. */ | |
296 | size_t backw; | |
297 | ||
bc3a45ce | 298 | for (backw = idxcnt; backw > backw_stop; ) |
ccadf7b5 | 299 | { |
bc3a45ce | 300 | --backw; |
ccadf7b5 UD |
301 | len = weights[idxarr[backw]++]; |
302 | if (len != 0) | |
303 | { | |
304 | #ifdef WIDE_CHAR_VERSION | |
305 | if (needed + 1 + len < n) | |
306 | { | |
307 | dest[needed] = val; | |
308 | for (i = 0; i < len; ++i) | |
309 | dest[needed + 1 + i] = | |
310 | weights[idxarr[backw] + i]; | |
311 | } | |
312 | needed += 1 + len; | |
313 | #else | |
314 | buflen = utf8_encode (buf, val); | |
315 | if (needed + buflen + len < n) | |
316 | { | |
317 | for (i = 0; i < buflen; ++i) | |
318 | dest[needed + i] = buf[i]; | |
319 | for (i = 0; i < len; ++i) | |
320 | dest[needed + buflen + i] = | |
321 | weights[idxarr[backw] + i]; | |
322 | } | |
323 | needed += buflen + len; | |
324 | #endif | |
325 | idxarr[backw] += len; | |
326 | val = 1; | |
327 | } | |
328 | else | |
329 | ++val; | |
330 | } | |
331 | ||
332 | backw_stop = ~0ul; | |
333 | } | |
334 | ||
335 | /* Now handle the forward element. */ | |
336 | len = weights[idxarr[idxcnt]++]; | |
337 | if (len != 0) | |
338 | { | |
339 | #ifdef WIDE_CHAR_VERSION | |
340 | if (needed + 1+ len < n) | |
341 | { | |
342 | dest[needed] = val; | |
343 | for (i = 0; i < len; ++i) | |
344 | dest[needed + 1 + i] = | |
345 | weights[idxarr[idxcnt] + i]; | |
346 | } | |
347 | needed += 1 + len; | |
348 | #else | |
349 | buflen = utf8_encode (buf, val); | |
350 | if (needed + buflen + len < n) | |
351 | { | |
352 | for (i = 0; i < buflen; ++i) | |
353 | dest[needed + i] = buf[i]; | |
354 | for (i = 0; i < len; ++i) | |
355 | dest[needed + buflen + i] = | |
356 | weights[idxarr[idxcnt] + i]; | |
357 | } | |
358 | needed += buflen + len; | |
359 | #endif | |
360 | idxarr[idxcnt] += len; | |
361 | val = 1; | |
362 | } | |
363 | else | |
364 | /* Note that we don't have to increment `idxarr[idxcnt]' | |
365 | since the length is zero. */ | |
366 | ++val; | |
367 | } | |
368 | else | |
369 | { | |
370 | /* Remember where the backwards series started. */ | |
371 | if (backw_stop == ~0ul) | |
372 | backw_stop = idxcnt; | |
373 | } | |
374 | ||
375 | rule = rulesets[rulearr[idxcnt + 1] * nrules + pass]; | |
376 | } | |
377 | ||
378 | if (backw_stop != ~0ul) | |
379 | { | |
380 | /* Handle the pushed elements now. */ | |
381 | size_t backw; | |
382 | ||
383 | backw = idxmax - 1; | |
384 | while (backw > backw_stop) | |
385 | { | |
386 | size_t len = weights[idxarr[--backw]++]; | |
387 | if (len != 0) | |
388 | { | |
389 | #ifdef WIDE_CHAR_VERSION | |
390 | if (needed + 1 + len < n) | |
391 | { | |
392 | dest[needed] = val; | |
393 | for (i = 0; i < len; ++i) | |
394 | dest[needed + 1 + i] = | |
395 | weights[idxarr[backw] + i]; | |
396 | } | |
397 | needed += 1 + len; | |
398 | #else | |
399 | buflen = utf8_encode (buf, val); | |
400 | if (needed + buflen + len < n) | |
401 | { | |
402 | for (i = 0; i < buflen; ++i) | |
403 | dest[needed + i] = buf[i]; | |
404 | for (i = 0; i < len; ++i) | |
405 | dest[needed + buflen + i] = | |
406 | weights[idxarr[backw] + i]; | |
407 | } | |
408 | needed += buflen + len; | |
409 | #endif | |
410 | idxarr[backw] += len; | |
411 | val = 1; | |
412 | } | |
413 | else | |
414 | ++val; | |
415 | } | |
416 | } | |
417 | } | |
418 | ||
419 | /* Finally store the byte to separate the passes or terminate | |
420 | the string. */ | |
421 | if (needed < n) | |
422 | dest[needed] = pass + 1 < nrules ? L('\1') : L('\0'); | |
423 | ++needed; | |
424 | } | |
425 | ||
426 | /* This is a little optimization: many collation specifications have | |
427 | a `position' rule at the end and if no non-ignored character | |
428 | is found the last \1 byte is immediately followed by a \0 byte | |
429 | signalling this. We can avoid the \1 byte(s). */ | |
2f334ad5 | 430 | if (needed > 2 && needed == last_needed + 1) |
ccadf7b5 UD |
431 | { |
432 | /* Remove the \1 byte. */ | |
52a33795 | 433 | if (--needed <= n) |
2f334ad5 | 434 | dest[needed - 1] = L('\0'); |
ccadf7b5 UD |
435 | } |
436 | ||
437 | /* Free the memory if needed. */ | |
438 | if (use_malloc) | |
439 | free (idxarr); | |
440 | ||
441 | /* Return the number of bytes/words we need, but don't count the NUL | |
442 | byte/word at the end. */ | |
443 | return needed - 1; | |
444 | } | |
445 | libc_hidden_def (STRXFRM) | |
446 | ||
447 | #ifndef WIDE_CHAR_VERSION | |
1ab62b32 | 448 | weak_alias (__strxfrm_l, strxfrm_l) |
ccadf7b5 | 449 | #endif |