]> git.ipfire.org Git - thirdparty/glibc.git/blob - string/strxfrm.c
Update.
[thirdparty/glibc.git] / string / strxfrm.c
1 /* Copyright (C) 1995-1999, 2000 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Written by Ulrich Drepper <drepper@cygnus.com>, 1995.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Library General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Library General Public License for more details.
14
15 You should have received a copy of the GNU Library General Public
16 License along with the GNU C Library; see the file COPYING.LIB. If not,
17 write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18 Boston, MA 02111-1307, USA. */
19
20 #include <assert.h>
21 #include <langinfo.h>
22 #include <stddef.h>
23 #include <stdint.h>
24 #include <stdlib.h>
25 #include <string.h>
26 #include <sys/param.h>
27
28 #ifndef STRING_TYPE
29 # define STRING_TYPE char
30 # define USTRING_TYPE unsigned char
31 # ifdef USE_IN_EXTENDED_LOCALE_MODEL
32 # define STRXFRM __strxfrm_l
33 # else
34 # define STRXFRM strxfrm
35 # endif
36 # define STRCMP strcmp
37 # define STRLEN strlen
38 # define STPNCPY __stpncpy
39 # define WEIGHT_H "../locale/weight.h"
40 # define SUFFIX MB
41 # define L(arg) arg
42 #endif
43
44 #define CONCAT(a,b) CONCAT1(a,b)
45 #define CONCAT1(a,b) a##b
46
47 #include "../locale/localeinfo.h"
48
49
50 #ifndef WIDE_CHAR_VERSION
51
52 /* We need UTF-8 encoding of numbers. */
53 static inline int
54 utf8_encode (char *buf, int val)
55 {
56 int retval;
57
58 if (val < 0x80)
59 {
60 *buf++ = (char) val;
61 retval = 1;
62 }
63 else
64 {
65 int step;
66
67 for (step = 2; step < 6; ++step)
68 if ((val & (~(uint32_t)0 << (5 * step + 1))) == 0)
69 break;
70 retval = step;
71
72 *buf = (unsigned char) (~0xff >> step);
73 --step;
74 do
75 {
76 buf[step] = 0x80 | (val & 0x3f);
77 val >>= 6;
78 }
79 while (--step > 0);
80 *buf |= val;
81 }
82
83 return retval;
84 }
85 #endif
86
87
88 #ifndef USE_IN_EXTENDED_LOCALE_MODEL
89 size_t
90 STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n)
91 #else
92 size_t
93 STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l)
94 #endif
95 {
96 #ifdef USE_IN_EXTENDED_LOCALE_MODEL
97 struct locale_data *current = l->__locales[LC_COLLATE];
98 uint_fast32_t nrules = *((const uint32_t *) current->values[_NL_ITEM_INDEX (_NL_COLLATE_NRULES)].string);
99 #else
100 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
101 #endif
102 /* We don't assign the following values right away since it might be
103 unnecessary in case there are no rules. */
104 const unsigned char *rulesets;
105 const int32_t *table;
106 const USTRING_TYPE *weights;
107 const USTRING_TYPE *extra;
108 const int32_t *indirect;
109 uint_fast32_t pass;
110 size_t needed;
111 const USTRING_TYPE *usrc;
112 size_t srclen = STRLEN (src);
113 int32_t *idxarr;
114 unsigned char *rulearr;
115 size_t idxmax;
116 size_t idxcnt;
117 int use_malloc;
118
119 #include WEIGHT_H
120
121 if (nrules == 0)
122 {
123 if (n != 0)
124 STPNCPY (dest, src, MIN (srclen + 1, n));
125
126 return srclen;
127 }
128
129 #ifdef USE_IN_EXTENDED_LOCALE_MODEL
130 rulesets = (const unsigned char *)
131 current->values[_NL_ITEM_INDEX (_NL_COLLATE_RULESETS)].string;
132 table = (const int32_t *)
133 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_TABLE,SUFFIX))].string;
134 weights = (const USTRING_TYPE *)
135 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_WEIGHT,SUFFIX))].string;
136 extra = (const USTRING_TYPE *)
137 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_EXTRA,SUFFIX))].string;
138 indirect = (const int32_t *)
139 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_INDIRECT,SUFFIX))].string;
140 #else
141 rulesets = (const unsigned char *)
142 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_RULESETS);
143 table = (const int32_t *)
144 _NL_CURRENT (LC_COLLATE, CONCAT(_NL_COLLATE_TABLE,SUFFIX));
145 weights = (const USTRING_TYPE *)
146 _NL_CURRENT (LC_COLLATE, CONCAT(_NL_COLLATE_WEIGHT,SUFFIX));
147 extra = (const USTRING_TYPE *)
148 _NL_CURRENT (LC_COLLATE, CONCAT(_NL_COLLATE_EXTRA,SUFFIX));
149 indirect = (const int32_t *)
150 _NL_CURRENT (LC_COLLATE, CONCAT(_NL_COLLATE_INDIRECT,SUFFIX));
151 #endif
152 use_malloc = 0;
153
154 assert (((uintptr_t) table) % __alignof__ (table[0]) == 0);
155 assert (((uintptr_t) weights) % __alignof__ (weights[0]) == 0);
156 assert (((uintptr_t) extra) % __alignof__ (extra[0]) == 0);
157 assert (((uintptr_t) indirect) % __alignof__ (indirect[0]) == 0);
158
159 /* Handle an empty string as a special case. */
160 if (srclen == 0)
161 {
162 if (n != 0)
163 *dest = L('\0');
164 return 1;
165 }
166
167 /* We need the elements of the string as unsigned values since they
168 are used as indeces. */
169 usrc = (const USTRING_TYPE *) src;
170
171 /* Perform the first pass over the string and while doing this find
172 and store the weights for each character. Since we want this to
173 be as fast as possible we are using `alloca' to store the temporary
174 values. But since there is no limit on the length of the string
175 we have to use `malloc' if the string is too long. We should be
176 very conservative here. */
177 if (srclen >= 16384)
178 {
179 idxarr = (int32_t *) malloc (srclen * (sizeof (int32_t) + 1));
180 rulearr = (unsigned char *) &idxarr[srclen];
181
182 if (idxarr == NULL)
183 /* No memory. Well, go with the stack then.
184
185 XXX Once this implementation is stable we will handle this
186 differently. Instead of precomputing the indeces we will
187 do this in time. This means, though, that this happens for
188 every pass again. */
189 goto try_stack;
190 use_malloc = 1;
191 }
192 else
193 {
194 try_stack:
195 idxarr = (int32_t *) alloca (srclen * sizeof (int32_t));
196 rulearr = (unsigned char *) alloca (srclen);
197 }
198
199 idxmax = 0;
200 do
201 {
202 int32_t tmp = findidx (&usrc);
203 rulearr[idxmax] = tmp >> 24;
204 idxarr[idxmax] = tmp & 0xffffff;
205
206 ++idxmax;
207 }
208 while (*usrc != L('\0'));
209
210 /* Now the passes over the weights. We now use the indeces we found
211 before. */
212 needed = 0;
213 for (pass = 0; pass < nrules; ++pass)
214 {
215 size_t backw_stop = ~0ul;
216 int rule = rulesets[rulearr[0] * nrules + pass];
217 /* We assume that if a rule has defined `position' in one section
218 this is true for all of them. */
219 int position = rule & sort_position;
220
221 if (position == 0)
222 {
223 for (idxcnt = 0; idxcnt < idxmax; ++idxcnt)
224 {
225 if ((rule & sort_forward) != 0)
226 {
227 size_t len;
228
229 if (backw_stop != ~0ul)
230 {
231 /* Handle the pushed elements now. */
232 size_t backw;
233
234 for (backw = idxcnt - 1; backw >= backw_stop; --backw)
235 {
236 len = weights[idxarr[backw]++];
237
238 if (needed + len < n)
239 while (len-- > 0)
240 dest[needed++] = weights[idxarr[backw]++];
241 else
242 {
243 /* No more characters fit into the buffer. */
244 needed += len;
245 idxarr[backw] += len;
246 }
247 }
248
249 backw_stop = ~0ul;
250 }
251
252 /* Now handle the forward element. */
253 len = weights[idxarr[idxcnt]++];
254 if (needed + len < n)
255 while (len-- > 0)
256 dest[needed++] = weights[idxarr[idxcnt]++];
257 else
258 {
259 /* No more characters fit into the buffer. */
260 needed += len;
261 idxarr[idxcnt] += len;
262 }
263 }
264 else
265 {
266 /* Remember where the backwards series started. */
267 if (backw_stop == ~0ul)
268 backw_stop = idxcnt;
269 }
270
271 rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
272 }
273
274
275 if (backw_stop != ~0ul)
276 {
277 /* Handle the pushed elements now. */
278 size_t backw;
279
280 backw = idxcnt;
281 while (backw > backw_stop)
282 {
283 size_t len = weights[idxarr[--backw]++];
284
285 if (needed + len < n)
286 while (len-- > 0)
287 dest[needed++] = weights[idxarr[backw]++];
288 else
289 {
290 /* No more characters fit into the buffer. */
291 needed += len;
292 idxarr[backw] += len;
293 }
294 }
295 }
296 }
297 else
298 {
299 int val = 1;
300 #ifndef WIDE_CHAR_VERSION
301 char buf[7];
302 size_t buflen;
303 #endif
304 size_t i;
305
306 for (idxcnt = 0; idxcnt < idxmax; ++idxcnt)
307 {
308 if ((rule & sort_forward) != 0)
309 {
310 size_t len;
311
312 if (backw_stop != ~0ul)
313 {
314 /* Handle the pushed elements now. */
315 size_t backw;
316
317 for (backw = idxcnt - 1; backw >= backw_stop; --backw)
318 {
319 len = weights[idxarr[backw]++];
320 if (len != 0)
321 {
322 #ifdef WIDE_CHAR_VERSION
323 if (needed + 1 + len < n)
324 {
325 dest[needed] = val;
326 for (i = 0; i < len; ++i)
327 dest[needed + 1 + i] =
328 weights[idxarr[backw] + i];
329 }
330 needed += 1 + len;
331 #else
332 buflen = utf8_encode (buf, val);
333 if (needed + buflen + len < n)
334 {
335 for (i = 0; i < buflen; ++i)
336 dest[needed + i] = buf[i];
337 for (i = 0; i < len; ++i)
338 dest[needed + buflen + i] =
339 weights[idxarr[backw] + i];
340 }
341 needed += buflen + len;
342 #endif
343 idxarr[backw] += len;
344 val = 1;
345 }
346 else
347 ++val;
348 }
349
350 backw_stop = ~0ul;
351 }
352
353 /* Now handle the forward element. */
354 len = weights[idxarr[idxcnt]++];
355 if (len != 0)
356 {
357 #ifdef WIDE_CHAR_VERSION
358 if (needed + 1+ len < n)
359 {
360 dest[needed] = val;
361 for (i = 0; i < len; ++i)
362 dest[needed + 1 + i] =
363 weights[idxarr[idxcnt] + i];
364 }
365 needed += 1 + len;
366 #else
367 buflen = utf8_encode (buf, val);
368 if (needed + buflen + len < n)
369 {
370 for (i = 0; i < buflen; ++i)
371 dest[needed + i] = buf[i];
372 for (i = 0; i < len; ++i)
373 dest[needed + buflen + i] =
374 weights[idxarr[idxcnt] + i];
375 }
376 needed += buflen + len;
377 #endif
378 idxarr[idxcnt] += len;
379 val = 1;
380 }
381 else
382 /* Note that we don't have to increment `idxarr[idxcnt]'
383 since the length is zero. */
384 ++val;
385 }
386 else
387 {
388 /* Remember where the backwards series started. */
389 if (backw_stop == ~0ul)
390 backw_stop = idxcnt;
391 }
392
393 rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
394 }
395
396 if (backw_stop != ~0ul)
397 {
398 /* Handle the pushed elements now. */
399 size_t backw;
400
401 backw = idxmax - 1;
402 while (backw > backw_stop)
403 {
404 size_t len = weights[idxarr[--backw]++];
405 if (len != 0)
406 {
407 #ifdef WIDE_CHAR_VERSION
408 if (needed + 1 + len < n)
409 {
410 dest[needed] = val;
411 for (i = 0; i < len; ++i)
412 dest[needed + 1 + i] =
413 weights[idxarr[backw] + i];
414 }
415 needed += 1 + len;
416 #else
417 buflen = utf8_encode (buf, val);
418 if (needed + buflen + len < n)
419 {
420 for (i = 0; i < buflen; ++i)
421 dest[needed + i] = buf[i];
422 for (i = 0; i < len; ++i)
423 dest[needed + buflen + i] =
424 weights[idxarr[backw] + i];
425 }
426 needed += buflen + len;
427 #endif
428 idxarr[backw] += len;
429 val = 1;
430 }
431 else
432 ++val;
433 }
434 }
435 }
436
437 /* Finally store the byte to separate the passes or terminate
438 the string. */
439 if (needed < n)
440 dest[needed] = pass + 1 < nrules ? L('\1') : L('\0');
441 ++needed;
442 }
443
444 /* This is a little optimization: many collation specifications have
445 a `position' rule at the end and if no non-ignored character
446 is found the last \1 byte is immediately followed by a \0 byte
447 signalling this. We can avoid the \1 byte(s). */
448 if (needed <= n && needed > 2 && dest[needed - 2] == L('\1'))
449 {
450 /* Remove the \1 byte. */
451 --needed;
452 dest[needed - 1] = L('\0');
453 }
454
455 /* Free the memory if needed. */
456 if (use_malloc)
457 free (idxarr);
458
459 /* Return the number of bytes/words we need, but don't count the NUL
460 byte/word at the end. */
461 return needed - 1;
462 }