]> git.ipfire.org Git - thirdparty/glibc.git/blob - string/strcoll.c
Update.
[thirdparty/glibc.git] / string / strcoll.c
1 /* Copyright (C) 1995, 96, 97, 98, 99, 2000 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Written by Ulrich Drepper <drepper@cygnus.com>, 1995.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Library General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Library General Public License for more details.
14
15 You should have received a copy of the GNU Library General Public
16 License along with the GNU C Library; see the file COPYING.LIB. If not,
17 write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18 Boston, MA 02111-1307, USA. */
19
20 #include <assert.h>
21 #include <langinfo.h>
22 #include <stddef.h>
23 #include <stdint.h>
24 #include <stdlib.h>
25 #include <string.h>
26
27 #ifndef STRING_TYPE
28 # define STRING_TYPE char
29 # define USTRING_TYPE unsigned char
30 # ifdef USE_IN_EXTENDED_LOCALE_MODEL
31 # define STRCOLL __strcoll_l
32 # else
33 # define STRCOLL strcoll
34 # endif
35 # define STRCMP strcmp
36 # define STRLEN strlen
37 # define WEIGHT_H "../locale/weight.h"
38 # define SUFFIX MB
39 # define L(arg) arg
40 #endif
41
42 #define CONCAT(a,b) CONCAT1(a,b)
43 #define CONCAT1(a,b) a##b
44
45 #include "../locale/localeinfo.h"
46
47 #ifndef USE_IN_EXTENDED_LOCALE_MODEL
48 int
49 STRCOLL (s1, s2)
50 const STRING_TYPE *s1;
51 const STRING_TYPE *s2;
52 #else
53 int
54 STRCOLL (s1, s2, l)
55 const STRING_TYPE *s1;
56 const STRING_TYPE *s2;
57 __locale_t l;
58 #endif
59 {
60 #ifdef USE_IN_EXTENDED_LOCALE_MODEL
61 struct locale_data *current = l->__locales[LC_COLLATE];
62 uint_fast32_t nrules = *((uint32_t *) current->values[_NL_ITEM_INDEX (_NL_COLLATE_NRULES)].string);
63 #else
64 uint_fast32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
65 #endif
66 /* We don't assign the following values right away since it might be
67 unnecessary in case there are no rules. */
68 const unsigned char *rulesets;
69 const int32_t *table;
70 const USTRING_TYPE *weights;
71 const USTRING_TYPE *extra;
72 const int32_t *indirect;
73 uint_fast32_t pass;
74 int result = 0;
75 const USTRING_TYPE *us1;
76 const USTRING_TYPE *us2;
77 size_t s1len;
78 size_t s2len;
79 int32_t *idx1arr;
80 int32_t *idx2arr;
81 unsigned char *rule1arr;
82 unsigned char *rule2arr;
83 size_t idx1max;
84 size_t idx2max;
85 size_t idx1cnt;
86 size_t idx2cnt;
87 size_t idx1now;
88 size_t idx2now;
89 size_t backw1_stop;
90 size_t backw2_stop;
91 size_t backw1;
92 size_t backw2;
93 int val1;
94 int val2;
95 int position;
96 int seq1len;
97 int seq2len;
98 int use_malloc;
99
100 #include WEIGHT_H
101
102 if (nrules == 0)
103 return STRCMP (s1, s2);
104
105 #ifdef USE_IN_EXTENDED_LOCALE_MODEL
106 rulesets = (const unsigned char *)
107 current->values[_NL_ITEM_INDEX (_NL_COLLATE_RULESETS)].string;
108 table = (const int32_t *)
109 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_TABLE,SUFFIX))].string;
110 weights = (const USTRING_TYPE *)
111 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_WEIGHT,SUFFIX))].string;
112 extra = (const USTRING_TYPE *)
113 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_EXTRA,SUFFIX))].string;
114 indirect = (const int32_t *)
115 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_INDIRECT,SUFFIX))].string;
116 #else
117 rulesets = (const unsigned char *)
118 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_RULESETS);
119 table = (const int32_t *)
120 _NL_CURRENT (LC_COLLATE, CONCAT(_NL_COLLATE_TABLE,SUFFIX));
121 weights = (const USTRING_TYPE *)
122 _NL_CURRENT (LC_COLLATE, CONCAT(_NL_COLLATE_WEIGHT,SUFFIX));
123 extra = (const USTRING_TYPE *)
124 _NL_CURRENT (LC_COLLATE, CONCAT(_NL_COLLATE_EXTRA,SUFFIX));
125 indirect = (const int32_t *)
126 _NL_CURRENT (LC_COLLATE, CONCAT(_NL_COLLATE_INDIRECT,SUFFIX));
127 #endif
128 use_malloc = 0;
129
130 assert (((uintptr_t) table) % sizeof (table[0]) == 0);
131 assert (((uintptr_t) weights) % sizeof (weights[0]) == 0);
132 assert (((uintptr_t) extra) % sizeof (extra[0]) == 0);
133 assert (((uintptr_t) indirect) % sizeof (indirect[0]) == 0);
134
135 /* We need this a few times. */
136 s1len = STRLEN (s1);
137 s2len = STRLEN (s2);
138
139 /* We need the elements of the strings as unsigned values since they
140 are used as indeces. */
141 us1 = (const USTRING_TYPE *) s1;
142 us2 = (const USTRING_TYPE *) s2;
143
144 /* Perform the first pass over the string and while doing this find
145 and store the weights for each character. Since we want this to
146 be as fast as possible we are using `alloca' to store the temporary
147 values. But since there is no limit on the length of the string
148 we have to use `malloc' if the string is too long. We should be
149 very conservative here.
150
151 Please note that the localedef programs makes sure that `position'
152 is not used at the first level. */
153 if (s1len + s2len >= 16384)
154 {
155 idx1arr = (int32_t *) malloc ((s1len + s2len) * (sizeof (int32_t) + 1));
156 idx2arr = &idx1arr[s2len];
157 rule1arr = (unsigned char *) &idx2arr[s2len];
158 rule2arr = &rule1arr[s1len];
159
160 if (idx1arr == NULL)
161 /* No memory. Well, go with the stack then.
162
163 XXX Once this implementation is stable we will handle this
164 differently. Instead of precomputing the indeces we will
165 do this in time. This means, though, that this happens for
166 every pass again. */
167 goto try_stack;
168 use_malloc = 1;
169 }
170 else
171 {
172 try_stack:
173 idx1arr = (int32_t *) alloca (s1len * sizeof (int32_t));
174 idx2arr = (int32_t *) alloca (s2len * sizeof (int32_t));
175 rule1arr = (unsigned char *) alloca (s2len);
176 rule2arr = (unsigned char *) alloca (s2len);
177 }
178
179 idx1cnt = 0;
180 idx2cnt = 0;
181 idx1max = 0;
182 idx2max = 0;
183 idx1now = 0;
184 idx2now = 0;
185 backw1_stop = ~0ul;
186 backw2_stop = ~0ul;
187 backw1 = ~0ul;
188 backw2 = ~0ul;
189 seq1len = 0;
190 seq2len = 0;
191 position = rulesets[0] & sort_position;
192 while (1)
193 {
194 val1 = 0;
195 val2 = 0;
196
197 /* Get the next non-IGNOREd element for string `s1'. */
198 if (seq1len == 0)
199 do
200 {
201 ++val1;
202
203 if (backw1_stop != ~0ul)
204 {
205 /* The is something pushed. */
206 if (backw1 == backw1_stop)
207 {
208 /* The last pushed character was handled. Continue
209 with forward characters. */
210 if (idx1cnt < idx1max)
211 idx1now = idx1cnt;
212 else
213 /* Nothing anymore. The backward sequence ended with
214 the last sequence in the string. Note that seq1len
215 is still zero. */
216 break;
217 }
218 else
219 idx1now = --backw1;
220 }
221 else
222 {
223 backw1_stop = idx1max;
224
225 while (*us1 != L('\0'))
226 {
227 int32_t tmp = findidx (&us1);
228 rule1arr[idx1max] = tmp >> 24;
229 idx1arr[idx1max] = tmp & 0xffffff;
230 idx1cnt = idx1max++;
231
232 if ((rulesets[rule1arr[idx1cnt] * nrules]
233 & sort_backward) == 0)
234 /* No more backward characters to push. */
235 break;
236 ++idx1cnt;
237 }
238
239 if (backw1_stop >= idx1cnt)
240 {
241 /* No sequence at all or just one. */
242 if (idx1cnt == idx1max || backw1_stop > idx1cnt)
243 /* Note that seq1len is still zero. */
244 break;
245
246 backw1_stop = ~0ul;
247 idx1now = idx1cnt;
248 }
249 else
250 /* We pushed backward sequences. */
251 idx1now = backw1 = idx1cnt - 1;
252 }
253 }
254 while ((seq1len = weights[idx1arr[idx1now]++]) == 0);
255
256 /* And the same for string `s2'. */
257 if (seq2len == 0)
258 do
259 {
260 ++val2;
261
262 if (backw2_stop != ~0ul)
263 {
264 /* The is something pushed. */
265 if (backw2 == backw2_stop)
266 {
267 /* The last pushed character was handled. Continue
268 with forward characters. */
269 if (idx2cnt < idx2max)
270 idx2now = idx2cnt;
271 else
272 /* Nothing anymore. The backward sequence ended with
273 the last sequence in the string. Note that seq2len
274 is still zero. */
275 break;
276 }
277 else
278 idx2now = --backw2;
279 }
280 else
281 {
282 backw2_stop = idx2max;
283
284 while (*us2 != L('\0'))
285 {
286 int32_t tmp = findidx (&us2);
287 rule2arr[idx2max] = tmp >> 24;
288 idx2arr[idx2max] = tmp & 0xffffff;
289 idx2cnt = idx2max++;
290
291 if ((rulesets[rule2arr[idx2cnt] * nrules]
292 & sort_backward) == 0)
293 /* No more backward characters to push. */
294 break;
295 ++idx2cnt;
296 }
297
298 if (backw2_stop >= idx2cnt)
299 {
300 /* No sequence at all or just one. */
301 if (idx2cnt == idx2max || backw2_stop > idx2cnt)
302 /* Note that seq1len is still zero. */
303 break;
304
305 backw2_stop = ~0ul;
306 idx2now = idx2cnt;
307 }
308 else
309 /* We pushed backward sequences. */
310 idx2now = backw2 = idx2cnt - 1;
311 }
312 }
313 while ((seq2len = weights[idx2arr[idx2now]++]) == 0);
314
315 /* See whether any or both strings are empty. */
316 if (seq1len == 0 || seq2len == 0)
317 {
318 if (seq1len == seq2len)
319 /* Both ended. So far so good, both strings are equal at the
320 first level. */
321 break;
322
323 /* This means one string is shorter than the other. Find out
324 which one and return an appropriate value. */
325 result = seq1len == 0 ? -1 : 1;
326 goto free_and_return;
327 }
328
329 /* Test for position if necessary. */
330 if (position && val1 != val2)
331 {
332 result = val1 - val2;
333 goto free_and_return;
334 }
335
336 /* Compare the two sequences. */
337 do
338 {
339 if (weights[idx1arr[idx1now]] != weights[idx2arr[idx2now]])
340 {
341 /* The sequences differ. */
342 result = weights[idx1arr[idx1now]] - weights[idx2arr[idx2now]];
343 goto free_and_return;
344 }
345
346 /* Increment the offsets. */
347 ++idx1arr[idx1now];
348 ++idx2arr[idx2now];
349
350 --seq1len;
351 --seq2len;
352 }
353 while (seq1len > 0 && seq2len > 0);
354
355 if (position && seq1len != seq2len)
356 {
357 result = seq1len - seq2len;
358 goto free_and_return;
359 }
360 }
361
362 /* Now the remaining passes over the weights. We now use the
363 indeces we found before. */
364 for (pass = 1; pass < nrules; ++pass)
365 {
366 /* We assume that if a rule has defined `position' in one section
367 this is true for all of them. */
368 idx1cnt = 0;
369 idx2cnt = 0;
370 backw1_stop = ~0ul;
371 backw2_stop = ~0ul;
372 backw1 = ~0ul;
373 backw2 = ~0ul;
374 position = rulesets[rule1arr[0] * nrules + pass] & sort_position;
375
376 while (1)
377 {
378 val1 = 0;
379 val2 = 0;
380
381 /* Get the next non-IGNOREd element for string `s1'. */
382 if (seq1len == 0)
383 do
384 {
385 ++val1;
386
387 if (backw1_stop != ~0ul)
388 {
389 /* The is something pushed. */
390 if (backw1 == backw1_stop)
391 {
392 /* The last pushed character was handled. Continue
393 with forward characters. */
394 if (idx1cnt < idx1max)
395 idx1now = idx1cnt;
396 else
397 {
398 /* Nothing anymore. The backward sequence
399 ended with the last sequence in the string. */
400 idx1now = ~0ul;
401 break;
402 }
403 }
404 else
405 idx1now = --backw1;
406 }
407 else
408 {
409 backw1_stop = idx1cnt;
410
411 while (idx1cnt < idx1max)
412 {
413 if ((rulesets[rule1arr[idx1cnt] * nrules + pass]
414 & sort_backward) == 0)
415 /* No more backward characters to push. */
416 break;
417 ++idx1cnt;
418 }
419
420 if (backw1_stop == idx1cnt)
421 {
422 /* No sequence at all or just one. */
423 if (idx1cnt == idx1max)
424 /* Note that seq2len is still zero. */
425 break;
426
427 backw1_stop = ~0ul;
428 idx1now = idx1cnt++;
429 }
430 else
431 /* We pushed backward sequences. */
432 idx1now = backw1 = idx1cnt - 1;
433 }
434 }
435 while ((seq1len = weights[idx1arr[idx1now]++]) == 0);
436
437 /* And the same for string `s2'. */
438 if (seq2len == 0)
439 do
440 {
441 ++val2;
442
443 if (backw2_stop != ~0ul)
444 {
445 /* The is something pushed. */
446 if (backw2 == backw2_stop)
447 {
448 /* The last pushed character was handled. Continue
449 with forward characters. */
450 if (idx2cnt < idx2max)
451 idx2now = idx2cnt;
452 else
453 {
454 /* Nothing anymore. The backward sequence
455 ended with the last sequence in the string. */
456 idx2now = ~0ul;
457 break;
458 }
459 }
460 else
461 idx2now = --backw2;
462 }
463 else
464 {
465 backw2_stop = idx2cnt;
466
467 while (idx2cnt < idx2max)
468 {
469 if ((rulesets[rule2arr[idx2cnt] * nrules + pass]
470 & sort_backward) == 0)
471 /* No more backward characters to push. */
472 break;
473 ++idx2cnt;
474 }
475
476 if (backw2_stop == idx2cnt)
477 {
478 /* No sequence at all or just one. */
479 if (idx2cnt == idx2max)
480 /* Note that seq2len is still zero. */
481 break;
482
483 backw2_stop = ~0ul;
484 idx2now = idx2cnt++;
485 }
486 else
487 /* We pushed backward sequences. */
488 idx2now = backw2 = idx2cnt - 1;
489 }
490 }
491 while ((seq2len = weights[idx2arr[idx2now]++]) == 0);
492
493 /* See whether any or both strings are empty. */
494 if (seq1len == 0 || seq2len == 0)
495 {
496 if (seq1len == seq2len)
497 /* Both ended. So far so good, both strings are equal
498 at this level. */
499 break;
500
501 /* This means one string is shorter than the other. Find out
502 which one and return an appropriate value. */
503 result = seq1len == 0 ? -1 : 1;
504 goto free_and_return;
505 }
506
507 /* Test for position if necessary. */
508 if (position && val1 != val2)
509 {
510 result = val1 - val2;
511 goto free_and_return;
512 }
513
514 /* Compare the two sequences. */
515 do
516 {
517 if (weights[idx1arr[idx1now]] != weights[idx2arr[idx2now]])
518 {
519 /* The sequences differ. */
520 result = (weights[idx1arr[idx1now]]
521 - weights[idx2arr[idx2now]]);
522 goto free_and_return;
523 }
524
525 /* Increment the offsets. */
526 ++idx1arr[idx1now];
527 ++idx2arr[idx2now];
528
529 --seq1len;
530 --seq2len;
531 }
532 while (seq1len > 0 && seq2len > 0);
533
534 if (position && seq1len != seq2len)
535 {
536 result = seq1len - seq2len;
537 goto free_and_return;
538 }
539 }
540 }
541
542 /* Free the memory if needed. */
543 free_and_return:
544 if (use_malloc)
545 free (idx1arr);
546
547 return result;
548 }