]> git.ipfire.org Git - thirdparty/glibc.git/blob - string/strcoll.c
Update.
[thirdparty/glibc.git] / string / strcoll.c
1 /* Copyright (C) 1995, 1996, 1997, 1998, 1999 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Written by Ulrich Drepper <drepper@cygnus.com>, 1995.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Library General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Library General Public License for more details.
14
15 You should have received a copy of the GNU Library General Public
16 License along with the GNU C Library; see the file COPYING.LIB. If not,
17 write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18 Boston, MA 02111-1307, USA. */
19
20 #include <langinfo.h>
21 #include <stddef.h>
22 #include <stdint.h>
23 #include <stdlib.h>
24 #include <string.h>
25
26 #include "../locale/localeinfo.h"
27
28 #ifdef USE_IN_EXTENDED_LOCALE_MODEL
29 # define STRCOLL __strcoll_l
30 #else
31 # define STRCOLL strcoll
32 #endif
33
34 #ifndef USE_IN_EXTENDED_LOCALE_MODEL
35 int
36 STRCOLL (s1, s2)
37 const char *s1;
38 const char *s2;
39 #else
40 int
41 STRCOLL (s1, s2, l)
42 const char *s1;
43 const char *s2;
44 __locale_t l;
45 #endif
46 {
47 #ifdef USE_IN_EXTENDED_LOCALE_MODEL
48 struct locale_data *current = l->__locales[LC_COLLATE];
49 uint_fast32_t nrules = *((uint32_t *) current->values[_NL_ITEM_INDEX (_NL_COLLATE_NRULES)].string);
50 #else
51 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
52 #endif
53 /* We don't assign the following values right away since it might be
54 unnecessary in case there are no rules. */
55 const unsigned char *rulesets;
56 const int32_t *table;
57 const unsigned char *weights;
58 const unsigned char *extra;
59 const int32_t *indirect;
60 uint_fast32_t pass;
61 int result = 0;
62 const unsigned char *us1;
63 const unsigned char *us2;
64 size_t s1len;
65 size_t s2len;
66 int32_t *idx1arr;
67 int32_t *idx2arr;
68 unsigned char *rule1arr;
69 unsigned char *rule2arr;
70 size_t idx1max;
71 size_t idx2max;
72 size_t idx1cnt;
73 size_t idx2cnt;
74 size_t backw1_stop;
75 size_t backw2_stop;
76 size_t backw1;
77 size_t backw2;
78 int val1;
79 int val2;
80 int position;
81 int use_malloc = 0;
82
83 #include "../locale/weight.h"
84
85 if (nrules == 0)
86 return strcmp (s1, s2);
87
88 #ifdef USE_IN_EXTENDED_LOCALE_MODEL
89 rulesets = (const unsigned char *)
90 current->values[_NL_ITEM_INDEX (_NL_COLLATE_RULESETS)].string;
91 table = (const int32_t *)
92 current->values[_NL_ITEM_INDEX (_NL_COLLATE_TABLEMB)].string;
93 weights = (const unsigned char *)
94 current->values[_NL_ITEM_INDEX (_NL_COLLATE_WEIGHTMB)].string;
95 extra = (const unsigned char *)
96 current->values[_NL_ITEM_INDEX (_NL_COLLATE_EXTRAMB)].string;
97 indirect = (const int32_t *)
98 current->values[_NL_ITEM_INDEX (_NL_COLLATE_INDIRECTMB)].string;
99 #else
100 rulesets = (const unsigned char *)
101 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_RULESETS);
102 table = (const int32_t *)
103 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
104 weights = (const unsigned char *)
105 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
106 extra = (const unsigned char *)
107 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
108 indirect = (const int32_t *)
109 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
110 #endif
111
112 /* We need this a few times. */
113 s1len = strlen (s1);
114 s2len = strlen (s2);
115
116 /* We need the elements of the strings as unsigned values since they
117 are used as indeces. */
118 us1 = (const unsigned char *) s1;
119 us2 = (const unsigned char *) s2;
120
121 /* Perform the first pass over the string and while doing this find
122 and store the weights for each character. Since we want this to
123 be as fast as possible we are using `alloca' to store the temporary
124 values. But since there is no limit on the length of the string
125 we have to use `malloc' if the string is too long. We should be
126 very conservative here.
127
128 Please note that the localedef programs makes sure that `position'
129 is not used at the first level. */
130 if (s1len + s2len >= 16384)
131 {
132 idx1arr = (int32_t *) malloc ((s1len + s2len) * (sizeof (int32_t) + 1));
133 idx2arr = &idx1arr[s2len];
134 rule1arr = (unsigned char *) &idx2arr[s2len];
135 rule2arr = &rule1arr[s1len];
136
137 if (idx1arr == NULL)
138 /* No memory. Well, go with the stack then.
139
140 XXX Once this implementation is stable we will handle this
141 differently. Instead of precomputing the indeces we will
142 do this in time. This means, though, that this happens for
143 every pass again. */
144 goto try_stack;
145 use_malloc = 1;
146 }
147 else
148 {
149 try_stack:
150 idx1arr = (int32_t *) alloca (s1len * sizeof (int32_t));
151 idx2arr = (int32_t *) alloca (s2len * sizeof (int32_t));
152 rule1arr = (unsigned char *) alloca (s2len);
153 rule2arr = (unsigned char *) alloca (s2len);
154 }
155
156 idx1cnt = 0;
157 idx2cnt = 0;
158 idx1max = 0;
159 idx2max = 0;
160 backw1_stop = ~0ul;
161 backw2_stop = ~0ul;
162 backw1 = ~0ul;
163 backw2 = ~0ul;
164 position = rulesets[rule1arr[0] * nrules] & sort_position;
165 while (1)
166 {
167 size_t idx1now;
168 size_t idx2now;
169 int seq1len;
170 int seq2len;
171 int i;
172
173 val1 = 0;
174 val2 = 0;
175
176 /* Get the next non-IGNOREd element for string `s1'. */
177 do
178 {
179 ++val1;
180
181 if (backw1_stop != ~0ul)
182 {
183 /* The is something pushed. */
184 if (backw1 == backw1_stop)
185 {
186 /* The last pushed character was handled. Continue
187 with forward characters. */
188 if (idx1cnt < idx1max)
189 idx1now = idx1cnt;
190 else
191 {
192 /* Nothing anymore. The backward sequence ended with
193 the last sequence in the string. */
194 idx1now = ~0ul;
195 break;
196 }
197 }
198 else
199 idx1now = --backw1;
200 }
201 else
202 {
203 backw1_stop = idx1max;
204
205 while (*us1 != '\0')
206 {
207 int32_t tmp = findidx (&us1);
208 rule1arr[idx1max] = tmp >> 24;
209 idx1arr[idx1max] = tmp & 0xffffff;
210 idx1cnt = idx1max++;
211
212 if ((rulesets[rule1arr[idx1cnt] * nrules]
213 & sort_backward) == 0)
214 /* No more backward characters to push. */
215 break;
216 ++idx1cnt;
217 }
218
219 if (backw1_stop >= idx1cnt)
220 {
221 /* No sequence at all or just one. */
222 backw1_stop = ~0ul;
223 if (idx1cnt == idx1max || backw1_stop > idx1cnt)
224 {
225 idx1now = ~0ul;
226 break;
227 }
228 idx1now = idx1cnt;
229 }
230 else
231 /* We pushed backward sequences. */
232 idx1now = backw1 = idx1cnt - 1;
233 }
234 }
235 while (weights[idx1arr[idx1now]] == '\0');
236
237 /* And the same for string `s2'. */
238 do
239 {
240 ++val2;
241
242 if (backw2_stop != ~0ul)
243 {
244 /* The is something pushed. */
245 if (backw2 == backw2_stop)
246 {
247 /* The last pushed character was handled. Continue
248 with forward characters. */
249 if (idx2cnt < idx2max)
250 idx2now = idx2cnt;
251 else
252 {
253 /* Nothing anymore. The backward sequence ended with
254 the last sequence in the string. */
255 idx2now = ~0ul;
256 break;
257 }
258 }
259 else
260 idx2now = --backw2;
261 }
262 else
263 {
264 backw2_stop = idx2max;
265
266 while (*us2 != '\0')
267 {
268 int32_t tmp = findidx (&us2);
269 rule2arr[idx2max] = tmp >> 24;
270 idx2arr[idx2max] = tmp & 0xffffff;
271 idx2cnt = idx2max++;
272
273 if ((rulesets[rule2arr[idx2cnt] * nrules]
274 & sort_backward) == 0)
275 /* No more backward characters to push. */
276 break;
277 ++idx2cnt;
278 }
279
280 if (backw2_stop >= idx2cnt)
281 {
282 /* No sequence at all or just one. */
283 backw2_stop = ~0ul;
284 if (idx2cnt == idx2max || backw2_stop > idx2cnt)
285 {
286 idx2now = ~0ul;
287 break;
288 }
289 idx2now = idx2cnt;
290 }
291 else
292 /* We pushed backward sequences. */
293 idx2now = backw2 = idx2cnt - 1;
294 }
295 }
296 while (weights[idx2arr[idx2now]] == '\0');
297
298 /* See whether any or both strings are empty. */
299 if (idx1now == ~0ul || idx2now == ~0ul)
300 {
301 if (idx1now == idx2now)
302 /* Both ended. So far so good, both strings are equal at the
303 first level. */
304 break;
305
306 /* This means one string is shorter than the other. Find out
307 which one and return an appropriate value. */
308 result = idx1now == ~0ul ? -1 : 1;
309 goto free_and_return;
310 }
311
312 /* Test for position if necessary. */
313 if (position && val1 != val2)
314 {
315 result = val1 - val2;
316 goto free_and_return;
317 }
318
319 /* Compare the two sequences. */
320 seq1len = weights[idx1arr[idx1now]++];
321 seq2len = weights[idx2arr[idx2now]++];
322 for (i = 0; i < seq1len && i < seq2len; ++i)
323 if (weights[idx1arr[idx1now]] != weights[idx2arr[idx2now]])
324 {
325 /* The sequences differ. */
326 result = weights[idx1arr[idx1now]] - weights[idx2arr[idx2now]];
327 goto free_and_return;
328 }
329 else
330 {
331 /* Increment the offsets. */
332 ++idx1arr[idx1now];
333 ++idx2arr[idx2now];
334 }
335
336 result = seq1len - seq2len;
337 if (result != 0)
338 goto free_and_return;
339 }
340
341 /* Now the remaining passes over the weights. We now use the
342 indeces we found before. */
343 for (pass = 1; pass < nrules; ++pass)
344 {
345 /* We assume that if a rule has defined `position' in one section
346 this is true for all of them. */
347 idx1cnt = 0;
348 idx2cnt = 0;
349 backw1_stop = ~0ul;
350 backw2_stop = ~0ul;
351 backw1 = ~0ul;
352 backw2 = ~0ul;
353 position = rulesets[rule1arr[0] * nrules + pass] & sort_position;
354
355 while (1)
356 {
357 size_t idx1now;
358 size_t idx2now;
359 int seq1len;
360 int seq2len;
361 int i;
362
363 val1 = 0;
364 val2 = 0;
365
366 /* Get the next non-IGNOREd element for string `s1'. */
367 do
368 {
369 ++val1;
370
371 if (backw1_stop != ~0ul)
372 {
373 /* The is something pushed. */
374 if (backw1 == backw1_stop)
375 {
376 /* The last pushed character was handled. Continue
377 with forward characters. */
378 if (idx1cnt < idx1max)
379 idx1now = idx1cnt;
380 else
381 {
382 /* Nothing anymore. The backward sequence ended with
383 the last sequence in the string. */
384 idx1now = ~0ul;
385 break;
386 }
387 }
388 else
389 idx1now = --backw1;
390 }
391 else
392 {
393 backw1_stop = idx1cnt;
394
395 while (idx1cnt < idx1max)
396 {
397 if ((rulesets[rule1arr[idx1cnt] * nrules + pass]
398 & sort_backward) == 0)
399 /* No more backward characters to push. */
400 break;
401 ++idx1cnt;
402 }
403
404 if (backw1_stop == idx1cnt)
405 {
406 /* No sequence at all or just one. */
407 backw1_stop = ~0ul;
408 if (idx1cnt == idx1max)
409 {
410 idx1now = ~0ul;
411 break;
412 }
413 idx1now = idx1cnt++;
414 }
415 else
416 /* We pushed backward sequences. */
417 idx1now = backw1 = idx1cnt - 1;
418 }
419 }
420 while (weights[idx1arr[idx1now]] == '\0');
421
422 /* And the same for string `s2'. */
423 do
424 {
425 ++val2;
426
427 if (backw2_stop != ~0ul)
428 {
429 /* The is something pushed. */
430 if (backw2 == backw2_stop)
431 {
432 /* The last pushed character was handled. Continue
433 with forward characters. */
434 if (idx2cnt < idx2max)
435 idx2now = idx2cnt;
436 else
437 {
438 /* Nothing anymore. The backward sequence ended with
439 the last sequence in the string. */
440 idx2now = ~0ul;
441 break;
442 }
443 }
444 else
445 idx2now = --backw2;
446 }
447 else
448 {
449 backw2_stop = idx2cnt;
450
451 while (idx2cnt < idx2max)
452 {
453 if ((rulesets[rule2arr[idx2cnt] * nrules + pass]
454 & sort_backward) == 0)
455 /* No more backward characters to push. */
456 break;
457 ++idx2cnt;
458 }
459
460 if (backw2_stop == idx2cnt)
461 {
462 /* No sequence at all or just one. */
463 backw2_stop = ~0ul;
464 if (idx2cnt == idx2max)
465 {
466 idx2now = ~0ul;
467 break;
468 }
469 idx2now = idx2cnt++;
470 }
471 else
472 /* We pushed backward sequences. */
473 idx2now = backw2 = idx2cnt - 1;
474 }
475 }
476 while (weights[idx2arr[idx2now]] == '\0');
477
478 /* See whether any or both strings are empty. */
479 if (idx1now == ~0ul || idx2now == ~0ul)
480 {
481 if (idx1now == idx2now)
482 /* Both ended. So far so good, both strings are equal at the
483 first level. */
484 break;
485
486 /* This means one string is shorter than the other. Find out
487 which one and return an appropriate value. */
488 result = idx1now == ~0ul ? -1 : 1;
489 goto free_and_return;
490 }
491
492 /* Test for position if necessary. */
493 if (position && val1 != val2)
494 {
495 result = val1 - val2;
496 goto free_and_return;
497 }
498
499 /* Compare the two sequences. */
500 seq1len = weights[idx1arr[idx1now]++];
501 seq2len = weights[idx2arr[idx2now]++];
502 for (i = 0; i < seq1len && i < seq2len; ++i)
503 if (weights[idx1arr[idx1now]] != weights[idx2arr[idx2now]])
504 {
505 /* The sequences differ. */
506 result = weights[idx1arr[idx1now]] - weights[idx2arr[idx2now]];
507 goto free_and_return;
508 }
509 else
510 {
511 /* Increment the offsets. */
512 ++idx1arr[idx1now];
513 ++idx2arr[idx2now];
514 }
515
516 result = seq1len - seq2len;
517 if (result != 0)
518 goto free_and_return;
519 }
520 }
521
522 /* Free the memory if needed. */
523 free_and_return:
524 if (use_malloc)
525 free (idx1arr);
526
527 return result;
528 }