]>
Commit | Line | Data |
---|---|---|
04277e02 | 1 | /* Copyright (C) 1995-2019 Free Software Foundation, Inc. |
c84142e8 | 2 | This file is part of the GNU C Library. |
1ab62b32 | 3 | Written by Ulrich Drepper <drepper@gnu.org>, 1995. |
c84142e8 UD |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or | |
41bdb6e2 AJ |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either | |
8 | version 2.1 of the License, or (at your option) any later version. | |
c84142e8 UD |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, | |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
41bdb6e2 | 13 | Lesser General Public License for more details. |
c84142e8 | 14 | |
41bdb6e2 | 15 | You should have received a copy of the GNU Lesser General Public |
59ba27a6 PE |
16 | License along with the GNU C Library; if not, see |
17 | <http://www.gnu.org/licenses/>. */ | |
c84142e8 | 18 | |
1ab62b32 | 19 | |
ccadf7b5 UD |
20 | #include <assert.h> |
21 | #include <langinfo.h> | |
22 | #include <locale.h> | |
23 | #include <stddef.h> | |
24 | #include <stdint.h> | |
ccadf7b5 | 25 | #include <string.h> |
43d5c02c | 26 | #include <sys/param.h> |
f54d8f73 | 27 | #include <libc-diag.h> |
ccadf7b5 UD |
28 | |
29 | #ifndef STRING_TYPE | |
30 | # define STRING_TYPE char | |
31 | # define USTRING_TYPE unsigned char | |
32 | # define STRCOLL __strcoll_l | |
33 | # define STRCMP strcmp | |
ccadf7b5 UD |
34 | # define WEIGHT_H "../locale/weight.h" |
35 | # define SUFFIX MB | |
36 | # define L(arg) arg | |
37 | #endif | |
38 | ||
39 | #define CONCAT(a,b) CONCAT1(a,b) | |
40 | #define CONCAT1(a,b) a##b | |
41 | ||
42 | #include "../locale/localeinfo.h" | |
8c0ab919 | 43 | #include WEIGHT_H |
ccadf7b5 | 44 | |
1326ba1a SP |
45 | /* Track status while looking for sequences in a string. */ |
46 | typedef struct | |
47 | { | |
48 | int len; /* Length of the current sequence. */ | |
141f3a77 | 49 | size_t val; /* Position of the sequence relative to the |
1326ba1a | 50 | previous non-ignored sequence. */ |
1326ba1a SP |
51 | size_t idxmax; /* Maximum index in sequences. */ |
52 | size_t idxcnt; /* Current count of indices. */ | |
53 | size_t backw; /* Current Backward sequence index. */ | |
54 | size_t backw_stop; /* Index where the backward sequences stop. */ | |
55 | const USTRING_TYPE *us; /* The string. */ | |
141f3a77 SP |
56 | unsigned char rule; /* Saved rule for the first sequence. */ |
57 | int32_t idx; /* Index to weight of the current sequence. */ | |
58 | int32_t save_idx; /* Save looked up index of a forward | |
59 | sequence after the last backward | |
60 | sequence. */ | |
61 | const USTRING_TYPE *back_us; /* Beginning of the backward sequence. */ | |
1326ba1a SP |
62 | } coll_seq; |
63 | ||
1326ba1a | 64 | /* Get next sequence. Traverse the string as required. */ |
5d178c37 | 65 | static __always_inline void |
1326ba1a | 66 | get_next_seq (coll_seq *seq, int nrules, const unsigned char *rulesets, |
33cc770b SP |
67 | const USTRING_TYPE *weights, const int32_t *table, |
68 | const USTRING_TYPE *extra, const int32_t *indirect, | |
69 | int pass) | |
141f3a77 | 70 | { |
141f3a77 SP |
71 | size_t val = seq->val = 0; |
72 | int len = seq->len; | |
73 | size_t backw_stop = seq->backw_stop; | |
74 | size_t backw = seq->backw; | |
75 | size_t idxcnt = seq->idxcnt; | |
76 | size_t idxmax = seq->idxmax; | |
77 | int32_t idx = seq->idx; | |
78 | const USTRING_TYPE *us = seq->us; | |
79 | ||
80 | while (len == 0) | |
81 | { | |
82 | ++val; | |
83 | if (backw_stop != ~0ul) | |
84 | { | |
85 | /* There is something pushed. */ | |
86 | if (backw == backw_stop) | |
87 | { | |
88 | /* The last pushed character was handled. Continue | |
89 | with forward characters. */ | |
90 | if (idxcnt < idxmax) | |
91 | { | |
92 | idx = seq->save_idx; | |
93 | backw_stop = ~0ul; | |
94 | } | |
95 | else | |
96 | { | |
97 | /* Nothing anymore. The backward sequence ended with | |
98 | the last sequence in the string. Note that len is | |
99 | still zero. */ | |
100 | idx = 0; | |
101 | break; | |
102 | } | |
103 | } | |
104 | else | |
105 | { | |
106 | /* XXX Traverse BACKW sequences from the beginning of | |
107 | BACKW_STOP to get the next sequence. Is ther a quicker way | |
108 | to do this? */ | |
109 | size_t i = backw_stop; | |
110 | us = seq->back_us; | |
111 | while (i < backw) | |
112 | { | |
8c0ab919 | 113 | int32_t tmp = findidx (table, indirect, extra, &us, -1); |
141f3a77 SP |
114 | idx = tmp & 0xffffff; |
115 | i++; | |
116 | } | |
117 | --backw; | |
118 | us = seq->us; | |
119 | } | |
120 | } | |
121 | else | |
122 | { | |
123 | backw_stop = idxmax; | |
124 | int32_t prev_idx = idx; | |
125 | ||
126 | while (*us != L('\0')) | |
127 | { | |
8c0ab919 | 128 | int32_t tmp = findidx (table, indirect, extra, &us, -1); |
141f3a77 SP |
129 | unsigned char rule = tmp >> 24; |
130 | prev_idx = idx; | |
131 | idx = tmp & 0xffffff; | |
132 | idxcnt = idxmax++; | |
133 | ||
134 | /* Save the rule for the first sequence. */ | |
135 | if (__glibc_unlikely (idxcnt == 0)) | |
136 | seq->rule = rule; | |
137 | ||
138 | if ((rulesets[rule * nrules + pass] | |
139 | & sort_backward) == 0) | |
140 | /* No more backward characters to push. */ | |
141 | break; | |
142 | ++idxcnt; | |
143 | } | |
144 | ||
145 | if (backw_stop >= idxcnt) | |
146 | { | |
147 | /* No sequence at all or just one. */ | |
148 | if (idxcnt == idxmax || backw_stop > idxcnt) | |
149 | /* Note that len is still zero. */ | |
150 | break; | |
151 | ||
152 | backw_stop = ~0ul; | |
153 | } | |
154 | else | |
155 | { | |
156 | /* We pushed backward sequences. If the stream ended with the | |
157 | backward sequence, then we process the last sequence we | |
158 | found. Otherwise we process the sequence before the last | |
159 | one since the last one was a forward sequence. */ | |
160 | seq->back_us = seq->us; | |
161 | seq->us = us; | |
162 | backw = idxcnt; | |
163 | if (idxmax > idxcnt) | |
164 | { | |
165 | backw--; | |
166 | seq->save_idx = idx; | |
167 | idx = prev_idx; | |
168 | } | |
169 | if (backw > backw_stop) | |
170 | backw--; | |
171 | } | |
172 | } | |
173 | ||
93fe09cb CD |
174 | /* With GCC 5.3 when compiling with -Os the compiler complains |
175 | that idx, taken from seq->idx (seq1 or seq2 from STRCOLL) may | |
176 | be used uninitialized. In general this can't possibly be true | |
177 | since seq1.idx and seq2.idx are initialized to zero in the | |
178 | outer function. Only one case where seq->idx is restored from | |
179 | seq->save_idx might result in an uninitialized idx value, but | |
180 | it is guarded by a sequence of checks against backw_stop which | |
181 | ensures that seq->save_idx was saved to first and contains a | |
182 | valid value. */ | |
183 | DIAG_PUSH_NEEDS_COMMENT; | |
184 | DIAG_IGNORE_Os_NEEDS_COMMENT (5, "-Wmaybe-uninitialized"); | |
141f3a77 | 185 | len = weights[idx++]; |
93fe09cb | 186 | DIAG_POP_NEEDS_COMMENT; |
141f3a77 SP |
187 | /* Skip over indices of previous levels. */ |
188 | for (int i = 0; i < pass; i++) | |
189 | { | |
190 | idx += len; | |
191 | len = weights[idx]; | |
192 | idx++; | |
193 | } | |
194 | } | |
195 | ||
196 | /* Update the structure. */ | |
197 | seq->val = val; | |
198 | seq->len = len; | |
199 | seq->backw_stop = backw_stop; | |
200 | seq->backw = backw; | |
201 | seq->idxcnt = idxcnt; | |
202 | seq->idxmax = idxmax; | |
203 | seq->us = us; | |
204 | seq->idx = idx; | |
205 | } | |
206 | ||
0742aef6 | 207 | /* Compare two sequences. */ |
5d178c37 | 208 | static __always_inline int |
0742aef6 | 209 | do_compare (coll_seq *seq1, coll_seq *seq2, int position, |
33cc770b | 210 | const USTRING_TYPE *weights) |
141f3a77 SP |
211 | { |
212 | int seq1len = seq1->len; | |
213 | int seq2len = seq2->len; | |
214 | size_t val1 = seq1->val; | |
215 | size_t val2 = seq2->val; | |
216 | int idx1 = seq1->idx; | |
217 | int idx2 = seq2->idx; | |
218 | int result = 0; | |
219 | ||
220 | /* Test for position if necessary. */ | |
221 | if (position && val1 != val2) | |
222 | { | |
223 | result = val1 > val2 ? 1 : -1; | |
224 | goto out; | |
225 | } | |
226 | ||
227 | /* Compare the two sequences. */ | |
228 | do | |
229 | { | |
230 | if (weights[idx1] != weights[idx2]) | |
231 | { | |
232 | /* The sequences differ. */ | |
233 | result = weights[idx1] - weights[idx2]; | |
234 | goto out; | |
235 | } | |
236 | ||
237 | /* Increment the offsets. */ | |
238 | ++idx1; | |
239 | ++idx2; | |
240 | ||
241 | --seq1len; | |
242 | --seq2len; | |
243 | } | |
244 | while (seq1len > 0 && seq2len > 0); | |
245 | ||
246 | if (position && seq1len != seq2len) | |
247 | result = seq1len - seq2len; | |
248 | ||
249 | out: | |
250 | seq1->len = seq1len; | |
251 | seq2->len = seq2len; | |
252 | seq1->idx = idx1; | |
253 | seq2->idx = idx2; | |
254 | return result; | |
255 | } | |
256 | ||
ccadf7b5 | 257 | int |
af85385f | 258 | STRCOLL (const STRING_TYPE *s1, const STRING_TYPE *s2, locale_t l) |
ccadf7b5 | 259 | { |
f095bb72 | 260 | struct __locale_data *current = l->__locales[LC_COLLATE]; |
ccadf7b5 UD |
261 | uint_fast32_t nrules = current->values[_NL_ITEM_INDEX (_NL_COLLATE_NRULES)].word; |
262 | /* We don't assign the following values right away since it might be | |
263 | unnecessary in case there are no rules. */ | |
264 | const unsigned char *rulesets; | |
265 | const int32_t *table; | |
266 | const USTRING_TYPE *weights; | |
267 | const USTRING_TYPE *extra; | |
268 | const int32_t *indirect; | |
ccadf7b5 UD |
269 | |
270 | if (nrules == 0) | |
271 | return STRCMP (s1, s2); | |
272 | ||
0742aef6 LH |
273 | /* Catch empty strings. */ |
274 | if (__glibc_unlikely (*s1 == '\0') || __glibc_unlikely (*s2 == '\0')) | |
275 | return (*s1 != '\0') - (*s2 != '\0'); | |
276 | ||
ccadf7b5 UD |
277 | rulesets = (const unsigned char *) |
278 | current->values[_NL_ITEM_INDEX (_NL_COLLATE_RULESETS)].string; | |
279 | table = (const int32_t *) | |
280 | current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_TABLE,SUFFIX))].string; | |
281 | weights = (const USTRING_TYPE *) | |
282 | current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_WEIGHT,SUFFIX))].string; | |
283 | extra = (const USTRING_TYPE *) | |
284 | current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_EXTRA,SUFFIX))].string; | |
285 | indirect = (const int32_t *) | |
286 | current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_INDIRECT,SUFFIX))].string; | |
ccadf7b5 UD |
287 | |
288 | assert (((uintptr_t) table) % __alignof__ (table[0]) == 0); | |
289 | assert (((uintptr_t) weights) % __alignof__ (weights[0]) == 0); | |
290 | assert (((uintptr_t) extra) % __alignof__ (extra[0]) == 0); | |
291 | assert (((uintptr_t) indirect) % __alignof__ (indirect[0]) == 0); | |
292 | ||
0742aef6 | 293 | int result = 0, rule = 0; |
1326ba1a | 294 | |
f54d8f73 JM |
295 | /* With GCC 7 when compiling with -Os the compiler warns that |
296 | seq1.back_us and seq2.back_us might be used uninitialized. | |
297 | Sometimes this warning appears at locations in locale/weightwc.h | |
298 | where the actual use is, but on architectures other than x86_64, | |
299 | x86 and s390x, a warning appears at the definitions of seq1 and | |
300 | seq2. This uninitialized use is impossible for the same reason | |
301 | as described in comments in locale/weightwc.h. */ | |
302 | DIAG_PUSH_NEEDS_COMMENT; | |
303 | DIAG_IGNORE_Os_NEEDS_COMMENT (7, "-Wmaybe-uninitialized"); | |
1326ba1a | 304 | coll_seq seq1, seq2; |
f54d8f73 | 305 | DIAG_POP_NEEDS_COMMENT; |
ef635a29 LH |
306 | seq1.len = 0; |
307 | seq1.idxmax = 0; | |
308 | seq1.rule = 0; | |
309 | seq2.len = 0; | |
310 | seq2.idxmax = 0; | |
1326ba1a | 311 | |
1326ba1a | 312 | for (int pass = 0; pass < nrules; ++pass) |
ccadf7b5 | 313 | { |
1326ba1a | 314 | seq1.idxcnt = 0; |
141f3a77 SP |
315 | seq1.idx = 0; |
316 | seq2.idx = 0; | |
1326ba1a SP |
317 | seq1.backw_stop = ~0ul; |
318 | seq1.backw = ~0ul; | |
319 | seq2.idxcnt = 0; | |
320 | seq2.backw_stop = ~0ul; | |
321 | seq2.backw = ~0ul; | |
322 | ||
141f3a77 SP |
323 | /* We need the elements of the strings as unsigned values since they |
324 | are used as indices. */ | |
325 | seq1.us = (const USTRING_TYPE *) s1; | |
326 | seq2.us = (const USTRING_TYPE *) s2; | |
327 | ||
ccadf7b5 | 328 | /* We assume that if a rule has defined `position' in one section |
0742aef6 LH |
329 | this is true for all of them. Please note that the localedef programs |
330 | makes sure that `position' is not used at the first level. */ | |
331 | ||
141f3a77 | 332 | int position = rulesets[rule * nrules + pass] & sort_position; |
ccadf7b5 UD |
333 | |
334 | while (1) | |
335 | { | |
0742aef6 | 336 | get_next_seq (&seq1, nrules, rulesets, weights, table, |
141f3a77 | 337 | extra, indirect, pass); |
0742aef6 | 338 | get_next_seq (&seq2, nrules, rulesets, weights, table, |
141f3a77 | 339 | extra, indirect, pass); |
ccadf7b5 | 340 | /* See whether any or both strings are empty. */ |
1326ba1a | 341 | if (seq1.len == 0 || seq2.len == 0) |
ccadf7b5 | 342 | { |
1326ba1a | 343 | if (seq1.len == seq2.len) |
0742aef6 LH |
344 | { |
345 | /* Both strings ended and are equal at this level. Do a | |
346 | byte-level comparison to ensure that we don't waste time | |
347 | going through multiple passes for totally equal strings | |
348 | before proceeding to subsequent passes. */ | |
87701a58 | 349 | if (pass == 0 && STRCMP (s1, s2) == 0) |
0742aef6 LH |
350 | return result; |
351 | else | |
352 | break; | |
353 | } | |
ccadf7b5 UD |
354 | |
355 | /* This means one string is shorter than the other. Find out | |
356 | which one and return an appropriate value. */ | |
0742aef6 | 357 | return seq1.len == 0 ? -1 : 1; |
ccadf7b5 UD |
358 | } |
359 | ||
0742aef6 | 360 | result = do_compare (&seq1, &seq2, position, weights); |
1326ba1a | 361 | if (result != 0) |
0742aef6 | 362 | return result; |
ccadf7b5 | 363 | } |
141f3a77 | 364 | |
0742aef6 | 365 | rule = seq1.rule; |
ccadf7b5 UD |
366 | } |
367 | ||
ccadf7b5 UD |
368 | return result; |
369 | } | |
370 | libc_hidden_def (STRCOLL) | |
371 | ||
372 | #ifndef WIDE_CHAR_VERSION | |
1ab62b32 | 373 | weak_alias (__strcoll_l, strcoll_l) |
ccadf7b5 | 374 | #endif |