]> git.ipfire.org Git - thirdparty/glibc.git/blame - string/strxfrm_l.c
Update copyright notices with scripts/update-copyrights
[thirdparty/glibc.git] / string / strxfrm_l.c
CommitLineData
d4697bc9 1/* Copyright (C) 1995-2014 Free Software Foundation, Inc.
c84142e8 2 This file is part of the GNU C Library.
1ab62b32 3 Written by Ulrich Drepper <drepper@gnu.org>, 1995.
c84142e8
UD
4
5 The GNU C Library is free software; you can redistribute it and/or
41bdb6e2
AJ
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
c84142e8
UD
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
41bdb6e2 13 Lesser General Public License for more details.
c84142e8 14
41bdb6e2 15 You should have received a copy of the GNU Lesser General Public
59ba27a6
PE
16 License along with the GNU C Library; if not, see
17 <http://www.gnu.org/licenses/>. */
c84142e8 18
ccadf7b5
UD
19#include <assert.h>
20#include <langinfo.h>
21#include <locale.h>
22#include <stddef.h>
23#include <stdint.h>
24#include <stdlib.h>
25#include <string.h>
26#include <sys/param.h>
1ab62b32 27
ccadf7b5
UD
28#ifndef STRING_TYPE
29# define STRING_TYPE char
30# define USTRING_TYPE unsigned char
31# define STRXFRM __strxfrm_l
32# define STRCMP strcmp
33# define STRLEN strlen
34# define STPNCPY __stpncpy
35# define WEIGHT_H "../locale/weight.h"
36# define SUFFIX MB
37# define L(arg) arg
38#endif
39
40#define CONCAT(a,b) CONCAT1(a,b)
41#define CONCAT1(a,b) a##b
42
43#include "../locale/localeinfo.h"
44
45
46#ifndef WIDE_CHAR_VERSION
47
48/* We need UTF-8 encoding of numbers. */
49static int
50utf8_encode (char *buf, int val)
51{
52 int retval;
53
54 if (val < 0x80)
55 {
56 *buf++ = (char) val;
57 retval = 1;
58 }
59 else
60 {
61 int step;
62
63 for (step = 2; step < 6; ++step)
64 if ((val & (~(uint32_t)0 << (5 * step + 1))) == 0)
65 break;
66 retval = step;
67
68 *buf = (unsigned char) (~0xff >> step);
69 --step;
70 do
71 {
72 buf[step] = 0x80 | (val & 0x3f);
73 val >>= 6;
74 }
75 while (--step > 0);
76 *buf |= val;
77 }
78
79 return retval;
80}
81#endif
82
83
84size_t
85STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l)
86{
f095bb72 87 struct __locale_data *current = l->__locales[LC_COLLATE];
ccadf7b5
UD
88 uint_fast32_t nrules = current->values[_NL_ITEM_INDEX (_NL_COLLATE_NRULES)].word;
89 /* We don't assign the following values right away since it might be
90 unnecessary in case there are no rules. */
91 const unsigned char *rulesets;
92 const int32_t *table;
93 const USTRING_TYPE *weights;
94 const USTRING_TYPE *extra;
95 const int32_t *indirect;
96 uint_fast32_t pass;
97 size_t needed;
2f334ad5 98 size_t last_needed;
ccadf7b5
UD
99 const USTRING_TYPE *usrc;
100 size_t srclen = STRLEN (src);
101 int32_t *idxarr;
102 unsigned char *rulearr;
103 size_t idxmax;
104 size_t idxcnt;
105 int use_malloc;
106
107#include WEIGHT_H
108
109 if (nrules == 0)
110 {
111 if (n != 0)
112 STPNCPY (dest, src, MIN (srclen + 1, n));
113
114 return srclen;
115 }
116
117 rulesets = (const unsigned char *)
118 current->values[_NL_ITEM_INDEX (_NL_COLLATE_RULESETS)].string;
119 table = (const int32_t *)
120 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_TABLE,SUFFIX))].string;
121 weights = (const USTRING_TYPE *)
122 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_WEIGHT,SUFFIX))].string;
123 extra = (const USTRING_TYPE *)
124 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_EXTRA,SUFFIX))].string;
125 indirect = (const int32_t *)
126 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_INDIRECT,SUFFIX))].string;
127 use_malloc = 0;
128
129 assert (((uintptr_t) table) % __alignof__ (table[0]) == 0);
130 assert (((uintptr_t) weights) % __alignof__ (weights[0]) == 0);
131 assert (((uintptr_t) extra) % __alignof__ (extra[0]) == 0);
132 assert (((uintptr_t) indirect) % __alignof__ (indirect[0]) == 0);
133
134 /* Handle an empty string as a special case. */
135 if (srclen == 0)
136 {
137 if (n != 0)
f095bb72 138 *dest = L('\0');
ccadf7b5
UD
139 return 0;
140 }
141
142 /* We need the elements of the string as unsigned values since they
143 are used as indeces. */
144 usrc = (const USTRING_TYPE *) src;
145
146 /* Perform the first pass over the string and while doing this find
147 and store the weights for each character. Since we want this to
148 be as fast as possible we are using `alloca' to store the temporary
149 values. But since there is no limit on the length of the string
150 we have to use `malloc' if the string is too long. We should be
151 very conservative here. */
e0e72284 152 if (! __libc_use_alloca ((srclen + 1) * (sizeof (int32_t) + 1)))
ccadf7b5
UD
153 {
154 idxarr = (int32_t *) malloc ((srclen + 1) * (sizeof (int32_t) + 1));
155 rulearr = (unsigned char *) &idxarr[srclen];
156
157 if (idxarr == NULL)
158 /* No memory. Well, go with the stack then.
159
160 XXX Once this implementation is stable we will handle this
161 differently. Instead of precomputing the indeces we will
162 do this in time. This means, though, that this happens for
163 every pass again. */
164 goto try_stack;
165 use_malloc = 1;
166 }
167 else
168 {
169 try_stack:
170 idxarr = (int32_t *) alloca (srclen * sizeof (int32_t));
171 rulearr = (unsigned char *) alloca (srclen + 1);
172 }
173
174 idxmax = 0;
175 do
176 {
f3a6cc0a 177 int32_t tmp = findidx (&usrc, -1);
ccadf7b5
UD
178 rulearr[idxmax] = tmp >> 24;
179 idxarr[idxmax] = tmp & 0xffffff;
180
181 ++idxmax;
182 }
183 while (*usrc != L('\0'));
184
185 /* This element is only read, the value never used but to determine
186 another value which then is ignored. */
187 rulearr[idxmax] = '\0';
188
189 /* Now the passes over the weights. We now use the indeces we found
190 before. */
191 needed = 0;
192 for (pass = 0; pass < nrules; ++pass)
193 {
194 size_t backw_stop = ~0ul;
195 int rule = rulesets[rulearr[0] * nrules + pass];
196 /* We assume that if a rule has defined `position' in one section
197 this is true for all of them. */
198 int position = rule & sort_position;
199
2f334ad5 200 last_needed = needed;
ccadf7b5
UD
201 if (position == 0)
202 {
203 for (idxcnt = 0; idxcnt < idxmax; ++idxcnt)
204 {
205 if ((rule & sort_forward) != 0)
206 {
207 size_t len;
208
209 if (backw_stop != ~0ul)
210 {
211 /* Handle the pushed elements now. */
212 size_t backw;
213
bc3a45ce 214 for (backw = idxcnt; backw > backw_stop; )
ccadf7b5 215 {
bc3a45ce 216 --backw;
ccadf7b5
UD
217 len = weights[idxarr[backw]++];
218
219 if (needed + len < n)
220 while (len-- > 0)
221 dest[needed++] = weights[idxarr[backw]++];
222 else
223 {
224 /* No more characters fit into the buffer. */
225 needed += len;
226 idxarr[backw] += len;
227 }
228 }
229
230 backw_stop = ~0ul;
231 }
232
233 /* Now handle the forward element. */
234 len = weights[idxarr[idxcnt]++];
235 if (needed + len < n)
236 while (len-- > 0)
237 dest[needed++] = weights[idxarr[idxcnt]++];
238 else
239 {
240 /* No more characters fit into the buffer. */
241 needed += len;
242 idxarr[idxcnt] += len;
243 }
244 }
245 else
246 {
247 /* Remember where the backwards series started. */
248 if (backw_stop == ~0ul)
249 backw_stop = idxcnt;
250 }
251
252 rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
253 }
254
255
256 if (backw_stop != ~0ul)
257 {
258 /* Handle the pushed elements now. */
259 size_t backw;
260
261 backw = idxcnt;
262 while (backw > backw_stop)
263 {
264 size_t len = weights[idxarr[--backw]++];
265
266 if (needed + len < n)
267 while (len-- > 0)
268 dest[needed++] = weights[idxarr[backw]++];
269 else
270 {
271 /* No more characters fit into the buffer. */
272 needed += len;
273 idxarr[backw] += len;
274 }
275 }
276 }
277 }
278 else
279 {
280 int val = 1;
281#ifndef WIDE_CHAR_VERSION
282 char buf[7];
283 size_t buflen;
284#endif
285 size_t i;
286
287 for (idxcnt = 0; idxcnt < idxmax; ++idxcnt)
288 {
289 if ((rule & sort_forward) != 0)
290 {
291 size_t len;
292
293 if (backw_stop != ~0ul)
294 {
295 /* Handle the pushed elements now. */
296 size_t backw;
297
bc3a45ce 298 for (backw = idxcnt; backw > backw_stop; )
ccadf7b5 299 {
bc3a45ce 300 --backw;
ccadf7b5
UD
301 len = weights[idxarr[backw]++];
302 if (len != 0)
303 {
304#ifdef WIDE_CHAR_VERSION
305 if (needed + 1 + len < n)
306 {
307 dest[needed] = val;
308 for (i = 0; i < len; ++i)
309 dest[needed + 1 + i] =
310 weights[idxarr[backw] + i];
311 }
312 needed += 1 + len;
313#else
314 buflen = utf8_encode (buf, val);
315 if (needed + buflen + len < n)
316 {
317 for (i = 0; i < buflen; ++i)
318 dest[needed + i] = buf[i];
319 for (i = 0; i < len; ++i)
320 dest[needed + buflen + i] =
321 weights[idxarr[backw] + i];
322 }
323 needed += buflen + len;
324#endif
325 idxarr[backw] += len;
326 val = 1;
327 }
328 else
329 ++val;
330 }
331
332 backw_stop = ~0ul;
333 }
334
335 /* Now handle the forward element. */
336 len = weights[idxarr[idxcnt]++];
337 if (len != 0)
338 {
339#ifdef WIDE_CHAR_VERSION
340 if (needed + 1+ len < n)
341 {
342 dest[needed] = val;
343 for (i = 0; i < len; ++i)
344 dest[needed + 1 + i] =
345 weights[idxarr[idxcnt] + i];
346 }
347 needed += 1 + len;
348#else
349 buflen = utf8_encode (buf, val);
350 if (needed + buflen + len < n)
351 {
352 for (i = 0; i < buflen; ++i)
353 dest[needed + i] = buf[i];
354 for (i = 0; i < len; ++i)
355 dest[needed + buflen + i] =
356 weights[idxarr[idxcnt] + i];
357 }
358 needed += buflen + len;
359#endif
360 idxarr[idxcnt] += len;
361 val = 1;
362 }
363 else
364 /* Note that we don't have to increment `idxarr[idxcnt]'
365 since the length is zero. */
366 ++val;
367 }
368 else
369 {
370 /* Remember where the backwards series started. */
371 if (backw_stop == ~0ul)
372 backw_stop = idxcnt;
373 }
374
375 rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
376 }
377
378 if (backw_stop != ~0ul)
379 {
380 /* Handle the pushed elements now. */
381 size_t backw;
382
383 backw = idxmax - 1;
384 while (backw > backw_stop)
385 {
386 size_t len = weights[idxarr[--backw]++];
387 if (len != 0)
388 {
389#ifdef WIDE_CHAR_VERSION
390 if (needed + 1 + len < n)
391 {
392 dest[needed] = val;
393 for (i = 0; i < len; ++i)
394 dest[needed + 1 + i] =
395 weights[idxarr[backw] + i];
396 }
397 needed += 1 + len;
398#else
399 buflen = utf8_encode (buf, val);
400 if (needed + buflen + len < n)
401 {
402 for (i = 0; i < buflen; ++i)
403 dest[needed + i] = buf[i];
404 for (i = 0; i < len; ++i)
405 dest[needed + buflen + i] =
406 weights[idxarr[backw] + i];
407 }
408 needed += buflen + len;
409#endif
410 idxarr[backw] += len;
411 val = 1;
412 }
413 else
414 ++val;
415 }
416 }
417 }
418
419 /* Finally store the byte to separate the passes or terminate
420 the string. */
421 if (needed < n)
422 dest[needed] = pass + 1 < nrules ? L('\1') : L('\0');
423 ++needed;
424 }
425
426 /* This is a little optimization: many collation specifications have
427 a `position' rule at the end and if no non-ignored character
428 is found the last \1 byte is immediately followed by a \0 byte
429 signalling this. We can avoid the \1 byte(s). */
2f334ad5 430 if (needed > 2 && needed == last_needed + 1)
ccadf7b5
UD
431 {
432 /* Remove the \1 byte. */
52a33795 433 if (--needed <= n)
2f334ad5 434 dest[needed - 1] = L('\0');
ccadf7b5
UD
435 }
436
437 /* Free the memory if needed. */
438 if (use_malloc)
439 free (idxarr);
440
441 /* Return the number of bytes/words we need, but don't count the NUL
442 byte/word at the end. */
443 return needed - 1;
444}
445libc_hidden_def (STRXFRM)
446
447#ifndef WIDE_CHAR_VERSION
1ab62b32 448weak_alias (__strxfrm_l, strxfrm_l)
ccadf7b5 449#endif