]>
git.ipfire.org Git - thirdparty/glibc.git/blob - string/strxfrm.c
1 /* Copyright (C) 1995-1999, 2000 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Written by Ulrich Drepper <drepper@cygnus.com>, 1995.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Library General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Library General Public License for more details.
15 You should have received a copy of the GNU Library General Public
16 License along with the GNU C Library; see the file COPYING.LIB. If not,
17 write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18 Boston, MA 02111-1307, USA. */
26 #include <sys/param.h>
29 # define STRING_TYPE char
30 # define USTRING_TYPE unsigned char
31 # ifdef USE_IN_EXTENDED_LOCALE_MODEL
32 # define STRXFRM __strxfrm_l
34 # define STRXFRM strxfrm
36 # define STRCMP strcmp
37 # define STRLEN strlen
38 # define STPNCPY __stpncpy
39 # define WEIGHT_H "../locale/weight.h"
44 #define CONCAT(a,b) CONCAT1(a,b)
45 #define CONCAT1(a,b) a##b
47 #include "../locale/localeinfo.h"
50 #ifndef WIDE_CHAR_VERSION
52 /* We need UTF-8 encoding of numbers. */
54 utf8_encode (char *buf
, int val
)
67 for (step
= 2; step
< 6; ++step
)
68 if ((val
& (~(uint32_t)0 << (5 * step
+ 1))) == 0)
72 *buf
= (unsigned char) (~0xff >> step
);
76 buf
[step
] = 0x80 | (val
& 0x3f);
88 #ifndef USE_IN_EXTENDED_LOCALE_MODEL
90 STRXFRM (STRING_TYPE
*dest
, const STRING_TYPE
*src
, size_t n
)
93 STRXFRM (STRING_TYPE
*dest
, const STRING_TYPE
*src
, size_t n
, __locale_t l
)
96 #ifdef USE_IN_EXTENDED_LOCALE_MODEL
97 struct locale_data
*current
= l
->__locales
[LC_COLLATE
];
98 uint_fast32_t nrules
= *((const uint32_t *) current
->values
[_NL_ITEM_INDEX (_NL_COLLATE_NRULES
)].string
);
100 uint32_t nrules
= _NL_CURRENT_WORD (LC_COLLATE
, _NL_COLLATE_NRULES
);
102 /* We don't assign the following values right away since it might be
103 unnecessary in case there are no rules. */
104 const unsigned char *rulesets
;
105 const int32_t *table
;
106 const USTRING_TYPE
*weights
;
107 const USTRING_TYPE
*extra
;
108 const int32_t *indirect
;
111 const USTRING_TYPE
*usrc
;
112 size_t srclen
= STRLEN (src
);
114 unsigned char *rulearr
;
124 STPNCPY (dest
, src
, MIN (srclen
+ 1, n
));
129 #ifdef USE_IN_EXTENDED_LOCALE_MODEL
130 rulesets
= (const unsigned char *)
131 current
->values
[_NL_ITEM_INDEX (_NL_COLLATE_RULESETS
)].string
;
132 table
= (const int32_t *)
133 current
->values
[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_TABLE
,SUFFIX
))].string
;
134 weights
= (const USTRING_TYPE
*)
135 current
->values
[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_WEIGHT
,SUFFIX
))].string
;
136 extra
= (const USTRING_TYPE
*)
137 current
->values
[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_EXTRA
,SUFFIX
))].string
;
138 indirect
= (const int32_t *)
139 current
->values
[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_INDIRECT
,SUFFIX
))].string
;
141 rulesets
= (const unsigned char *)
142 _NL_CURRENT (LC_COLLATE
, _NL_COLLATE_RULESETS
);
143 table
= (const int32_t *)
144 _NL_CURRENT (LC_COLLATE
, CONCAT(_NL_COLLATE_TABLE
,SUFFIX
));
145 weights
= (const USTRING_TYPE
*)
146 _NL_CURRENT (LC_COLLATE
, CONCAT(_NL_COLLATE_WEIGHT
,SUFFIX
));
147 extra
= (const USTRING_TYPE
*)
148 _NL_CURRENT (LC_COLLATE
, CONCAT(_NL_COLLATE_EXTRA
,SUFFIX
));
149 indirect
= (const int32_t *)
150 _NL_CURRENT (LC_COLLATE
, CONCAT(_NL_COLLATE_INDIRECT
,SUFFIX
));
154 assert (((uintptr_t) table
) % __alignof__ (table
[0]) == 0);
155 assert (((uintptr_t) weights
) % __alignof__ (weights
[0]) == 0);
156 assert (((uintptr_t) extra
) % __alignof__ (extra
[0]) == 0);
157 assert (((uintptr_t) indirect
) % __alignof__ (indirect
[0]) == 0);
159 /* Handle an empty string as a special case. */
167 /* We need the elements of the string as unsigned values since they
168 are used as indices. */
169 usrc
= (const USTRING_TYPE
*) src
;
171 /* Perform the first pass over the string and while doing this find
172 and store the weights for each character. Since we want this to
173 be as fast as possible we are using `alloca' to store the temporary
174 values. But since there is no limit on the length of the string
175 we have to use `malloc' if the string is too long. We should be
176 very conservative here. */
179 idxarr
= (int32_t *) malloc (srclen
* (sizeof (int32_t) + 1));
180 rulearr
= (unsigned char *) &idxarr
[srclen
];
183 /* No memory. Well, go with the stack then.
185 XXX Once this implementation is stable we will handle this
186 differently. Instead of precomputing the indices we will
187 do this in time. This means, though, that this happens for
195 idxarr
= (int32_t *) alloca (srclen
* sizeof (int32_t));
196 rulearr
= (unsigned char *) alloca (srclen
);
202 int32_t tmp
= findidx (&usrc
);
203 rulearr
[idxmax
] = tmp
>> 24;
204 idxarr
[idxmax
] = tmp
& 0xffffff;
208 while (*usrc
!= L('\0'));
210 /* Now the passes over the weights. We now use the indices we found
213 for (pass
= 0; pass
< nrules
; ++pass
)
215 size_t backw_stop
= ~0ul;
216 int rule
= rulesets
[rulearr
[0] * nrules
+ pass
];
217 /* We assume that if a rule has defined `position' in one section
218 this is true for all of them. */
219 int position
= rule
& sort_position
;
223 for (idxcnt
= 0; idxcnt
< idxmax
; ++idxcnt
)
225 if ((rule
& sort_forward
) != 0)
229 if (backw_stop
!= ~0ul)
231 /* Handle the pushed elements now. */
234 for (backw
= idxcnt
- 1; backw
>= backw_stop
; --backw
)
236 len
= weights
[idxarr
[backw
]++];
238 if (needed
+ len
< n
)
240 dest
[needed
++] = weights
[idxarr
[backw
]++];
243 /* No more characters fit into the buffer. */
245 idxarr
[backw
] += len
;
252 /* Now handle the forward element. */
253 len
= weights
[idxarr
[idxcnt
]++];
254 if (needed
+ len
< n
)
256 dest
[needed
++] = weights
[idxarr
[idxcnt
]++];
259 /* No more characters fit into the buffer. */
261 idxarr
[idxcnt
] += len
;
266 /* Remember where the backwards series started. */
267 if (backw_stop
== ~0ul)
271 rule
= rulesets
[rulearr
[idxcnt
+ 1] * nrules
+ pass
];
275 if (backw_stop
!= ~0ul)
277 /* Handle the pushed elements now. */
281 while (backw
> backw_stop
)
283 size_t len
= weights
[idxarr
[--backw
]++];
285 if (needed
+ len
< n
)
287 dest
[needed
++] = weights
[idxarr
[backw
]++];
290 /* No more characters fit into the buffer. */
292 idxarr
[backw
] += len
;
300 #ifndef WIDE_CHAR_VERSION
306 for (idxcnt
= 0; idxcnt
< idxmax
; ++idxcnt
)
308 if ((rule
& sort_forward
) != 0)
312 if (backw_stop
!= ~0ul)
314 /* Handle the pushed elements now. */
317 for (backw
= idxcnt
- 1; backw
>= backw_stop
; --backw
)
319 len
= weights
[idxarr
[backw
]++];
322 #ifdef WIDE_CHAR_VERSION
323 if (needed
+ 1 + len
< n
)
326 for (i
= 0; i
< len
; ++i
)
327 dest
[needed
+ 1 + i
] =
328 weights
[idxarr
[backw
] + i
];
332 buflen
= utf8_encode (buf
, val
);
333 if (needed
+ buflen
+ len
< n
)
335 for (i
= 0; i
< buflen
; ++i
)
336 dest
[needed
+ i
] = buf
[i
];
337 for (i
= 0; i
< len
; ++i
)
338 dest
[needed
+ buflen
+ i
] =
339 weights
[idxarr
[backw
] + i
];
341 needed
+= buflen
+ len
;
343 idxarr
[backw
] += len
;
353 /* Now handle the forward element. */
354 len
= weights
[idxarr
[idxcnt
]++];
357 #ifdef WIDE_CHAR_VERSION
358 if (needed
+ 1+ len
< n
)
361 for (i
= 0; i
< len
; ++i
)
362 dest
[needed
+ 1 + i
] =
363 weights
[idxarr
[idxcnt
] + i
];
367 buflen
= utf8_encode (buf
, val
);
368 if (needed
+ buflen
+ len
< n
)
370 for (i
= 0; i
< buflen
; ++i
)
371 dest
[needed
+ i
] = buf
[i
];
372 for (i
= 0; i
< len
; ++i
)
373 dest
[needed
+ buflen
+ i
] =
374 weights
[idxarr
[idxcnt
] + i
];
376 needed
+= buflen
+ len
;
378 idxarr
[idxcnt
] += len
;
382 /* Note that we don't have to increment `idxarr[idxcnt]'
383 since the length is zero. */
388 /* Remember where the backwards series started. */
389 if (backw_stop
== ~0ul)
393 rule
= rulesets
[rulearr
[idxcnt
+ 1] * nrules
+ pass
];
396 if (backw_stop
!= ~0ul)
398 /* Handle the pushed elements now. */
402 while (backw
> backw_stop
)
404 size_t len
= weights
[idxarr
[--backw
]++];
407 #ifdef WIDE_CHAR_VERSION
408 if (needed
+ 1 + len
< n
)
411 for (i
= 0; i
< len
; ++i
)
412 dest
[needed
+ 1 + i
] =
413 weights
[idxarr
[backw
] + i
];
417 buflen
= utf8_encode (buf
, val
);
418 if (needed
+ buflen
+ len
< n
)
420 for (i
= 0; i
< buflen
; ++i
)
421 dest
[needed
+ i
] = buf
[i
];
422 for (i
= 0; i
< len
; ++i
)
423 dest
[needed
+ buflen
+ i
] =
424 weights
[idxarr
[backw
] + i
];
426 needed
+= buflen
+ len
;
428 idxarr
[backw
] += len
;
437 /* Finally store the byte to separate the passes or terminate
440 dest
[needed
] = pass
+ 1 < nrules
? L('\1') : L('\0');
444 /* This is a little optimization: many collation specifications have
445 a `position' rule at the end and if no non-ignored character
446 is found the last \1 byte is immediately followed by a \0 byte
447 signalling this. We can avoid the \1 byte(s). */
448 if (needed
<= n
&& needed
> 2 && dest
[needed
- 2] == L('\1'))
450 /* Remove the \1 byte. */
452 dest
[needed
- 1] = L('\0');
455 /* Free the memory if needed. */
459 /* Return the number of bytes/words we need, but don't count the NUL
460 byte/word at the end. */