string/strxfrm.c

   1 /* Copyright (C) 1995, 1996, 1997, 1998, 1999 Free Software Foundation, Inc.
   2    This file is part of the GNU C Library.
   3    Written by Ulrich Drepper <drepper@gnu.ai.mit.edu>, 1995.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Library General Public License as
   7    published by the Free Software Foundation; either version 2 of the
   8    License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Library General Public License for more details.
  14
  15    You should have received a copy of the GNU Library General Public
  16    License along with the GNU C Library; see the file COPYING.LIB.  If not,
  17    write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  18    Boston, MA 02111-1307, USA.  */
  19
  20 #include <stddef.h>
  21 #include <stdlib.h>
  22 #include <string.h>
  23
  24 #ifndef WIDE_VERSION
  25 # define STRING_TYPE char
  26 # define USTRING_TYPE unsigned char
  27 # define L_(Ch) Ch
  28 # ifdef USE_IN_EXTENDED_LOCALE_MODEL
  29 #  define STRXFRM __strxfrm_l
  30 # else
  31 #  define STRXFRM strxfrm
  32 # endif
  33 # define STRLEN strlen
  34 # define STPNCPY __stpncpy
  35 #endif
  36
  37 #ifndef USE_IN_EXTENDED_LOCALE_MODEL
  38 size_t
  39 STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n)
  40 #else
  41 size_t
  42 STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l)
  43 #endif
  44 {
  45       if (n != 0)
  46         STPNCPY (dest, src, n);
  47
  48       return STRLEN (src);
  49 }
  50
  51 #if 0
  52 /* Include the shared helper functions.  `strxfrm'/`wcsxfrm' also use
  53    these functions.  */
  54 #include "../locale/weight.h"
  55
  56
  57 #ifndef WIDE_VERSION
  58 /* Write 32 bit value UTF-8 encoded but only if enough space is left.  */
  59 static __inline size_t
  60 print_val (u_int32_t value, char *dest, size_t max, size_t act)
  61 {
  62   char tmp[6];
  63   int idx = 0;
  64
  65   if (value < 0x80)
  66     tmp[idx++] = (char) value;
  67   else
  68     {
  69       tmp[idx++] = '\x80' + (char) (value & 0x3f);
  70       value >>= 6;
  71
  72       if (value < 0x20)
  73         tmp[idx++] = '\xc0' + (char) value;
  74       else
  75         {
  76           tmp[idx++] = '\x80' + (char) (value & 0x3f);
  77           value >>= 6;
  78
  79           if (value < 0x10)
  80             tmp[idx++] = '\xe0' + (char) value;
  81           else
  82             {
  83               tmp[idx++] = '\x80' + (char) (value & 0x3f);
  84               value >>= 6;
  85
  86               if (value < 0x08)
  87                 tmp[idx++] = '\xf0' + (char) value;
  88               else
  89                 {
  90                   tmp[idx++] = '\x80' + (char) (value & 0x3f);
  91                   value >>= 6;
  92
  93                   if (value < 0x04)
  94                     tmp[idx++] = '\xf8' + (char) value;
  95                   else
  96                     {
  97                       tmp[idx++] = '\x80' + (char) (value & 0x3f);
  98                       tmp[idx++] = '\xfc' + (char) (value >> 6);
  99                     }
 100                 }
 101             }
 102         }
 103     }
 104
 105   while (idx-- > 0)
 106     {
 107       if (act < max)
 108         dest[act] = tmp[idx];
 109       ++act;
 110     }
 111
 112   return act;
 113 }
 114 #else
 115 static __inline size_t
 116 print_val (u_int32_t value, wchar_t *dest, size_t max, size_t act)
 117 {
 118   /* We cannot really assume wchar_t is 32 bits wide.  But it is for
 119      GCC and so we don't do much optimization for the other case.  */
 120   if (sizeof (wchar_t) == 4)
 121     {
 122       if (act < max)
 123         dest[act] = (wchar_t) value;
 124       ++act;
 125     }
 126   else
 127     {
 128       wchar_t tmp[3];
 129       size_t idx = 0;
 130
 131       if (value < 0x8000)
 132         tmp[idx++] = (wchar_t) act;
 133       else
 134         {
 135           tmp[idx++] = (wchar_t) (0x8000 + (value & 0x3fff));
 136           value >>= 14;
 137           if (value < 0x2000)
 138             tmp[idx++] = (wchar_t) (0xc000 + value);
 139           else
 140             {
 141               tmp[idx++] = (wchar_t) (0x8000 + (value & 0x3fff));
 142               value >>= 14;
 143               tmp[idx++] = (wchar_t) (0xe000 + value);
 144             }
 145         }
 146       while (idx-- > 0)
 147         {
 148           if (act < max)
 149             dest[act] = tmp[idx];
 150           ++act;
 151         }
 152     }
 153   return act;
 154 }
 155 #endif
 156
 157
 158 /* Transform SRC into a form such that the result of strcmp
 159    on two strings that have been transformed by strxfrm is
 160    the same as the result of strcoll on the two strings before
 161    their transformation.  The transformed string is put in at
 162    most N characters of DEST and its length is returned.  */
 163 #ifndef USE_IN_EXTENDED_LOCALE_MODEL
 164 size_t
 165 STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n)
 166 #else
 167 size_t
 168 STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l)
 169 #endif
 170 {
 171 #ifdef USE_IN_EXTENDED_LOCALE_MODEL
 172   struct locale_data *current = l->__locales[LC_COLLATE];
 173 # if BYTE_ORDER == BIG_ENDIAN
 174   const u_int32_t *collate_table = (const u_int32_t *)
 175     current->values[_NL_ITEM_INDEX (_NL_COLLATE_TABLE_EB)].string;
 176   const u_int32_t *collate_extra = (const u_int32_t *)
 177     current->values[_NL_ITEM_INDEX (_NL_COLLATE_EXTRA_EB)].string;
 178 # elif BYTE_ORDER == LITTLE_ENDIAN
 179   const u_int32_t *collate_table = (const u_int32_t *)
 180     current->values[_NL_ITEM_INDEX (_NL_COLLATE_TABLE_EL)].string;
 181   const u_int32_t *collate_extra = (const u_int32_t *)
 182     current->values[_NL_ITEM_INDEX (_NL_COLLATE_EXTRA_EL)].string;
 183 # else
 184 #  error bizarre byte order
 185 # endif
 186 #endif
 187   weight_t *forw = NULL;
 188   weight_t *backw = NULL;
 189   size_t pass;
 190   size_t written;
 191
 192   /* If the current locale does not specify locale data we use normal
 193      8-bit string comparison.  */
 194   if (collate_nrules == 0)
 195     {
 196       if (n != 0)
 197         STPNCPY (dest, src, n);
 198
 199       return STRLEN (src);
 200     }
 201
 202   /* Handle an empty string as a special case.  */
 203   if (*src == '\0')
 204     {
 205       if (n != 0)
 206         *dest = '\0';
 207       return 1;
 208     }
 209
 210   /* Get full information about the string.  This means we get
 211      information for all passes in a special data structure.  */
 212   get_string (src, forw, backw);
 213
 214   /* Now we have all the information.  In at most the given number of
 215      passes we can finally decide about the order.  */
 216   written = 0;
 217   for (pass = 0; pass < collate_nrules; ++pass)
 218     {
 219       int forward = (collate_rules[pass] & sort_forward) != 0;
 220       const weight_t *run = forward ? forw : backw;
 221       int idx = forward ? 0 : run->data[pass].number - 1;
 222
 223       while (1)
 224         {
 225           int ignore = 0;
 226           u_int32_t w = 0;
 227
 228           /* Here we have to check for IGNORE entries.  If these are
 229              found we count them and go on with he next value.  */
 230           while (run != NULL
 231                  && ((w = run->data[pass].value[idx])
 232                      == (u_int32_t) IGNORE_CHAR))
 233             {
 234               ++ignore;
 235               if (forward
 236                   ? ++idx >= run->data[pass].number
 237                   : --idx < 0)
 238                 {
 239                   weight_t *nextp = forward ? run->next : run->prev;
 240                   if (nextp == NULL)
 241                     {
 242                       w = 0;
 243                       /* No more non-INGOREd elements means lowest
 244                          possible value.  */
 245                       ignore = -1;
 246                     }
 247                   else
 248                     idx = forward ? 0 : nextp->data[pass].number - 1;
 249                   run = nextp;
 250                 }
 251             }
 252
 253           /* Stop if all characters are processed.  */
 254           if (run == NULL)
 255             break;
 256
 257           /* Now we have information of the number of ignored weights
 258              and the value of the next weight.  We have to add 2
 259              because 0 means EOS and 1 is the intermediate string end.  */
 260           if ((collate_rules[pass] & sort_position) != 0)
 261             written = print_val (ignore + 2, dest, n, written);
 262
 263           if (w != 0)
 264             written = print_val (w, dest, n, written);
 265
 266           /* We have to increment the index counters.  */
 267           if (forward)
 268             {
 269               if (++idx >= run->data[pass].number)
 270                 {
 271                   run = run->next;
 272                   idx = 0;
 273                 }
 274             }
 275           else
 276             {
 277               if (--idx < 0)
 278                 {
 279                   run = run->prev;
 280                   if (run != NULL)
 281                     idx = run->data[pass].number - 1;
 282                 }
 283             }
 284         }
 285
 286       /* Write marker for end of word.  */
 287       if (pass + 1 < collate_nrules)
 288         written = print_val (1, dest, n, written);
 289     }
 290
 291   /* Terminate string.  */
 292   if (written < n)
 293     dest[written] = L_('\0');
 294
 295   /* Return length without counting the terminating '\0'.  */
 296   return written;
 297 }
 298 #endif