string/strxfrm.c

   1 /* Copyright (C) 1995-1999, 2000 Free Software Foundation, Inc.
   2    This file is part of the GNU C Library.
   3    Written by Ulrich Drepper <drepper@cygnus.com>, 1995.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Library General Public License as
   7    published by the Free Software Foundation; either version 2 of the
   8    License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Library General Public License for more details.
  14
  15    You should have received a copy of the GNU Library General Public
  16    License along with the GNU C Library; see the file COPYING.LIB.  If not,
  17    write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  18    Boston, MA 02111-1307, USA.  */
  19
  20 #include <assert.h>
  21 #include <langinfo.h>
  22 #include <stddef.h>
  23 #include <stdint.h>
  24 #include <stdlib.h>
  25 #include <string.h>
  26 #include <sys/param.h>
  27
  28 #ifndef STRING_TYPE
  29 # define STRING_TYPE char
  30 # define USTRING_TYPE unsigned char
  31 # ifdef USE_IN_EXTENDED_LOCALE_MODEL
  32 #  define STRXFRM __strxfrm_l
  33 # else
  34 #  define STRXFRM strxfrm
  35 # endif
  36 # define STRCMP strcmp
  37 # define STRLEN strlen
  38 # define STPNCPY __stpncpy
  39 # define WEIGHT_H "../locale/weight.h"
  40 # define SUFFIX MB
  41 # define L(arg) arg
  42 #endif
  43
  44 #define CONCAT(a,b) CONCAT1(a,b)
  45 #define CONCAT1(a,b) a##b
  46
  47 #include "../locale/localeinfo.h"
  48
  49
  50 #ifndef WIDE_CHAR_VERSION
  51
  52 /* We need UTF-8 encoding of numbers.  */
  53 static inline int
  54 utf8_encode (char *buf, int val)
  55 {
  56   int retval;
  57
  58   if (val < 0x80)
  59     {
  60       *buf++ = (char) val;
  61       retval = 1;
  62     }
  63   else
  64     {
  65       int step;
  66
  67       for (step = 2; step < 6; ++step)
  68         if ((val & (~(uint32_t)0 << (5 * step + 1))) == 0)
  69           break;
  70       retval = step;
  71
  72       *buf = (unsigned char) (~0xff >> step);
  73       --step;
  74       do
  75         {
  76           buf[step] = 0x80 | (val & 0x3f);
  77           val >>= 6;
  78         }
  79       while (--step > 0);
  80       *buf |= val;
  81     }
  82
  83   return retval;
  84 }
  85 #endif
  86
  87
  88 #ifndef USE_IN_EXTENDED_LOCALE_MODEL
  89 size_t
  90 STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n)
  91 #else
  92 size_t
  93 STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l)
  94 #endif
  95 {
  96 #ifdef USE_IN_EXTENDED_LOCALE_MODEL
  97   struct locale_data *current = l->__locales[LC_COLLATE];
  98   uint_fast32_t nrules = *((const uint32_t *) current->values[_NL_ITEM_INDEX (_NL_COLLATE_NRULES)].string);
  99 #else
 100   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
 101 #endif
 102   /* We don't assign the following values right away since it might be
 103      unnecessary in case there are no rules.  */
 104   const unsigned char *rulesets;
 105   const int32_t *table;
 106   const USTRING_TYPE *weights;
 107   const USTRING_TYPE *extra;
 108   const int32_t *indirect;
 109   uint_fast32_t pass;
 110   size_t needed;
 111   const USTRING_TYPE *usrc;
 112   size_t srclen = STRLEN (src);
 113   int32_t *idxarr;
 114   unsigned char *rulearr;
 115   size_t idxmax;
 116   size_t idxcnt;
 117   int use_malloc;
 118
 119 #include WEIGHT_H
 120
 121   if (nrules == 0)
 122     {
 123       if (n != 0)
 124         STPNCPY (dest, src, MIN (srclen + 1, n));
 125
 126       return srclen;
 127     }
 128
 129 #ifdef USE_IN_EXTENDED_LOCALE_MODEL
 130   rulesets = (const unsigned char *)
 131     current->values[_NL_ITEM_INDEX (_NL_COLLATE_RULESETS)].string;
 132   table = (const int32_t *)
 133     current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_TABLE,SUFFIX))].string;
 134   weights = (const USTRING_TYPE *)
 135     current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_WEIGHT,SUFFIX))].string;
 136   extra = (const USTRING_TYPE *)
 137     current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_EXTRA,SUFFIX))].string;
 138   indirect = (const int32_t *)
 139     current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_INDIRECT,SUFFIX))].string;
 140 #else
 141   rulesets = (const unsigned char *)
 142     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_RULESETS);
 143   table = (const int32_t *)
 144     _NL_CURRENT (LC_COLLATE, CONCAT(_NL_COLLATE_TABLE,SUFFIX));
 145   weights = (const USTRING_TYPE *)
 146     _NL_CURRENT (LC_COLLATE, CONCAT(_NL_COLLATE_WEIGHT,SUFFIX));
 147   extra = (const USTRING_TYPE *)
 148     _NL_CURRENT (LC_COLLATE, CONCAT(_NL_COLLATE_EXTRA,SUFFIX));
 149   indirect = (const int32_t *)
 150     _NL_CURRENT (LC_COLLATE, CONCAT(_NL_COLLATE_INDIRECT,SUFFIX));
 151 #endif
 152   use_malloc = 0;
 153
 154   assert (((uintptr_t) table) % __alignof__ (table[0]) == 0);
 155   assert (((uintptr_t) weights) % __alignof__ (weights[0]) == 0);
 156   assert (((uintptr_t) extra) % __alignof__ (extra[0]) == 0);
 157   assert (((uintptr_t) indirect) % __alignof__ (indirect[0]) == 0);
 158
 159   /* Handle an empty string as a special case.  */
 160   if (srclen == 0)
 161     {
 162       if (n != 0)
 163         *dest = L('\0');
 164       return 1;
 165     }
 166
 167   /* We need the elements of the string as unsigned values since they
 168      are used as indeces.  */
 169   usrc = (const USTRING_TYPE *) src;
 170
 171   /* Perform the first pass over the string and while doing this find
 172      and store the weights for each character.  Since we want this to
 173      be as fast as possible we are using `alloca' to store the temporary
 174      values.  But since there is no limit on the length of the string
 175      we have to use `malloc' if the string is too long.  We should be
 176      very conservative here.  */
 177   if (srclen >= 16384)
 178     {
 179       idxarr = (int32_t *) malloc (srclen * (sizeof (int32_t) + 1));
 180       rulearr = (unsigned char *) &idxarr[srclen];
 181
 182       if (idxarr == NULL)
 183         /* No memory.  Well, go with the stack then.
 184
 185            XXX Once this implementation is stable we will handle this
 186            differently.  Instead of precomputing the indeces we will
 187            do this in time.  This means, though, that this happens for
 188            every pass again.  */
 189         goto try_stack;
 190       use_malloc = 1;
 191     }
 192   else
 193     {
 194     try_stack:
 195       idxarr = (int32_t *) alloca (srclen * sizeof (int32_t));
 196       rulearr = (unsigned char *) alloca (srclen);
 197     }
 198
 199   idxmax = 0;
 200   do
 201     {
 202       int32_t tmp = findidx (&usrc);
 203       rulearr[idxmax] = tmp >> 24;
 204       idxarr[idxmax] = tmp & 0xffffff;
 205
 206       ++idxmax;
 207     }
 208   while (*usrc != L('\0'));
 209
 210   /* Now the passes over the weights.  We now use the indeces we found
 211      before.  */
 212   needed = 0;
 213   for (pass = 0; pass < nrules; ++pass)
 214     {
 215       size_t backw_stop = ~0ul;
 216       int rule = rulesets[rulearr[0] * nrules + pass];
 217       /* We assume that if a rule has defined `position' in one section
 218          this is true for all of them.  */
 219       int position = rule & sort_position;
 220
 221       if (position == 0)
 222         {
 223           for (idxcnt = 0; idxcnt < idxmax; ++idxcnt)
 224             {
 225               if ((rule & sort_forward) != 0)
 226                 {
 227                   size_t len;
 228
 229                   if (backw_stop != ~0ul)
 230                     {
 231                       /* Handle the pushed elements now.  */
 232                       size_t backw;
 233
 234                       for (backw = idxcnt - 1; backw >= backw_stop; --backw)
 235                         {
 236                           len = weights[idxarr[backw]++];
 237
 238                           if (needed + len < n)
 239                             while (len-- > 0)
 240                               dest[needed++] = weights[idxarr[backw]++];
 241                           else
 242                             {
 243                                 /* No more characters fit into the buffer.  */
 244                               needed += len;
 245                               idxarr[backw] += len;
 246                             }
 247                         }
 248
 249                       backw_stop = ~0ul;
 250                     }
 251
 252                   /* Now handle the forward element.  */
 253                   len = weights[idxarr[idxcnt]++];
 254                   if (needed + len < n)
 255                     while (len-- > 0)
 256                       dest[needed++] = weights[idxarr[idxcnt]++];
 257                   else
 258                     {
 259                       /* No more characters fit into the buffer.  */
 260                       needed += len;
 261                       idxarr[idxcnt] += len;
 262                     }
 263                 }
 264               else
 265                 {
 266                   /* Remember where the backwards series started.  */
 267                   if (backw_stop == ~0ul)
 268                     backw_stop = idxcnt;
 269                 }
 270
 271               rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
 272             }
 273
 274
 275           if (backw_stop != ~0ul)
 276             {
 277               /* Handle the pushed elements now.  */
 278               size_t backw;
 279
 280               backw = idxcnt;
 281               while (backw > backw_stop)
 282                 {
 283                   size_t len = weights[idxarr[--backw]++];
 284
 285                   if (needed + len < n)
 286                     while (len-- > 0)
 287                       dest[needed++] = weights[idxarr[backw]++];
 288                   else
 289                     {
 290                       /* No more characters fit into the buffer.  */
 291                       needed += len;
 292                       idxarr[backw] += len;
 293                     }
 294                 }
 295             }
 296         }
 297       else
 298         {
 299           int val = 1;
 300 #ifndef WIDE_CHAR_VERSION
 301           char buf[7];
 302           size_t buflen;
 303 #endif
 304           size_t i;
 305
 306           for (idxcnt = 0; idxcnt < idxmax; ++idxcnt)
 307             {
 308               if ((rule & sort_forward) != 0)
 309                 {
 310                   size_t len;
 311
 312                   if (backw_stop != ~0ul)
 313                     {
 314                      /* Handle the pushed elements now.  */
 315                       size_t backw;
 316
 317                       for (backw = idxcnt - 1; backw >= backw_stop; --backw)
 318                         {
 319                           len = weights[idxarr[backw]++];
 320                           if (len != 0)
 321                             {
 322 #ifdef WIDE_CHAR_VERSION
 323                               if (needed + 1 + len < n)
 324                                 {
 325                                   dest[needed] = val;
 326                                   for (i = 0; i < len; ++i)
 327                                     dest[needed + 1 + i] =
 328                                       weights[idxarr[backw] + i];
 329                                 }
 330                               needed += 1 + len;
 331 #else
 332                               buflen = utf8_encode (buf, val);
 333                               if (needed + buflen + len < n)
 334                                 {
 335                                   for (i = 0; i < buflen; ++i)
 336                                     dest[needed + i] = buf[i];
 337                                   for (i = 0; i < len; ++i)
 338                                     dest[needed + buflen + i] =
 339                                       weights[idxarr[backw] + i];
 340                                 }
 341                               needed += buflen + len;
 342 #endif
 343                               idxarr[backw] += len;
 344                               val = 1;
 345                             }
 346                           else
 347                             ++val;
 348                         }
 349
 350                       backw_stop = ~0ul;
 351                     }
 352
 353                   /* Now handle the forward element.  */
 354                   len = weights[idxarr[idxcnt]++];
 355                   if (len != 0)
 356                     {
 357 #ifdef WIDE_CHAR_VERSION
 358                       if (needed + 1+ len < n)
 359                         {
 360                           dest[needed] = val;
 361                           for (i = 0; i < len; ++i)
 362                             dest[needed + 1 + i] =
 363                               weights[idxarr[idxcnt] + i];
 364                         }
 365                       needed += 1 + len;
 366 #else
 367                       buflen = utf8_encode (buf, val);
 368                       if (needed + buflen + len < n)
 369                         {
 370                           for (i = 0; i < buflen; ++i)
 371                             dest[needed + i] = buf[i];
 372                           for (i = 0; i < len; ++i)
 373                             dest[needed + buflen + i] =
 374                               weights[idxarr[idxcnt] + i];
 375                         }
 376                       needed += buflen + len;
 377 #endif
 378                       idxarr[idxcnt] += len;
 379                       val = 1;
 380                     }
 381                   else
 382                     /* Note that we don't have to increment `idxarr[idxcnt]'
 383                        since the length is zero.  */
 384                     ++val;
 385                 }
 386               else
 387                 {
 388                   /* Remember where the backwards series started.  */
 389                   if (backw_stop == ~0ul)
 390                     backw_stop = idxcnt;
 391                 }
 392
 393               rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
 394             }
 395
 396           if (backw_stop != ~0ul)
 397             {
 398               /* Handle the pushed elements now.  */
 399               size_t backw;
 400
 401               backw = idxmax - 1;
 402               while (backw > backw_stop)
 403                 {
 404                   size_t len = weights[idxarr[--backw]++];
 405                   if (len != 0)
 406                     {
 407 #ifdef WIDE_CHAR_VERSION
 408                       if (needed + 1 + len < n)
 409                         {
 410                           dest[needed] = val;
 411                           for (i = 0; i < len; ++i)
 412                             dest[needed + 1 + i] =
 413                               weights[idxarr[backw] + i];
 414                         }
 415                       needed += 1 + len;
 416 #else
 417                       buflen = utf8_encode (buf, val);
 418                       if (needed + buflen + len < n)
 419                         {
 420                           for (i = 0; i < buflen; ++i)
 421                             dest[needed + i] = buf[i];
 422                           for (i = 0; i < len; ++i)
 423                             dest[needed + buflen + i] =
 424                               weights[idxarr[backw] + i];
 425                         }
 426                       needed += buflen + len;
 427 #endif
 428                       idxarr[backw] += len;
 429                       val = 1;
 430                     }
 431                   else
 432                     ++val;
 433                 }
 434             }
 435         }
 436
 437       /* Finally store the byte to separate the passes or terminate
 438          the string.  */
 439       if (needed < n)
 440         dest[needed] = pass + 1 < nrules ? L('\1') : L('\0');
 441       ++needed;
 442     }
 443
 444   /* This is a little optimization: many collation specifications have
 445      a `position' rule at the end and if no non-ignored character
 446      is found the last \1 byte is immediately followed by a \0 byte
 447      signalling this.  We can avoid the \1 byte(s).  */
 448   if (needed <= n && needed > 2 && dest[needed - 2] == L('\1'))
 449     {
 450       /* Remove the \1 byte.  */
 451       --needed;
 452       dest[needed - 1] = L('\0');
 453     }
 454
 455   /* Free the memory if needed.  */
 456   if (use_malloc)
 457     free (idxarr);
 458
 459   /* Return the number of bytes/words we need, but don't count the NUL
 460      byte/word at the end.  */
 461   return needed - 1;
 462 }