[thirdparty/glibc.git] / string / strxfrm_l.c

/* Copyright (C) 1995-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <assert.h>
#include <langinfo.h>
#include <locale.h>
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <sys/param.h>

#ifndef STRING_TYPE
# define STRING_TYPE char
# define USTRING_TYPE unsigned char
# define STRXFRM __strxfrm_l
# define STRLEN strlen
# define STPNCPY __stpncpy
# define WEIGHT_H "../locale/weight.h"
# define SUFFIX	MB
# define L(arg) arg
#endif

#define CONCAT(a,b) CONCAT1(a,b)
#define CONCAT1(a,b) a##b

/* Maximum string size that is calculated with cached indices.  Right now this
   is an arbitrary value open to optimizations.  SMALL_STR_SIZE * 4 has to be
   lower than __MAX_ALLOCA_CUTOFF.  Keep localedata/xfrm-test.c in sync.  */
#define SMALL_STR_SIZE 4095

#include "../locale/localeinfo.h"
#include WEIGHT_H

/* Group locale data for shorter parameter lists.  */
typedef struct
{
  uint32_t nrules;
  unsigned char *rulesets;
  USTRING_TYPE *weights;
  int32_t *table;
  USTRING_TYPE *extra;
  int32_t *indirect;
} locale_data_t;

#ifndef WIDE_CHAR_VERSION

/* We need UTF-8 encoding of numbers.  */
static int
utf8_encode (char *buf, int val)
{
  int retval;

  if (val < 0x80)
    {
      *buf++ = (char) val;
      retval = 1;
    }
  else
    {
      int step;

      for (step = 2; step < 6; ++step)
	if ((val & (~(uint32_t)0 << (5 * step + 1))) == 0)
	  break;
      retval = step;

      *buf = (unsigned char) (~0xff >> step);
      --step;
      do
	{
	  buf[step] = 0x80 | (val & 0x3f);
	  val >>= 6;
	}
      while (--step > 0);
      *buf |= val;
    }

  return retval;
}
#endif

/* Find next weight and rule index.  Inlined since called for every char.  */
static __always_inline size_t
find_idx (const USTRING_TYPE **us, int32_t *weight_idx,
	  unsigned char *rule_idx, const locale_data_t *l_data, const int pass)
{
  int32_t tmp = findidx (l_data->table, l_data->indirect, l_data->extra, us,
			 -1);
  *rule_idx = tmp >> 24;
  int32_t idx = tmp & 0xffffff;
  size_t len = l_data->weights[idx++];

  /* Skip over indices of previous levels.  */
  for (int i = 0; i < pass; i++)
    {
      idx += len;
      len = l_data->weights[idx++];
    }

  *weight_idx = idx;
  return len;
}

static int
find_position (const USTRING_TYPE *us, const locale_data_t *l_data,
	       const int pass)
{
  int32_t weight_idx;
  unsigned char rule_idx;
  const USTRING_TYPE *usrc = us;

  find_idx (&usrc, &weight_idx, &rule_idx, l_data, pass);
  return l_data->rulesets[rule_idx * l_data->nrules + pass] & sort_position;
}

/* Do the transformation.  */
static size_t
do_xfrm (const USTRING_TYPE *usrc, STRING_TYPE *dest, size_t n,
	 const locale_data_t *l_data)
{
  int32_t weight_idx;
  unsigned char rule_idx;
  uint32_t pass;
  size_t needed = 0;
  size_t last_needed;

  /* Now the passes over the weights.  */
  for (pass = 0; pass < l_data->nrules; ++pass)
    {
      size_t backw_len = 0;
      last_needed = needed;
      const USTRING_TYPE *cur = usrc;
      const USTRING_TYPE *backw_start = NULL;

       /* We assume that if a rule has defined `position' in one section
         this is true for all of them.  */
      int position = find_position (cur, l_data, pass);

      if (position == 0)
	{
	  while (*cur != L('\0'))
	    {
	      const USTRING_TYPE *pos = cur;
	      size_t len = find_idx (&cur, &weight_idx, &rule_idx, l_data,
				     pass);
	      int rule = l_data->rulesets[rule_idx * l_data->nrules + pass];

	      if ((rule & sort_forward) != 0)
		{
		  /* Handle the pushed backward sequence.  */
		  if (backw_start != NULL)
		    {
		      for (size_t i = backw_len; i > 0; )
			{
			  int32_t weight_idx;
			  unsigned char rule_idx;
			  size_t len = find_idx (&backw_start, &weight_idx,
						 &rule_idx, l_data, pass);
			  if (needed + i < n)
			    for (size_t j = len; j > 0; j--)
			      dest[needed + i - j] =
				l_data->weights[weight_idx++];

			  i -= len;
			}

		      needed += backw_len;
		      backw_start = NULL;
		      backw_len = 0;
		    }

		  /* Now handle the forward element.  */
		  if (needed + len < n)
		    while (len-- > 0)
		      dest[needed++] = l_data->weights[weight_idx++];
		  else
		    /* No more characters fit into the buffer.  */
		    needed += len;
		}
	      else
		{
		  /* Remember start of the backward sequence & track length.  */
		  if (backw_start == NULL)
		    backw_start = pos;
		  backw_len += len;
		}
	    }


	  /* Handle the pushed backward sequence.  */
	  if (backw_start != NULL)
	    {
	      for (size_t i = backw_len; i > 0; )
		{
		  size_t len = find_idx (&backw_start, &weight_idx, &rule_idx,
					 l_data, pass);
		  if (needed + i < n)
		    for (size_t j = len; j > 0; j--)
		      dest[needed + i - j] =
			l_data->weights[weight_idx++];

		  i -= len;
		}

	      needed += backw_len;
	    }
	}
      else
	{
	  int val = 1;
#ifndef WIDE_CHAR_VERSION
	  char buf[7];
	  size_t buflen;
#endif
	  size_t i;

	  while (*cur != L('\0'))
	    {
	      const USTRING_TYPE *pos = cur;
	      size_t len = find_idx (&cur, &weight_idx, &rule_idx, l_data,
				     pass);
	      int rule = l_data->rulesets[rule_idx * l_data->nrules + pass];

	      if ((rule & sort_forward) != 0)
		{
		  /* Handle the pushed backward sequence.  */
		  if (backw_start != NULL)
		    {
		      for (size_t p = backw_len; p > 0; p--)
			{
			  size_t len;
			  int32_t weight_idx;
			  unsigned char rule_idx;
			  const USTRING_TYPE *backw_cur = backw_start;

			  /* To prevent a warning init the used vars.  */
			  len = find_idx (&backw_cur, &weight_idx,
					  &rule_idx, l_data, pass);

			  for (i = 1; i < p; i++)
			    len = find_idx (&backw_cur, &weight_idx,
					    &rule_idx, l_data, pass);

			  if (len != 0)
			    {
#ifdef WIDE_CHAR_VERSION
			      if (needed + 1 + len < n)
				{
				  dest[needed] = val;
				  for (i = 0; i < len; ++i)
				    dest[needed + 1 + i] =
				      l_data->weights[weight_idx + i];
				}
			      needed += 1 + len;
#else
			      buflen = utf8_encode (buf, val);
			      if (needed + buflen + len < n)
				{
				  for (i = 0; i < buflen; ++i)
				    dest[needed + i] = buf[i];
				  for (i = 0; i < len; ++i)
				    dest[needed + buflen + i] =
				      l_data->weights[weight_idx + i];
				}
			      needed += buflen + len;
#endif
			      val = 1;
			    }
			  else
			    ++val;
			}

		      backw_start = NULL;
		      backw_len = 0;
		    }

		  /* Now handle the forward element.  */
		  if (len != 0)
		    {
#ifdef WIDE_CHAR_VERSION
		      if (needed + 1 + len < n)
			{
			  dest[needed] = val;
			  for (i = 0; i < len; ++i)
			    dest[needed + 1 + i] =
			      l_data->weights[weight_idx + i];
			}
		      needed += 1 + len;
#else
		      buflen = utf8_encode (buf, val);
		      if (needed + buflen + len < n)
			{
			  for (i = 0; i < buflen; ++i)
			    dest[needed + i] = buf[i];
			  for (i = 0; i < len; ++i)
			    dest[needed + buflen + i] =
			      l_data->weights[weight_idx + i];
			}
		      needed += buflen + len;
#endif
		      val = 1;
		    }
		  else
		    ++val;
		}
	      else
		{
		  /* Remember start of the backward sequence & track length.  */
		  if (backw_start == NULL)
		    backw_start = pos;
		  backw_len++;
		}
	    }

	  /* Handle the pushed backward sequence.  */
	  if (backw_start != NULL)
	    {
	      for (size_t p = backw_len; p > 0; p--)
		{
		  size_t len;
		  int32_t weight_idx;
		  unsigned char rule_idx;
		  const USTRING_TYPE *backw_cur = backw_start;

		  /* To prevent a warning init the used vars.  */
		  len = find_idx (&backw_cur, &weight_idx,
				  &rule_idx, l_data, pass);

		  for (i = 1; i < p; i++)
		    len = find_idx (&backw_cur, &weight_idx,
				    &rule_idx, l_data, pass);

		  if (len != 0)
		    {
#ifdef WIDE_CHAR_VERSION
		      if (needed + 1 + len < n)
			{
			  dest[needed] = val;
			  for (i = 0; i < len; ++i)
			    dest[needed + 1 + i] =
			      l_data->weights[weight_idx + i];
			}
		      needed += 1 + len;
#else
		      buflen = utf8_encode (buf, val);
		      if (needed + buflen + len < n)
			{
			  for (i = 0; i < buflen; ++i)
			    dest[needed + i] = buf[i];
			  for (i = 0; i < len; ++i)
			    dest[needed + buflen + i] =
			      l_data->weights[weight_idx + i];
			}
		      needed += buflen + len;
#endif
		      val = 1;
		    }
		  else
		    ++val;
		}
	    }
	}

      /* Finally store the byte to separate the passes or terminate
	 the string.  */
      if (needed < n)
	dest[needed] = pass + 1 < l_data->nrules ? L('\1') : L('\0');
      ++needed;
    }

  /* This is a little optimization: many collation specifications have
     a `position' rule at the end and if no non-ignored character
     is found the last \1 byte is immediately followed by a \0 byte
     signalling this.  We can avoid the \1 byte(s).  */
  if (needed > 2 && needed == last_needed + 1)
    {
      /* Remove the \1 byte.  */
      if (--needed <= n)
	dest[needed - 1] = L('\0');
    }

  /* Return the number of bytes/words we need, but don't count the NUL
     byte/word at the end.  */
  return needed - 1;
}

/* Do the transformation using weight-index and rule cache.  */
static size_t
do_xfrm_cached (STRING_TYPE *dest, size_t n, const locale_data_t *l_data,
		size_t idxmax, int32_t *idxarr, const unsigned char *rulearr)
{
  uint32_t nrules = l_data->nrules;
  unsigned char *rulesets = l_data->rulesets;
  USTRING_TYPE *weights = l_data->weights;
  uint32_t pass;
  size_t needed = 0;
  size_t last_needed;
  size_t idxcnt;

  /* Now the passes over the weights.  */
  for (pass = 0; pass < nrules; ++pass)
    {
      size_t backw_stop = ~0ul;
      int rule = rulesets[rulearr[0] * nrules + pass];
      /* We assume that if a rule has defined `position' in one section
	 this is true for all of them.  */
      int position = rule & sort_position;

      last_needed = needed;
      if (position == 0)
	{
	  for (idxcnt = 0; idxcnt < idxmax; ++idxcnt)
	    {
	      if ((rule & sort_forward) != 0)
		{
		  size_t len;

		  if (backw_stop != ~0ul)
		    {
		      /* Handle the pushed elements now.  */
		      size_t backw;

		      for (backw = idxcnt; backw > backw_stop; )
			{
			  --backw;
			  len = weights[idxarr[backw]++];

			  if (needed + len < n)
			    while (len-- > 0)
			      dest[needed++] = weights[idxarr[backw]++];
			  else
			    {
				/* No more characters fit into the buffer.  */
			      needed += len;
			      idxarr[backw] += len;
			    }
			}

		      backw_stop = ~0ul;
		    }

		  /* Now handle the forward element.  */
		  len = weights[idxarr[idxcnt]++];
		  if (needed + len < n)
		    while (len-- > 0)
		      dest[needed++] = weights[idxarr[idxcnt]++];
		  else
		    {
		      /* No more characters fit into the buffer.  */
		      needed += len;
		      idxarr[idxcnt] += len;
		    }
		}
	      else
		{
		  /* Remember where the backwards series started.  */
		  if (backw_stop == ~0ul)
		    backw_stop = idxcnt;
		}

	      rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
	    }


	  if (backw_stop != ~0ul)
	    {
	      /* Handle the pushed elements now.  */
	      size_t backw;

	      backw = idxcnt;
	      while (backw > backw_stop)
		{
		  size_t len = weights[idxarr[--backw]++];

		  if (needed + len < n)
		    while (len-- > 0)
		      dest[needed++] = weights[idxarr[backw]++];
		  else
		    {
		      /* No more characters fit into the buffer.  */
		      needed += len;
		      idxarr[backw] += len;
		    }
		}
	    }
	}
      else
	{
	  int val = 1;
#ifndef WIDE_CHAR_VERSION
	  char buf[7];
	  size_t buflen;
#endif
	  size_t i;

	  for (idxcnt = 0; idxcnt < idxmax; ++idxcnt)
	    {
	      if ((rule & sort_forward) != 0)
		{
		  size_t len;

		  if (backw_stop != ~0ul)
		    {
		     /* Handle the pushed elements now.  */
		      size_t backw;

		      for (backw = idxcnt; backw > backw_stop; )
			{
			  --backw;
			  len = weights[idxarr[backw]++];
			  if (len != 0)
			    {
#ifdef WIDE_CHAR_VERSION
			      if (needed + 1 + len < n)
				{
				  dest[needed] = val;
				  for (i = 0; i < len; ++i)
				    dest[needed + 1 + i] =
				      weights[idxarr[backw] + i];
				}
			      needed += 1 + len;
#else
			      buflen = utf8_encode (buf, val);
			      if (needed + buflen + len < n)
				{
				  for (i = 0; i < buflen; ++i)
				    dest[needed + i] = buf[i];
				  for (i = 0; i < len; ++i)
				    dest[needed + buflen + i] =
				      weights[idxarr[backw] + i];
				}
			      needed += buflen + len;
#endif
			      idxarr[backw] += len;
			      val = 1;
			    }
			  else
			    ++val;
			}

		      backw_stop = ~0ul;
		    }

		  /* Now handle the forward element.  */
		  len = weights[idxarr[idxcnt]++];
		  if (len != 0)
		    {
#ifdef WIDE_CHAR_VERSION
		      if (needed + 1+ len < n)
			{
			  dest[needed] = val;
			  for (i = 0; i < len; ++i)
			    dest[needed + 1 + i] =
			      weights[idxarr[idxcnt] + i];
			}
		      needed += 1 + len;
#else
		      buflen = utf8_encode (buf, val);
		      if (needed + buflen + len < n)
			{
			  for (i = 0; i < buflen; ++i)
			    dest[needed + i] = buf[i];
			  for (i = 0; i < len; ++i)
			    dest[needed + buflen + i] =
			      weights[idxarr[idxcnt] + i];
			}
		      needed += buflen + len;
#endif
		      idxarr[idxcnt] += len;
		      val = 1;
		    }
		  else
		    /* Note that we don't have to increment `idxarr[idxcnt]'
		       since the length is zero.  */
		    ++val;
		}
	      else
		{
		  /* Remember where the backwards series started.  */
		  if (backw_stop == ~0ul)
		    backw_stop = idxcnt;
		}

	      rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
	    }

	  if (backw_stop != ~0ul)
	    {
	      /* Handle the pushed elements now.  */
	      size_t backw;

	      backw = idxmax - 1;
	      while (backw > backw_stop)
		{
		  size_t len = weights[idxarr[--backw]++];
		  if (len != 0)
		    {
#ifdef WIDE_CHAR_VERSION
		      if (needed + 1 + len < n)
			{
			  dest[needed] = val;
			  for (i = 0; i < len; ++i)
			    dest[needed + 1 + i] =
			      weights[idxarr[backw] + i];
			}
		      needed += 1 + len;
#else
		      buflen = utf8_encode (buf, val);
		      if (needed + buflen + len < n)
			{
			  for (i = 0; i < buflen; ++i)
			    dest[needed + i] = buf[i];
			  for (i = 0; i < len; ++i)
			    dest[needed + buflen + i] =
			      weights[idxarr[backw] + i];
			}
		      needed += buflen + len;
#endif
		      idxarr[backw] += len;
		      val = 1;
		    }
		  else
		    ++val;
		}
	    }
	}

      /* Finally store the byte to separate the passes or terminate
	 the string.  */
      if (needed < n)
	dest[needed] = pass + 1 < nrules ? L('\1') : L('\0');
      ++needed;
    }

  /* This is a little optimization: many collation specifications have
     a `position' rule at the end and if no non-ignored character
     is found the last \1 byte is immediately followed by a \0 byte
     signalling this.  We can avoid the \1 byte(s).  */
  if (needed > 2 && needed == last_needed + 1)
    {
      /* Remove the \1 byte.  */
      if (--needed <= n)
	dest[needed - 1] = L('\0');
    }

  /* Return the number of bytes/words we need, but don't count the NUL
     byte/word at the end.  */
  return needed - 1;
}

size_t
STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, locale_t l)
{
  locale_data_t l_data;
  struct __locale_data *current = l->__locales[LC_COLLATE];
  l_data.nrules = current->values[_NL_ITEM_INDEX (_NL_COLLATE_NRULES)].word;

  /* Handle byte comparison case.  */
  if (l_data.nrules == 0)
    {
      size_t srclen = STRLEN (src);

      if (n != 0)
	STPNCPY (dest, src, MIN (srclen + 1, n));

      return srclen;
    }

  /* Handle an empty string, code hereafter relies on strlen (src) > 0.  */
  if (*src == L('\0'))
    {
      if (n != 0)
	*dest = L('\0');
      return 0;
    }

  /* Get the locale data.  */
  l_data.rulesets = (unsigned char *)
    current->values[_NL_ITEM_INDEX (_NL_COLLATE_RULESETS)].string;
  l_data.table = (int32_t *)
    current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_TABLE,SUFFIX))].string;
  l_data.weights = (USTRING_TYPE *)
    current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_WEIGHT,SUFFIX))].string;
  l_data.extra = (USTRING_TYPE *)
    current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_EXTRA,SUFFIX))].string;
  l_data.indirect = (int32_t *)
    current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_INDIRECT,SUFFIX))].string;

  assert (((uintptr_t) l_data.table) % __alignof__ (l_data.table[0]) == 0);
  assert (((uintptr_t) l_data.weights) % __alignof__ (l_data.weights[0]) == 0);
  assert (((uintptr_t) l_data.extra) % __alignof__ (l_data.extra[0]) == 0);
  assert (((uintptr_t) l_data.indirect) % __alignof__ (l_data.indirect[0]) == 0);

  /* We need the elements of the string as unsigned values since they
     are used as indices.  */
  const USTRING_TYPE *usrc = (const USTRING_TYPE *) src;

  /* Allocate cache for small strings on the stack and fill it with weight and
     rule indices.  If the cache size is not sufficient, continue with the
     uncached xfrm version.  */
  size_t idxmax = 0;
  const USTRING_TYPE *cur = usrc;
  int32_t *idxarr = alloca (SMALL_STR_SIZE * sizeof (int32_t));
  unsigned char *rulearr = alloca (SMALL_STR_SIZE + 1);

  do
    {
      int32_t tmp = findidx (l_data.table, l_data.indirect, l_data.extra, &cur,
			     -1);
      rulearr[idxmax] = tmp >> 24;
      idxarr[idxmax] = tmp & 0xffffff;

      ++idxmax;
    }
  while (*cur != L('\0') && idxmax < SMALL_STR_SIZE);

  /* This element is only read, the value never used but to determine
     another value which then is ignored.  */
  rulearr[idxmax] = '\0';

  /* Do the transformation.  */
  if (*cur == L('\0'))
    return do_xfrm_cached (dest, n, &l_data, idxmax, idxarr, rulearr);
  else
    return do_xfrm (usrc, dest, n, &l_data);
}
libc_hidden_def (STRXFRM)

#ifndef WIDE_CHAR_VERSION
weak_alias (__strxfrm_l, strxfrm_l)
#endif
Commit	Line	Data
dff8da6b	1	/* Copyright (C) 1995-2024 Free Software Foundation, Inc.
c84142e8	2	This file is part of the GNU C Library.
c84142e8 UD	3
c84142e8 UD	4	The GNU C Library is free software; you can redistribute it and/or
41bdb6e2 AJ	5	modify it under the terms of the GNU Lesser General Public
	6	License as published by the Free Software Foundation; either
	7	version 2.1 of the License, or (at your option) any later version.
c84142e8 UD	8
	9	The GNU C Library is distributed in the hope that it will be useful,
	10	but WITHOUT ANY WARRANTY; without even the implied warranty of
	11	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
41bdb6e2	12	Lesser General Public License for more details.
c84142e8	13
41bdb6e2	14	You should have received a copy of the GNU Lesser General Public
59ba27a6	15	License along with the GNU C Library; if not, see
5a82c748	16	<https://www.gnu.org/licenses/>. */
c84142e8	17
ccadf7b5 UD	18	#include <assert.h>
	19	#include <langinfo.h>
	20	#include <locale.h>
	21	#include <stddef.h>
	22	#include <stdint.h>
	23	#include <stdlib.h>
	24	#include <string.h>
	25	#include <sys/param.h>
1ab62b32	26
ccadf7b5 UD	27	#ifndef STRING_TYPE
	28	# define STRING_TYPE char
	29	# define USTRING_TYPE unsigned char
	30	# define STRXFRM __strxfrm_l
ccadf7b5 UD	31	# define STRLEN strlen
	32	# define STPNCPY __stpncpy
	33	# define WEIGHT_H "../locale/weight.h"
	34	# define SUFFIX MB
	35	# define L(arg) arg
	36	#endif
	37
	38	#define CONCAT(a,b) CONCAT1(a,b)
	39	#define CONCAT1(a,b) a##b
	40
0f9e5854 LH	41	/* Maximum string size that is calculated with cached indices. Right now this
	42	is an arbitrary value open to optimizations. SMALL_STR_SIZE * 4 has to be
	43	lower than __MAX_ALLOCA_CUTOFF. Keep localedata/xfrm-test.c in sync. */
	44	#define SMALL_STR_SIZE 4095
	45
ccadf7b5	46	#include "../locale/localeinfo.h"
8c0ab919	47	#include WEIGHT_H
ccadf7b5	48
0f9e5854 LH	49	/* Group locale data for shorter parameter lists. */
	50	typedef struct
	51	{
535e935a	52	uint32_t nrules;
0f9e5854 LH	53	unsigned char *rulesets;
	54	USTRING_TYPE *weights;
	55	int32_t *table;
	56	USTRING_TYPE *extra;
	57	int32_t *indirect;
	58	} locale_data_t;
ccadf7b5 UD	59
	60	#ifndef WIDE_CHAR_VERSION
	61
	62	/* We need UTF-8 encoding of numbers. */
	63	static int
	64	utf8_encode (char *buf, int val)
	65	{
	66	int retval;
	67
	68	if (val < 0x80)
	69	{
	70	*buf++ = (char) val;
	71	retval = 1;
	72	}
	73	else
	74	{
	75	int step;
	76
	77	for (step = 2; step < 6; ++step)
	78	if ((val & (~(uint32_t)0 << (5 * step + 1))) == 0)
	79	break;
	80	retval = step;
	81
	82	*buf = (unsigned char) (~0xff >> step);
	83	--step;
	84	do
	85	{
	86	buf[step] = 0x80 \| (val & 0x3f);
	87	val >>= 6;
	88	}
	89	while (--step > 0);
	90	*buf \|= val;
	91	}
	92
	93	return retval;
	94	}
	95	#endif
	96
0f9e5854 LH	97	/* Find next weight and rule index. Inlined since called for every char. */
	98	static __always_inline size_t
	99	find_idx (const USTRING_TYPE *us, int32_t weight_idx,
	100	unsigned char rule_idx, const locale_data_t l_data, const int pass)
	101	{
	102	int32_t tmp = findidx (l_data->table, l_data->indirect, l_data->extra, us,
	103	-1);
	104	*rule_idx = tmp >> 24;
	105	int32_t idx = tmp & 0xffffff;
	106	size_t len = l_data->weights[idx++];
	107
	108	/* Skip over indices of previous levels. */
	109	for (int i = 0; i < pass; i++)
	110	{
	111	idx += len;
	112	len = l_data->weights[idx++];
	113	}
ccadf7b5	114
0f9e5854 LH	115	*weight_idx = idx;
	116	return len;
	117	}
	118
	119	static int
	120	find_position (const USTRING_TYPE us, const locale_data_t l_data,
	121	const int pass)
ccadf7b5	122	{
0f9e5854 LH	123	int32_t weight_idx;
	124	unsigned char rule_idx;
	125	const USTRING_TYPE *usrc = us;
	126
	127	find_idx (&usrc, &weight_idx, &rule_idx, l_data, pass);
	128	return l_data->rulesets[rule_idx * l_data->nrules + pass] & sort_position;
	129	}
	130
	131	/* Do the transformation. */
	132	static size_t
	133	do_xfrm (const USTRING_TYPE usrc, STRING_TYPE dest, size_t n,
	134	const locale_data_t *l_data)
	135	{
	136	int32_t weight_idx;
	137	unsigned char rule_idx;
535e935a	138	uint32_t pass;
0f9e5854	139	size_t needed = 0;
2f334ad5	140	size_t last_needed;
ccadf7b5	141
0f9e5854 LH	142	/* Now the passes over the weights. */
0f9e5854 LH	143	for (pass = 0; pass < l_data->nrules; ++pass)
ccadf7b5	144	{
0f9e5854 LH	145	size_t backw_len = 0;
	146	last_needed = needed;
	147	const USTRING_TYPE *cur = usrc;
	148	const USTRING_TYPE *backw_start = NULL;
ccadf7b5	149
0f9e5854 LH	150	/* We assume that if a rule has defined `position' in one section
	151	this is true for all of them. */
	152	int position = find_position (cur, l_data, pass);
ccadf7b5	153
0f9e5854 LH	154	if (position == 0)
	155	{
	156	while (*cur != L('\0'))
	157	{
	158	const USTRING_TYPE *pos = cur;
	159	size_t len = find_idx (&cur, &weight_idx, &rule_idx, l_data,
	160	pass);
	161	int rule = l_data->rulesets[rule_idx * l_data->nrules + pass];
ccadf7b5	162
0f9e5854 LH	163	if ((rule & sort_forward) != 0)
	164	{
	165	/* Handle the pushed backward sequence. */
	166	if (backw_start != NULL)
	167	{
	168	for (size_t i = backw_len; i > 0; )
	169	{
	170	int32_t weight_idx;
	171	unsigned char rule_idx;
	172	size_t len = find_idx (&backw_start, &weight_idx,
	173	&rule_idx, l_data, pass);
	174	if (needed + i < n)
	175	for (size_t j = len; j > 0; j--)
	176	dest[needed + i - j] =
	177	l_data->weights[weight_idx++];
	178
	179	i -= len;
	180	}
ccadf7b5	181
0f9e5854 LH	182	needed += backw_len;
	183	backw_start = NULL;
	184	backw_len = 0;
	185	}
ccadf7b5	186
0f9e5854 LH	187	/* Now handle the forward element. */
	188	if (needed + len < n)
	189	while (len-- > 0)
	190	dest[needed++] = l_data->weights[weight_idx++];
	191	else
	192	/* No more characters fit into the buffer. */
	193	needed += len;
	194	}
	195	else
	196	{
	197	/* Remember start of the backward sequence & track length. */
	198	if (backw_start == NULL)
	199	backw_start = pos;
	200	backw_len += len;
	201	}
	202	}
	203
	204
	205	/* Handle the pushed backward sequence. */
	206	if (backw_start != NULL)
	207	{
	208	for (size_t i = backw_len; i > 0; )
	209	{
	210	size_t len = find_idx (&backw_start, &weight_idx, &rule_idx,
	211	l_data, pass);
	212	if (needed + i < n)
	213	for (size_t j = len; j > 0; j--)
	214	dest[needed + i - j] =
	215	l_data->weights[weight_idx++];
	216
	217	i -= len;
	218	}
	219
	220	needed += backw_len;
	221	}
	222	}
	223	else
	224	{
	225	int val = 1;
	226	#ifndef WIDE_CHAR_VERSION
	227	char buf[7];
	228	size_t buflen;
	229	#endif
	230	size_t i;
	231
	232	while (*cur != L('\0'))
	233	{
	234	const USTRING_TYPE *pos = cur;
	235	size_t len = find_idx (&cur, &weight_idx, &rule_idx, l_data,
	236	pass);
	237	int rule = l_data->rulesets[rule_idx * l_data->nrules + pass];
	238
	239	if ((rule & sort_forward) != 0)
	240	{
	241	/* Handle the pushed backward sequence. */
	242	if (backw_start != NULL)
	243	{
	244	for (size_t p = backw_len; p > 0; p--)
	245	{
	246	size_t len;
	247	int32_t weight_idx;
	248	unsigned char rule_idx;
	249	const USTRING_TYPE *backw_cur = backw_start;
	250
251	/* To prevent a warning init the used vars. */
252	len = find_idx (&backw_cur, &weight_idx,
253	&rule_idx, l_data, pass);
254
255	for (i = 1; i < p; i++)
256	len = find_idx (&backw_cur, &weight_idx,
257	&rule_idx, l_data, pass);
258
259	if (len != 0)
260	{
261	#ifdef WIDE_CHAR_VERSION
262	if (needed + 1 + len < n)
263	{
264	dest[needed] = val;
265	for (i = 0; i < len; ++i)
266	dest[needed + 1 + i] =
267	l_data->weights[weight_idx + i];
268	}
269	needed += 1 + len;
270	#else
271	buflen = utf8_encode (buf, val);
272	if (needed + buflen + len < n)
273	{
274	for (i = 0; i < buflen; ++i)
275	dest[needed + i] = buf[i];
276	for (i = 0; i < len; ++i)
277	dest[needed + buflen + i] =
278	l_data->weights[weight_idx + i];
279	}
280	needed += buflen + len;
281	#endif
282	val = 1;
283	}
284	else
285	++val;
286	}
287
288	backw_start = NULL;
289	backw_len = 0;
290	}
291
292	/* Now handle the forward element. */
293	if (len != 0)
294	{
295	#ifdef WIDE_CHAR_VERSION
296	if (needed + 1 + len < n)
297	{
298	dest[needed] = val;
299	for (i = 0; i < len; ++i)
300	dest[needed + 1 + i] =
301	l_data->weights[weight_idx + i];
302	}
303	needed += 1 + len;
304	#else
305	buflen = utf8_encode (buf, val);
306	if (needed + buflen + len < n)
307	{
308	for (i = 0; i < buflen; ++i)
309	dest[needed + i] = buf[i];
310	for (i = 0; i < len; ++i)
311	dest[needed + buflen + i] =
312	l_data->weights[weight_idx + i];
313	}
314	needed += buflen + len;
315	#endif
316	val = 1;
317	}
318	else
319	++val;
320	}
321	else
322	{
323	/* Remember start of the backward sequence & track length. */
324	if (backw_start == NULL)
325	backw_start = pos;
326	backw_len++;
327	}
328	}
329
330	/* Handle the pushed backward sequence. */
331	if (backw_start != NULL)
332	{
333	for (size_t p = backw_len; p > 0; p--)
334	{
335	size_t len;
336	int32_t weight_idx;
337	unsigned char rule_idx;
338	const USTRING_TYPE *backw_cur = backw_start;
339
340	/* To prevent a warning init the used vars. */
341	len = find_idx (&backw_cur, &weight_idx,
342	&rule_idx, l_data, pass);
343
344	for (i = 1; i < p; i++)
345	len = find_idx (&backw_cur, &weight_idx,
346	&rule_idx, l_data, pass);
347
348	if (len != 0)
349	{
350	#ifdef WIDE_CHAR_VERSION
351	if (needed + 1 + len < n)
352	{
353	dest[needed] = val;
354	for (i = 0; i < len; ++i)
355	dest[needed + 1 + i] =
356	l_data->weights[weight_idx + i];
357	}
358	needed += 1 + len;
359	#else
360	buflen = utf8_encode (buf, val);
361	if (needed + buflen + len < n)
362	{
363	for (i = 0; i < buflen; ++i)
364	dest[needed + i] = buf[i];
365	for (i = 0; i < len; ++i)
366	dest[needed + buflen + i] =
367	l_data->weights[weight_idx + i];
368	}
369	needed += buflen + len;
370	#endif
371	val = 1;
372	}
373	else
374	++val;
375	}
376	}
377	}
378
379	/* Finally store the byte to separate the passes or terminate
380	the string. */
381	if (needed < n)
382	dest[needed] = pass + 1 < l_data->nrules ? L('\1') : L('\0');
383	++needed;
ccadf7b5 UD	384	}
ccadf7b5 UD	385
0f9e5854 LH	386	/* This is a little optimization: many collation specifications have
	387	a `position' rule at the end and if no non-ignored character
	388	is found the last \1 byte is immediately followed by a \0 byte
	389	signalling this. We can avoid the \1 byte(s). */
	390	if (needed > 2 && needed == last_needed + 1)
ccadf7b5	391	{
0f9e5854 LH	392	/* Remove the \1 byte. */
	393	if (--needed <= n)
	394	dest[needed - 1] = L('\0');
ccadf7b5	395	}
ccadf7b5	396
0f9e5854 LH	397	/* Return the number of bytes/words we need, but don't count the NUL
	398	byte/word at the end. */
	399	return needed - 1;
	400	}
	401
	402	/* Do the transformation using weight-index and rule cache. */
	403	static size_t
	404	do_xfrm_cached (STRING_TYPE dest, size_t n, const locale_data_t l_data,
	405	size_t idxmax, int32_t idxarr, const unsigned char rulearr)
	406	{
535e935a	407	uint32_t nrules = l_data->nrules;
0f9e5854 LH	408	unsigned char *rulesets = l_data->rulesets;
0f9e5854 LH	409	USTRING_TYPE *weights = l_data->weights;
535e935a	410	uint32_t pass;
0f9e5854 LH	411	size_t needed = 0;
	412	size_t last_needed;
	413	size_t idxcnt;
ccadf7b5	414
0f9e5854	415	/* Now the passes over the weights. */
ccadf7b5 UD	416	for (pass = 0; pass < nrules; ++pass)
	417	{
	418	size_t backw_stop = ~0ul;
	419	int rule = rulesets[rulearr[0] * nrules + pass];
	420	/* We assume that if a rule has defined `position' in one section
	421	this is true for all of them. */
	422	int position = rule & sort_position;
	423
2f334ad5	424	last_needed = needed;
ccadf7b5 UD	425	if (position == 0)
	426	{
	427	for (idxcnt = 0; idxcnt < idxmax; ++idxcnt)
	428	{
	429	if ((rule & sort_forward) != 0)
	430	{
	431	size_t len;
	432
	433	if (backw_stop != ~0ul)
	434	{
	435	/* Handle the pushed elements now. */
	436	size_t backw;
	437
bc3a45ce	438	for (backw = idxcnt; backw > backw_stop; )
ccadf7b5	439	{
bc3a45ce	440	--backw;
ccadf7b5 UD	441	len = weights[idxarr[backw]++];
	442
	443	if (needed + len < n)
	444	while (len-- > 0)
	445	dest[needed++] = weights[idxarr[backw]++];
	446	else
	447	{
	448	/* No more characters fit into the buffer. */
	449	needed += len;
	450	idxarr[backw] += len;
	451	}
	452	}
	453
	454	backw_stop = ~0ul;
	455	}
	456
	457	/* Now handle the forward element. */
	458	len = weights[idxarr[idxcnt]++];
	459	if (needed + len < n)
	460	while (len-- > 0)
	461	dest[needed++] = weights[idxarr[idxcnt]++];
	462	else
	463	{
	464	/* No more characters fit into the buffer. */
	465	needed += len;
	466	idxarr[idxcnt] += len;
	467	}
	468	}
	469	else
	470	{
	471	/* Remember where the backwards series started. */
	472	if (backw_stop == ~0ul)
	473	backw_stop = idxcnt;
	474	}
	475
	476	rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
	477	}
	478
	479
	480	if (backw_stop != ~0ul)
	481	{
	482	/* Handle the pushed elements now. */
	483	size_t backw;
	484
	485	backw = idxcnt;
	486	while (backw > backw_stop)
	487	{
	488	size_t len = weights[idxarr[--backw]++];
	489
	490	if (needed + len < n)
	491	while (len-- > 0)
	492	dest[needed++] = weights[idxarr[backw]++];
	493	else
	494	{
	495	/* No more characters fit into the buffer. */
	496	needed += len;
	497	idxarr[backw] += len;
	498	}
	499	}
	500	}
	501	}
	502	else
	503	{
	504	int val = 1;
505	#ifndef WIDE_CHAR_VERSION
506	char buf[7];
507	size_t buflen;
508	#endif
509	size_t i;
510
511	for (idxcnt = 0; idxcnt < idxmax; ++idxcnt)
512	{
513	if ((rule & sort_forward) != 0)
514	{
515	size_t len;
516
517	if (backw_stop != ~0ul)
518	{
519	/* Handle the pushed elements now. */
520	size_t backw;
521
bc3a45ce	522	for (backw = idxcnt; backw > backw_stop; )
ccadf7b5	523	{
bc3a45ce	524	--backw;
ccadf7b5 UD	525	len = weights[idxarr[backw]++];
	526	if (len != 0)
	527	{
	528	#ifdef WIDE_CHAR_VERSION
	529	if (needed + 1 + len < n)
	530	{
	531	dest[needed] = val;
	532	for (i = 0; i < len; ++i)
	533	dest[needed + 1 + i] =
	534	weights[idxarr[backw] + i];
	535	}
	536	needed += 1 + len;
	537	#else
	538	buflen = utf8_encode (buf, val);
	539	if (needed + buflen + len < n)
	540	{
	541	for (i = 0; i < buflen; ++i)
	542	dest[needed + i] = buf[i];
	543	for (i = 0; i < len; ++i)
	544	dest[needed + buflen + i] =
	545	weights[idxarr[backw] + i];
	546	}
	547	needed += buflen + len;
	548	#endif
	549	idxarr[backw] += len;
	550	val = 1;
	551	}
	552	else
	553	++val;
	554	}
	555
	556	backw_stop = ~0ul;
	557	}
	558
	559	/* Now handle the forward element. */
	560	len = weights[idxarr[idxcnt]++];
	561	if (len != 0)
	562	{
	563	#ifdef WIDE_CHAR_VERSION
	564	if (needed + 1+ len < n)
	565	{
	566	dest[needed] = val;
	567	for (i = 0; i < len; ++i)
	568	dest[needed + 1 + i] =
	569	weights[idxarr[idxcnt] + i];
	570	}
	571	needed += 1 + len;
	572	#else
	573	buflen = utf8_encode (buf, val);
	574	if (needed + buflen + len < n)
	575	{
	576	for (i = 0; i < buflen; ++i)
	577	dest[needed + i] = buf[i];
	578	for (i = 0; i < len; ++i)
	579	dest[needed + buflen + i] =
	580	weights[idxarr[idxcnt] + i];
	581	}
	582	needed += buflen + len;
	583	#endif
	584	idxarr[idxcnt] += len;
	585	val = 1;
	586	}
	587	else
	588	/* Note that we don't have to increment `idxarr[idxcnt]'
589	since the length is zero. */
590	++val;
591	}
592	else
593	{
594	/* Remember where the backwards series started. */
595	if (backw_stop == ~0ul)
596	backw_stop = idxcnt;
597	}
598
599	rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
600	}
601
602	if (backw_stop != ~0ul)
603	{
604	/* Handle the pushed elements now. */
605	size_t backw;
606
607	backw = idxmax - 1;
608	while (backw > backw_stop)
609	{
610	size_t len = weights[idxarr[--backw]++];
611	if (len != 0)
612	{
613	#ifdef WIDE_CHAR_VERSION
614	if (needed + 1 + len < n)
615	{
616	dest[needed] = val;
617	for (i = 0; i < len; ++i)
618	dest[needed + 1 + i] =
619	weights[idxarr[backw] + i];
620	}
621	needed += 1 + len;
622	#else
623	buflen = utf8_encode (buf, val);
624	if (needed + buflen + len < n)
625	{
626	for (i = 0; i < buflen; ++i)
627	dest[needed + i] = buf[i];
628	for (i = 0; i < len; ++i)
629	dest[needed + buflen + i] =
630	weights[idxarr[backw] + i];
631	}
632	needed += buflen + len;
633	#endif
634	idxarr[backw] += len;
635	val = 1;
636	}
637	else
638	++val;
639	}
640	}
641	}
642
643	/* Finally store the byte to separate the passes or terminate
644	the string. */
645	if (needed < n)
646	dest[needed] = pass + 1 < nrules ? L('\1') : L('\0');
647	++needed;
648	}
649
650	/* This is a little optimization: many collation specifications have
651	a `position' rule at the end and if no non-ignored character
652	is found the last \1 byte is immediately followed by a \0 byte
653	signalling this. We can avoid the \1 byte(s). */
2f334ad5	654	if (needed > 2 && needed == last_needed + 1)
ccadf7b5 UD	655	{
ccadf7b5 UD	656	/* Remove the \1 byte. */
52a33795	657	if (--needed <= n)
2f334ad5	658	dest[needed - 1] = L('\0');
ccadf7b5 UD	659	}
ccadf7b5 UD	660
ccadf7b5 UD	661	/* Return the number of bytes/words we need, but don't count the NUL
	662	byte/word at the end. */
	663	return needed - 1;
	664	}
0f9e5854 LH	665
0f9e5854 LH	666	size_t
af85385f	667	STRXFRM (STRING_TYPE dest, const STRING_TYPE src, size_t n, locale_t l)
0f9e5854 LH	668	{
	669	locale_data_t l_data;
	670	struct __locale_data *current = l->__locales[LC_COLLATE];
	671	l_data.nrules = current->values[_NL_ITEM_INDEX (_NL_COLLATE_NRULES)].word;
	672
	673	/* Handle byte comparison case. */
	674	if (l_data.nrules == 0)
	675	{
	676	size_t srclen = STRLEN (src);
	677
	678	if (n != 0)
	679	STPNCPY (dest, src, MIN (srclen + 1, n));
	680
	681	return srclen;
	682	}
	683
	684	/* Handle an empty string, code hereafter relies on strlen (src) > 0. */
	685	if (*src == L('\0'))
	686	{
	687	if (n != 0)
	688	*dest = L('\0');
	689	return 0;
	690	}
	691
	692	/* Get the locale data. */
	693	l_data.rulesets = (unsigned char *)
	694	current->values[_NL_ITEM_INDEX (_NL_COLLATE_RULESETS)].string;
	695	l_data.table = (int32_t *)
	696	current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_TABLE,SUFFIX))].string;
	697	l_data.weights = (USTRING_TYPE *)
	698	current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_WEIGHT,SUFFIX))].string;
	699	l_data.extra = (USTRING_TYPE *)
	700	current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_EXTRA,SUFFIX))].string;
	701	l_data.indirect = (int32_t *)
	702	current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_INDIRECT,SUFFIX))].string;
	703
	704	assert (((uintptr_t) l_data.table) % __alignof__ (l_data.table[0]) == 0);
	705	assert (((uintptr_t) l_data.weights) % __alignof__ (l_data.weights[0]) == 0);
	706	assert (((uintptr_t) l_data.extra) % __alignof__ (l_data.extra[0]) == 0);
	707	assert (((uintptr_t) l_data.indirect) % __alignof__ (l_data.indirect[0]) == 0);
	708
	709	/* We need the elements of the string as unsigned values since they
14ef9c18	710	are used as indices. */
0f9e5854 LH	711	const USTRING_TYPE usrc = (const USTRING_TYPE ) src;
	712
	713	/* Allocate cache for small strings on the stack and fill it with weight and
	714	rule indices. If the cache size is not sufficient, continue with the
	715	uncached xfrm version. */
	716	size_t idxmax = 0;
	717	const USTRING_TYPE *cur = usrc;
	718	int32_t idxarr = alloca (SMALL_STR_SIZE sizeof (int32_t));
	719	unsigned char *rulearr = alloca (SMALL_STR_SIZE + 1);
	720
	721	do
	722	{
	723	int32_t tmp = findidx (l_data.table, l_data.indirect, l_data.extra, &cur,
	724	-1);
	725	rulearr[idxmax] = tmp >> 24;
	726	idxarr[idxmax] = tmp & 0xffffff;
	727
	728	++idxmax;
	729	}
	730	while (*cur != L('\0') && idxmax < SMALL_STR_SIZE);
	731
	732	/* This element is only read, the value never used but to determine
	733	another value which then is ignored. */
	734	rulearr[idxmax] = '\0';
	735
	736	/* Do the transformation. */
	737	if (*cur == L('\0'))
	738	return do_xfrm_cached (dest, n, &l_data, idxmax, idxarr, rulearr);
	739	else
	740	return do_xfrm (usrc, dest, n, &l_data);
	741	}
ccadf7b5 UD	742	libc_hidden_def (STRXFRM)
	743
	744	#ifndef WIDE_CHAR_VERSION
1ab62b32	745	weak_alias (__strxfrm_l, strxfrm_l)
ccadf7b5	746	#endif