[thirdparty/glibc.git] / string / strxfrm_l.c

/* Copyright (C) 1995-2019 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   Written by Ulrich Drepper <drepper@gnu.org>, 1995.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <assert.h>
#include <langinfo.h>
#include <locale.h>
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <sys/param.h>

/* Default configuration: the multibyte (char) flavour of the function.
   These macros are only defined here when STRING_TYPE has not been set
   already.  NOTE(review): presumably the wide-character build pre-defines
   them (together with WIDE_CHAR_VERSION) before including this file --
   confirm against the including source.  */
#ifndef STRING_TYPE
# define STRING_TYPE char
# define USTRING_TYPE unsigned char
# define STRXFRM __strxfrm_l
# define STRLEN strlen
# define STPNCPY __stpncpy
# define WEIGHT_H "../locale/weight.h"
# define SUFFIX	MB
# define L(arg) arg
#endif

/* Two-step token pasting, so that a macro argument such as SUFFIX is
   expanded before it is glued onto the locale item name.  */
#define CONCAT(a,b) CONCAT1(a,b)
#define CONCAT1(a,b) a##b

/* Maximum string size that is calculated with cached indices.  Right now this
   is an arbitrary value open to optimizations.  SMALL_STR_SIZE * 4 has to be
   lower than __MAX_ALLOCA_CUTOFF.  Keep localedata/xfrm-test.c in sync.  */
#define SMALL_STR_SIZE 4095

#include "../locale/localeinfo.h"
#include WEIGHT_H

/* Group locale data for shorter parameter lists.  All members are filled
   in by STRXFRM from the LC_COLLATE data of the locale it is given.  */
typedef struct
{
  /* Number of passes made over the string; zero selects the plain
     copy path in STRXFRM.  */
  uint_fast32_t nrules;
  /* Rule flags, indexed as rulesets[rule_idx * nrules + pass] and tested
     against sort_forward / sort_position.  */
  unsigned char *rulesets;
  /* Weight table.  An entry is a sequence of (length, weights...) groups,
     one group per pass; see find_idx.  */
  USTRING_TYPE *weights;
  /* Lookup table handed to findidx.  */
  int32_t *table;
  /* Additional lookup data handed to findidx.  */
  USTRING_TYPE *extra;
  /* Indirection table handed to findidx.  */
  int32_t *indirect;
} locale_data_t;

#ifndef WIDE_CHAR_VERSION

/* Store VAL in BUF using the UTF-8 encoding scheme (extended to sequences
   of up to six bytes) and return the number of bytes written.  Values
   below 0x80 occupy a single byte.  BUF must have room for six bytes.  */
static int
utf8_encode (char *buf, int val)
{
  /* Single byte form.  */
  if (val < 0x80)
    {
      buf[0] = (char) val;
      return 1;
    }

  /* Pick the shortest sequence whose payload bits can hold VAL; anything
     that does not fit in five bytes gets six.  */
  int nbytes = 2;
  while (nbytes < 6 && (val & (~(uint32_t)0 << (5 * nbytes + 1))) != 0)
    ++nbytes;

  /* The lead byte starts out as NBYTES one bits followed by zeros.  */
  buf[0] = (unsigned char) (~0xff >> nbytes);

  /* Fill the continuation bytes from the last one backwards, six payload
     bits at a time.  */
  for (int slot = nbytes - 1; slot > 0; --slot)
    {
      buf[slot] = 0x80 | (val & 0x3f);
      val >>= 6;
    }

  /* Whatever is left of VAL belongs into the lead byte.  */
  buf[0] |= val;

  return nbytes;
}
#endif

/* Look up the next collating element of *US through findidx (which is
   handed US itself so it can move the pointer on).  The rule index of the
   element is stored in *RULE_IDX, the offset of its first weight for level
   PASS in *WEIGHT_IDX, and the number of weights on that level is
   returned.  Forced inline because it runs once per element.  */
static __always_inline size_t
find_idx (const USTRING_TYPE **us, int32_t *weight_idx,
	  unsigned char *rule_idx, const locale_data_t *l_data, const int pass)
{
  const USTRING_TYPE *weights = l_data->weights;
  int32_t packed = findidx (l_data->table, l_data->indirect, l_data->extra,
			    us, -1);
  int32_t cursor = packed & 0xffffff;
  size_t count;

  /* The top byte carries the rule index, the low 24 bits locate the
     entry in the weight table.  */
  *rule_idx = packed >> 24;

  /* Every level is stored as a length followed by that many weights.
     Hop over the levels that come before PASS.  */
  count = weights[cursor++];
  for (int remaining = pass; remaining > 0; --remaining)
    {
      cursor += count;
      count = weights[cursor++];
    }

  *weight_idx = cursor;
  return count;
}

/* Return the `position' flag (nonzero when set) of the rule that applies
   on level PASS to the first collating element of US.  The lookup works
   on a private copy of the pointer, so the caller's US is not moved.  */
static int
find_position (const USTRING_TYPE *us, const locale_data_t *l_data,
	       const int pass)
{
  const USTRING_TYPE *probe = us;
  unsigned char first_rule;
  int32_t ignored_weight_idx;

  /* Only the rule index is of interest here.  */
  find_idx (&probe, &ignored_weight_idx, &first_rule, l_data, pass);

  uint_fast32_t slot = first_rule * l_data->nrules + pass;
  return l_data->rulesets[slot] & sort_position;
}

/* Do the transformation without any cache: every pass derives the weight
   and rule indices from the string again through find_idx.

   USRC is the NUL-terminated input (STRXFRM handles the empty string
   itself before calling this), DEST the output buffer with room for N
   elements and L_DATA the collation tables.  One pass is made per
   collation level (L_DATA->nrules of them).  The weights found in a pass
   are appended to the output, followed by a \1 separator or, after the
   last pass, by the terminating \0.  Nothing is stored at or beyond
   DEST[N], but NEEDED keeps counting so that the full length is known.

   Returns the length of the complete transformation, not counting the
   terminating NUL.  */
static size_t
do_xfrm (const USTRING_TYPE *usrc, STRING_TYPE *dest, size_t n,
	 const locale_data_t *l_data)
{
  int32_t weight_idx;
  unsigned char rule_idx;
  uint_fast32_t pass;
  /* Number of output elements produced (or that would have been produced)
     so far.  */
  size_t needed = 0;
  /* Value of NEEDED at the start of the most recent pass.  */
  size_t last_needed;

  /* Now the passes over the weights.  */
  for (pass = 0; pass < l_data->nrules; ++pass)
    {
      /* Size of the pending backward sequence: a count of weights in the
	 non-position case, a count of elements in the position case.  */
      size_t backw_len = 0;
      last_needed = needed;
      const USTRING_TYPE *cur = usrc;
      /* Start of the pending backward sequence, NULL if there is none.  */
      const USTRING_TYPE *backw_start = NULL;

      /* We assume that if a rule has defined `position' in one section
	 this is true for all of them.  */
      int position = find_position (cur, l_data, pass);

      if (position == 0)
	{
	  while (*cur != L('\0'))
	    {
	      const USTRING_TYPE *pos = cur;
	      size_t len = find_idx (&cur, &weight_idx, &rule_idx, l_data,
				     pass);
	      int rule = l_data->rulesets[rule_idx * l_data->nrules + pass];

	      if ((rule & sort_forward) != 0)
		{
		  /* Handle the pushed backward sequence.  */
		  if (backw_start != NULL)
		    {
		      /* Walk the sequence front to back and place each
			 element's weights just below offset I, which moves
			 down by the element's length; the elements end up
			 in reverse order.  */
		      for (size_t i = backw_len; i > 0; )
			{
			  /* These deliberately shadow the outer variables
			     so that the values of the forward element
			     found above stay intact.  */
			  int32_t weight_idx;
			  unsigned char rule_idx;
			  size_t len = find_idx (&backw_start, &weight_idx,
						 &rule_idx, l_data, pass);
			  if (needed + i < n)
			    for (size_t j = len; j > 0; j--)
			      dest[needed + i - j] =
				l_data->weights[weight_idx++];

			  i -= len;
			}

		      needed += backw_len;
		      backw_start = NULL;
		      backw_len = 0;
		    }

		  /* Now handle the forward element.  */
		  if (needed + len < n)
		    while (len-- > 0)
		      dest[needed++] = l_data->weights[weight_idx++];
		  else
		    /* No more characters fit into the buffer.  */
		    needed += len;
		}
	      else
		{
		  /* Remember start of the backward sequence & track length.  */
		  if (backw_start == NULL)
		    backw_start = pos;
		  backw_len += len;
		}
	    }


	  /* Handle the pushed backward sequence.  */
	  if (backw_start != NULL)
	    {
	      for (size_t i = backw_len; i > 0; )
		{
		  size_t len = find_idx (&backw_start, &weight_idx, &rule_idx,
					 l_data, pass);
		  if (needed + i < n)
		    for (size_t j = len; j > 0; j--)
		      dest[needed + i - j] =
			l_data->weights[weight_idx++];

		  i -= len;
		}

	      needed += backw_len;
	    }
	}
      else
	{
	  /* Written in front of the weights of every element that has
	     weights on this level: one plus the number of weightless
	     elements met since the previous write.  */
	  int val = 1;
#ifndef WIDE_CHAR_VERSION
	  /* VAL in its utf8_encode form and the length of that form.  */
	  char buf[7];
	  size_t buflen;
#endif
	  size_t i;

	  while (*cur != L('\0'))
	    {
	      const USTRING_TYPE *pos = cur;
	      size_t len = find_idx (&cur, &weight_idx, &rule_idx, l_data,
				     pass);
	      int rule = l_data->rulesets[rule_idx * l_data->nrules + pass];

	      if ((rule & sort_forward) != 0)
		{
		  /* Handle the pushed backward sequence.  */
		  if (backw_start != NULL)
		    {
		      /* Emit the P-th element of the sequence for P going
			 from its length down to one.  Without a cache
			 each element is found by rescanning from the
			 start of the sequence.  */
		      for (size_t p = backw_len; p > 0; p--)
			{
			  /* Shadow the outer variables; the values of the
			     forward element are still needed below.  */
			  size_t len;
			  int32_t weight_idx;
			  unsigned char rule_idx;
			  const USTRING_TYPE *backw_cur = backw_start;

			  /* To prevent a warning init the used vars.  */
			  len = find_idx (&backw_cur, &weight_idx,
					  &rule_idx, l_data, pass);

			  for (i = 1; i < p; i++)
			    len = find_idx (&backw_cur, &weight_idx,
					    &rule_idx, l_data, pass);

			  if (len != 0)
			    {
#ifdef WIDE_CHAR_VERSION
			      if (needed + 1 + len < n)
				{
				  dest[needed] = val;
				  for (i = 0; i < len; ++i)
				    dest[needed + 1 + i] =
				      l_data->weights[weight_idx + i];
				}
			      needed += 1 + len;
#else
			      buflen = utf8_encode (buf, val);
			      if (needed + buflen + len < n)
				{
				  for (i = 0; i < buflen; ++i)
				    dest[needed + i] = buf[i];
				  for (i = 0; i < len; ++i)
				    dest[needed + buflen + i] =
				      l_data->weights[weight_idx + i];
				}
			      needed += buflen + len;
#endif
			      val = 1;
			    }
			  else
			    ++val;
			}

		      backw_start = NULL;
		      backw_len = 0;
		    }

		  /* Now handle the forward element.  */
		  if (len != 0)
		    {
#ifdef WIDE_CHAR_VERSION
		      if (needed + 1 + len < n)
			{
			  dest[needed] = val;
			  for (i = 0; i < len; ++i)
			    dest[needed + 1 + i] =
			      l_data->weights[weight_idx + i];
			}
		      needed += 1 + len;
#else
		      buflen = utf8_encode (buf, val);
		      if (needed + buflen + len < n)
			{
			  for (i = 0; i < buflen; ++i)
			    dest[needed + i] = buf[i];
			  for (i = 0; i < len; ++i)
			    dest[needed + buflen + i] =
			      l_data->weights[weight_idx + i];
			}
		      needed += buflen + len;
#endif
		      val = 1;
		    }
		  else
		    ++val;
		}
	      else
		{
		  /* Remember start of the backward sequence & track length.  */
		  if (backw_start == NULL)
		    backw_start = pos;
		  backw_len++;
		}
	    }

	  /* Handle the pushed backward sequence.  */
	  if (backw_start != NULL)
	    {
	      for (size_t p = backw_len; p > 0; p--)
		{
		  size_t len;
		  int32_t weight_idx;
		  unsigned char rule_idx;
		  const USTRING_TYPE *backw_cur = backw_start;

		  /* To prevent a warning init the used vars.  */
		  len = find_idx (&backw_cur, &weight_idx,
				  &rule_idx, l_data, pass);

		  for (i = 1; i < p; i++)
		    len = find_idx (&backw_cur, &weight_idx,
				    &rule_idx, l_data, pass);

		  if (len != 0)
		    {
#ifdef WIDE_CHAR_VERSION
		      if (needed + 1 + len < n)
			{
			  dest[needed] = val;
			  for (i = 0; i < len; ++i)
			    dest[needed + 1 + i] =
			      l_data->weights[weight_idx + i];
			}
		      needed += 1 + len;
#else
		      buflen = utf8_encode (buf, val);
		      if (needed + buflen + len < n)
			{
			  for (i = 0; i < buflen; ++i)
			    dest[needed + i] = buf[i];
			  for (i = 0; i < len; ++i)
			    dest[needed + buflen + i] =
			      l_data->weights[weight_idx + i];
			}
		      needed += buflen + len;
#endif
		      val = 1;
		    }
		  else
		    ++val;
		}
	    }
	}

      /* Finally store the byte to separate the passes or terminate
	 the string.  */
      if (needed < n)
	dest[needed] = pass + 1 < l_data->nrules ? L('\1') : L('\0');
      ++needed;
    }

  /* This is a little optimization: many collation specifications have
     a `position' rule at the end and if no non-ignored character
     is found the last \1 byte is immediately followed by a \0 byte
     signalling this.  We can avoid the \1 byte(s).  */
  if (needed > 2 && needed == last_needed + 1)
    {
      /* Remove the \1 byte.  */
      if (--needed <= n)
	dest[needed - 1] = L('\0');
    }

  /* Return the number of bytes/words we need, but don't count the NUL
     byte/word at the end.  */
  return needed - 1;
}

/* Do the transformation using weight-index and rule cache.

   DEST is the output buffer with room for N elements and L_DATA holds the
   collation tables.  The string itself is not needed any more: IDXARR and
   RULEARR describe its IDXMAX collating elements (IDXMAX is at least one).
   IDXARR[k] is the offset in the weight table of the length that starts
   the current level of element k; it is moved on in place as each pass
   consumes a level, so the array is modified.  RULEARR[k] is the rule
   index of element k.  RULEARR[IDXMAX] must be readable: it is fetched
   after the last element although the resulting rule is never used.

   One pass is made per collation level.  The weights found in a pass are
   appended to the output, followed by a \1 separator or, after the last
   pass, by the terminating \0.  Nothing is stored at or beyond DEST[N],
   but NEEDED keeps counting so that the full length is known.

   Returns the length of the complete transformation, not counting the
   terminating NUL.  */
static size_t
do_xfrm_cached (STRING_TYPE *dest, size_t n, const locale_data_t *l_data,
		size_t idxmax, int32_t *idxarr, const unsigned char *rulearr)
{
  uint_fast32_t nrules = l_data->nrules;
  unsigned char *rulesets = l_data->rulesets;
  USTRING_TYPE *weights = l_data->weights;
  uint_fast32_t pass;
  /* Number of output elements produced (or that would have been produced)
     so far.  */
  size_t needed = 0;
  /* Value of NEEDED at the start of the most recent pass.  */
  size_t last_needed;
  size_t idxcnt;

  /* Now the passes over the weights.  */
  for (pass = 0; pass < nrules; ++pass)
    {
      /* Index of the first element of the pending backward sequence,
	 ~0ul if there is none.  */
      size_t backw_stop = ~0ul;
      int rule = rulesets[rulearr[0] * nrules + pass];
      /* We assume that if a rule has defined `position' in one section
	 this is true for all of them.  */
      int position = rule & sort_position;

      last_needed = needed;
      if (position == 0)
	{
	  for (idxcnt = 0; idxcnt < idxmax; ++idxcnt)
	    {
	      if ((rule & sort_forward) != 0)
		{
		  size_t len;

		  if (backw_stop != ~0ul)
		    {
		      /* Handle the pushed elements now.  */
		      size_t backw;

		      /* Emit elements IDXCNT - 1 down to BACKW_STOP.  */
		      for (backw = idxcnt; backw > backw_stop; )
			{
			  --backw;
			  len = weights[idxarr[backw]++];

			  if (needed + len < n)
			    while (len-- > 0)
			      dest[needed++] = weights[idxarr[backw]++];
			  else
			    {
				/* No more characters fit into the buffer.  */
			      needed += len;
			      idxarr[backw] += len;
			    }
			}

		      backw_stop = ~0ul;
		    }

		  /* Now handle the forward element.  */
		  len = weights[idxarr[idxcnt]++];
		  if (needed + len < n)
		    while (len-- > 0)
		      dest[needed++] = weights[idxarr[idxcnt]++];
		  else
		    {
		      /* No more characters fit into the buffer.  */
		      needed += len;
		      idxarr[idxcnt] += len;
		    }
		}
	      else
		{
		  /* Remember where the backwards series started.  */
		  if (backw_stop == ~0ul)
		    backw_stop = idxcnt;
		}

	      /* Fetch the rule of the next element.  After the last
		 element this reads RULEARR[IDXMAX]; the value is not
		 used.  */
	      rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
	    }


	  if (backw_stop != ~0ul)
	    {
	      /* Handle the pushed elements now.  */
	      size_t backw;

	      backw = idxcnt;
	      while (backw > backw_stop)
		{
		  size_t len = weights[idxarr[--backw]++];

		  if (needed + len < n)
		    while (len-- > 0)
		      dest[needed++] = weights[idxarr[backw]++];
		  else
		    {
		      /* No more characters fit into the buffer.  */
		      needed += len;
		      idxarr[backw] += len;
		    }
		}
	    }
	}
      else
	{
	  /* Written in front of the weights of every element that has
	     weights on this level: one plus the number of weightless
	     elements met since the previous write.  */
	  int val = 1;
#ifndef WIDE_CHAR_VERSION
	  /* VAL in its utf8_encode form and the length of that form.  */
	  char buf[7];
	  size_t buflen;
#endif
	  size_t i;

	  for (idxcnt = 0; idxcnt < idxmax; ++idxcnt)
	    {
	      if ((rule & sort_forward) != 0)
		{
		  size_t len;

		  if (backw_stop != ~0ul)
		    {
		     /* Handle the pushed elements now.  */
		      size_t backw;

		      for (backw = idxcnt; backw > backw_stop; )
			{
			  --backw;
			  len = weights[idxarr[backw]++];
			  if (len != 0)
			    {
#ifdef WIDE_CHAR_VERSION
			      if (needed + 1 + len < n)
				{
				  dest[needed] = val;
				  for (i = 0; i < len; ++i)
				    dest[needed + 1 + i] =
				      weights[idxarr[backw] + i];
				}
			      needed += 1 + len;
#else
			      buflen = utf8_encode (buf, val);
			      if (needed + buflen + len < n)
				{
				  for (i = 0; i < buflen; ++i)
				    dest[needed + i] = buf[i];
				  for (i = 0; i < len; ++i)
				    dest[needed + buflen + i] =
				      weights[idxarr[backw] + i];
				}
			      needed += buflen + len;
#endif
			      idxarr[backw] += len;
			      val = 1;
			    }
			  else
			    ++val;
			}

		      backw_stop = ~0ul;
		    }

		  /* Now handle the forward element.  */
		  len = weights[idxarr[idxcnt]++];
		  if (len != 0)
		    {
#ifdef WIDE_CHAR_VERSION
		      if (needed + 1+ len < n)
			{
			  dest[needed] = val;
			  for (i = 0; i < len; ++i)
			    dest[needed + 1 + i] =
			      weights[idxarr[idxcnt] + i];
			}
		      needed += 1 + len;
#else
		      buflen = utf8_encode (buf, val);
		      if (needed + buflen + len < n)
			{
			  for (i = 0; i < buflen; ++i)
			    dest[needed + i] = buf[i];
			  for (i = 0; i < len; ++i)
			    dest[needed + buflen + i] =
			      weights[idxarr[idxcnt] + i];
			}
		      needed += buflen + len;
#endif
		      idxarr[idxcnt] += len;
		      val = 1;
		    }
		  else
		    /* Note that we don't have to increment `idxarr[idxcnt]'
		       since the length is zero.  */
		    ++val;
		}
	      else
		{
		  /* Remember where the backwards series started.  */
		  if (backw_stop == ~0ul)
		    backw_stop = idxcnt;
		}

	      /* Fetch the rule of the next element.  After the last
		 element this reads RULEARR[IDXMAX]; the value is not
		 used.  */
	      rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
	    }

	  if (backw_stop != ~0ul)
	    {
	      /* Handle the pushed elements now.  */
	      size_t backw;

	      /* Start at IDXMAX, not IDXMAX - 1: the index is decremented
		 before it is used, and the pending sequence runs up to and
		 including the last element.  Starting one lower would drop
		 the weights of that element and leave its IDXARR entry on
		 the wrong level for the following passes, unlike the
		 non-position branch above and unlike do_xfrm.  */
	      backw = idxmax;
	      while (backw > backw_stop)
		{
		  size_t len = weights[idxarr[--backw]++];
		  if (len != 0)
		    {
#ifdef WIDE_CHAR_VERSION
		      if (needed + 1 + len < n)
			{
			  dest[needed] = val;
			  for (i = 0; i < len; ++i)
			    dest[needed + 1 + i] =
			      weights[idxarr[backw] + i];
			}
		      needed += 1 + len;
#else
		      buflen = utf8_encode (buf, val);
		      if (needed + buflen + len < n)
			{
			  for (i = 0; i < buflen; ++i)
			    dest[needed + i] = buf[i];
			  for (i = 0; i < len; ++i)
			    dest[needed + buflen + i] =
			      weights[idxarr[backw] + i];
			}
		      needed += buflen + len;
#endif
		      idxarr[backw] += len;
		      val = 1;
		    }
		  else
		    ++val;
		}
	    }
	}

      /* Finally store the byte to separate the passes or terminate
	 the string.  */
      if (needed < n)
	dest[needed] = pass + 1 < nrules ? L('\1') : L('\0');
      ++needed;
    }

  /* This is a little optimization: many collation specifications have
     a `position' rule at the end and if no non-ignored character
     is found the last \1 byte is immediately followed by a \0 byte
     signalling this.  We can avoid the \1 byte(s).  */
  if (needed > 2 && needed == last_needed + 1)
    {
      /* Remove the \1 byte.  */
      if (--needed <= n)
	dest[needed - 1] = L('\0');
    }

  /* Return the number of bytes/words we need, but don't count the NUL
     byte/word at the end.  */
  return needed - 1;
}

/* Transform SRC according to the LC_COLLATE category of locale L and
   store at most N elements of the result, terminator included, in DEST.
   Returns the length of the complete transformation, not counting the
   terminating NUL, whether or not it fit.  DEST is not touched when N
   is zero.  */
size_t
STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, locale_t l)
{
  struct __locale_data *current = l->__locales[LC_COLLATE];
  locale_data_t l_data;

  l_data.nrules = current->values[_NL_ITEM_INDEX (_NL_COLLATE_NRULES)].word;

  /* A locale without collation rules compares element by element, so the
     transformation is a plain copy.  */
  if (l_data.nrules == 0)
    {
      size_t plain_len = STRLEN (src);

      if (n != 0)
	STPNCPY (dest, src, MIN (plain_len + 1, n));

      return plain_len;
    }

  /* Deal with the empty string here; everything below relies on the
     string having at least one element.  */
  if (*src == L('\0'))
    {
      if (n != 0)
	*dest = L('\0');
      return 0;
    }

  /* Collect the collation tables of the locale.  */
  l_data.rulesets = (unsigned char *)
    current->values[_NL_ITEM_INDEX (_NL_COLLATE_RULESETS)].string;
  l_data.weights = (USTRING_TYPE *)
    current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_WEIGHT,SUFFIX))].string;
  l_data.table = (int32_t *)
    current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_TABLE,SUFFIX))].string;
  l_data.extra = (USTRING_TYPE *)
    current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_EXTRA,SUFFIX))].string;
  l_data.indirect = (int32_t *)
    current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_INDIRECT,SUFFIX))].string;

  /* The tables are accessed through typed pointers and therefore have to
     be suitably aligned.  */
  assert (((uintptr_t) l_data.table) % __alignof__ (l_data.table[0]) == 0);
  assert (((uintptr_t) l_data.weights) % __alignof__ (l_data.weights[0]) == 0);
  assert (((uintptr_t) l_data.extra) % __alignof__ (l_data.extra[0]) == 0);
  assert (((uintptr_t) l_data.indirect) % __alignof__ (l_data.indirect[0]) == 0);

  /* The elements of the string are used as indices, so they have to be
     looked at as unsigned values.  */
  const USTRING_TYPE *usrc = (const USTRING_TYPE *) src;

  /* Put a cache for the weight and rule indices of up to SMALL_STR_SIZE
     collating elements on the stack.  RULEARR has one slot more for the
     entry stored behind the last element.  */
  int32_t *idxarr = alloca (SMALL_STR_SIZE * sizeof (int32_t));
  unsigned char *rulearr = alloca (SMALL_STR_SIZE + 1);
  const USTRING_TYPE *scan = usrc;
  size_t idxmax = 0;

  /* Fill the cache until the string ends or the cache is full.  The
     string is not empty, so at least one element is looked up.  */
  for (;;)
    {
      int32_t packed = findidx (l_data.table, l_data.indirect, l_data.extra,
				&scan, -1);
      rulearr[idxmax] = packed >> 24;
      idxarr[idxmax] = packed & 0xffffff;
      ++idxmax;

      if (*scan == L('\0') || idxmax >= SMALL_STR_SIZE)
	break;
    }

  /* This entry is read but the value derived from it is never used.  */
  rulearr[idxmax] = '\0';

  /* If the cache could not take the whole string, fall back to the
     version that works on the string directly.  */
  if (*scan != L('\0'))
    return do_xfrm (usrc, dest, n, &l_data);

  return do_xfrm_cached (dest, n, &l_data, idxmax, idxarr, rulearr);
}
libc_hidden_def (STRXFRM)

#ifndef WIDE_CHAR_VERSION
weak_alias (__strxfrm_l, strxfrm_l)
#endif
Commit	Line	Data
	1	/* Copyright (C) 1995-2019 Free Software Foundation, Inc.
	2	This file is part of the GNU C Library.
	3	Written by Ulrich Drepper <drepper@gnu.org>, 1995.
	4
	5	The GNU C Library is free software; you can redistribute it and/or
	6	modify it under the terms of the GNU Lesser General Public
	7	License as published by the Free Software Foundation; either
	8	version 2.1 of the License, or (at your option) any later version.
	9
	10	The GNU C Library is distributed in the hope that it will be useful,
	11	but WITHOUT ANY WARRANTY; without even the implied warranty of
	12	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	13	Lesser General Public License for more details.
	14
	15	You should have received a copy of the GNU Lesser General Public
	16	License along with the GNU C Library; if not, see
	17	<http://www.gnu.org/licenses/>. */
	18
	19	#include <assert.h>
	20	#include <langinfo.h>
	21	#include <locale.h>
	22	#include <stddef.h>
	23	#include <stdint.h>
	24	#include <stdlib.h>
	25	#include <string.h>
	26	#include <sys/param.h>
	27
	28	#ifndef STRING_TYPE
	29	# define STRING_TYPE char
	30	# define USTRING_TYPE unsigned char
	31	# define STRXFRM __strxfrm_l
	32	# define STRLEN strlen
	33	# define STPNCPY __stpncpy
	34	# define WEIGHT_H "../locale/weight.h"
	35	# define SUFFIX MB
	36	# define L(arg) arg
	37	#endif
	38
	39	#define CONCAT(a,b) CONCAT1(a,b)
	40	#define CONCAT1(a,b) a##b
	41
	42	/* Maximum string size that is calculated with cached indices. Right now this
	43	is an arbitrary value open to optimizations. SMALL_STR_SIZE * 4 has to be
	44	lower than __MAX_ALLOCA_CUTOFF. Keep localedata/xfrm-test.c in sync. */
	45	#define SMALL_STR_SIZE 4095
	46
	47	#include "../locale/localeinfo.h"
	48	#include WEIGHT_H
	49
	50	/* Group locale data for shorter parameter lists. */
	51	typedef struct
	52	{
	53	uint_fast32_t nrules;
	54	unsigned char *rulesets;
	55	USTRING_TYPE *weights;
	56	int32_t *table;
	57	USTRING_TYPE *extra;
	58	int32_t *indirect;
	59	} locale_data_t;
	60
	61	#ifndef WIDE_CHAR_VERSION
	62
	63	/* We need UTF-8 encoding of numbers. */
	64	static int
	65	utf8_encode (char *buf, int val)
	66	{
	67	int retval;
	68
	69	if (val < 0x80)
	70	{
	71	*buf++ = (char) val;
	72	retval = 1;
	73	}
	74	else
	75	{
	76	int step;
	77
	78	for (step = 2; step < 6; ++step)
	79	if ((val & (~(uint32_t)0 << (5 * step + 1))) == 0)
	80	break;
	81	retval = step;
	82
	83	*buf = (unsigned char) (~0xff >> step);
	84	--step;
	85	do
	86	{
	87	buf[step] = 0x80 \| (val & 0x3f);
	88	val >>= 6;
	89	}
	90	while (--step > 0);
	91	*buf \|= val;
	92	}
	93
	94	return retval;
	95	}
	96	#endif
	97
	98	/* Find next weight and rule index. Inlined since called for every char. */
	99	static __always_inline size_t
	100	find_idx (const USTRING_TYPE *us, int32_t weight_idx,
	101	unsigned char rule_idx, const locale_data_t l_data, const int pass)
	102	{
	103	int32_t tmp = findidx (l_data->table, l_data->indirect, l_data->extra, us,
	104	-1);
	105	*rule_idx = tmp >> 24;
	106	int32_t idx = tmp & 0xffffff;
	107	size_t len = l_data->weights[idx++];
	108
	109	/* Skip over indices of previous levels. */
	110	for (int i = 0; i < pass; i++)
	111	{
	112	idx += len;
	113	len = l_data->weights[idx++];
	114	}
	115
	116	*weight_idx = idx;
	117	return len;
	118	}
	119
	120	static int
	121	find_position (const USTRING_TYPE us, const locale_data_t l_data,
	122	const int pass)
	123	{
	124	int32_t weight_idx;
	125	unsigned char rule_idx;
	126	const USTRING_TYPE *usrc = us;
	127
	128	find_idx (&usrc, &weight_idx, &rule_idx, l_data, pass);
	129	return l_data->rulesets[rule_idx * l_data->nrules + pass] & sort_position;
	130	}
	131
	132	/* Do the transformation. */
	133	static size_t
	134	do_xfrm (const USTRING_TYPE usrc, STRING_TYPE dest, size_t n,
	135	const locale_data_t *l_data)
	136	{
	137	int32_t weight_idx;
	138	unsigned char rule_idx;
	139	uint_fast32_t pass;
	140	size_t needed = 0;
	141	size_t last_needed;
	142
	143	/* Now the passes over the weights. */
	144	for (pass = 0; pass < l_data->nrules; ++pass)
	145	{
	146	size_t backw_len = 0;
	147	last_needed = needed;
	148	const USTRING_TYPE *cur = usrc;
	149	const USTRING_TYPE *backw_start = NULL;
	150
	151	/* We assume that if a rule has defined `position' in one section
	152	this is true for all of them. */
	153	int position = find_position (cur, l_data, pass);
	154
	155	if (position == 0)
	156	{
	157	while (*cur != L('\0'))
	158	{
	159	const USTRING_TYPE *pos = cur;
	160	size_t len = find_idx (&cur, &weight_idx, &rule_idx, l_data,
	161	pass);
	162	int rule = l_data->rulesets[rule_idx * l_data->nrules + pass];
	163
	164	if ((rule & sort_forward) != 0)
	165	{
	166	/* Handle the pushed backward sequence. */
	167	if (backw_start != NULL)
	168	{
	169	for (size_t i = backw_len; i > 0; )
	170	{
	171	int32_t weight_idx;
	172	unsigned char rule_idx;
	173	size_t len = find_idx (&backw_start, &weight_idx,
	174	&rule_idx, l_data, pass);
	175	if (needed + i < n)
	176	for (size_t j = len; j > 0; j--)
	177	dest[needed + i - j] =
	178	l_data->weights[weight_idx++];
	179
	180	i -= len;
	181	}
	182
	183	needed += backw_len;
	184	backw_start = NULL;
	185	backw_len = 0;
	186	}
	187
	188	/* Now handle the forward element. */
	189	if (needed + len < n)
	190	while (len-- > 0)
	191	dest[needed++] = l_data->weights[weight_idx++];
	192	else
	193	/* No more characters fit into the buffer. */
	194	needed += len;
	195	}
	196	else
	197	{
	198	/* Remember start of the backward sequence & track length. */
	199	if (backw_start == NULL)
	200	backw_start = pos;
	201	backw_len += len;
	202	}
	203	}
	204
	205
	206	/* Handle the pushed backward sequence. */
	207	if (backw_start != NULL)
	208	{
	209	for (size_t i = backw_len; i > 0; )
	210	{
	211	size_t len = find_idx (&backw_start, &weight_idx, &rule_idx,
	212	l_data, pass);
	213	if (needed + i < n)
	214	for (size_t j = len; j > 0; j--)
	215	dest[needed + i - j] =
	216	l_data->weights[weight_idx++];
	217
	218	i -= len;
	219	}
	220
	221	needed += backw_len;
	222	}
	223	}
	224	else
	225	{
	226	int val = 1;
	227	#ifndef WIDE_CHAR_VERSION
	228	char buf[7];
	229	size_t buflen;
	230	#endif
	231	size_t i;
	232
	233	while (*cur != L('\0'))
	234	{
	235	const USTRING_TYPE *pos = cur;
	236	size_t len = find_idx (&cur, &weight_idx, &rule_idx, l_data,
	237	pass);
	238	int rule = l_data->rulesets[rule_idx * l_data->nrules + pass];
	239
	240	if ((rule & sort_forward) != 0)
	241	{
	242	/* Handle the pushed backward sequence. */
	243	if (backw_start != NULL)
	244	{
	245	for (size_t p = backw_len; p > 0; p--)
	246	{
	247	size_t len;
	248	int32_t weight_idx;
	249	unsigned char rule_idx;
	250	const USTRING_TYPE *backw_cur = backw_start;
	251
	252	/* To prevent a warning init the used vars. */
	253	len = find_idx (&backw_cur, &weight_idx,
	254	&rule_idx, l_data, pass);
	255
	256	for (i = 1; i < p; i++)
	257	len = find_idx (&backw_cur, &weight_idx,
	258	&rule_idx, l_data, pass);
	259
	260	if (len != 0)
	261	{
	262	#ifdef WIDE_CHAR_VERSION
	263	if (needed + 1 + len < n)
	264	{
	265	dest[needed] = val;
	266	for (i = 0; i < len; ++i)
	267	dest[needed + 1 + i] =
	268	l_data->weights[weight_idx + i];
	269	}
	270	needed += 1 + len;
	271	#else
	272	buflen = utf8_encode (buf, val);
	273	if (needed + buflen + len < n)
	274	{
	275	for (i = 0; i < buflen; ++i)
	276	dest[needed + i] = buf[i];
	277	for (i = 0; i < len; ++i)
	278	dest[needed + buflen + i] =
	279	l_data->weights[weight_idx + i];
	280	}
	281	needed += buflen + len;
	282	#endif
	283	val = 1;
	284	}
	285	else
	286	++val;
	287	}
	288
	289	backw_start = NULL;
	290	backw_len = 0;
	291	}
	292
	293	/* Now handle the forward element. */
	294	if (len != 0)
	295	{
	296	#ifdef WIDE_CHAR_VERSION
	297	if (needed + 1 + len < n)
	298	{
	299	dest[needed] = val;
	300	for (i = 0; i < len; ++i)
	301	dest[needed + 1 + i] =
	302	l_data->weights[weight_idx + i];
	303	}
	304	needed += 1 + len;
	305	#else
	306	buflen = utf8_encode (buf, val);
	307	if (needed + buflen + len < n)
	308	{
	309	for (i = 0; i < buflen; ++i)
	310	dest[needed + i] = buf[i];
	311	for (i = 0; i < len; ++i)
	312	dest[needed + buflen + i] =
	313	l_data->weights[weight_idx + i];
	314	}
	315	needed += buflen + len;
	316	#endif
	317	val = 1;
	318	}
	319	else
	320	++val;
	321	}
	322	else
	323	{
	324	/* Remember start of the backward sequence & track length. */
	325	if (backw_start == NULL)
	326	backw_start = pos;
	327	backw_len++;
	328	}
	329	}
	330
	331	/* Handle the pushed backward sequence. */
	332	if (backw_start != NULL)
	333	{
	334	for (size_t p = backw_len; p > 0; p--)
	335	{
	336	size_t len;
	337	int32_t weight_idx;
	338	unsigned char rule_idx;
	339	const USTRING_TYPE *backw_cur = backw_start;
	340
	341	/* To prevent a warning init the used vars. */
	342	len = find_idx (&backw_cur, &weight_idx,
	343	&rule_idx, l_data, pass);
	344
	345	for (i = 1; i < p; i++)
	346	len = find_idx (&backw_cur, &weight_idx,
	347	&rule_idx, l_data, pass);
	348
	349	if (len != 0)
	350	{
	351	#ifdef WIDE_CHAR_VERSION
	352	if (needed + 1 + len < n)
	353	{
	354	dest[needed] = val;
	355	for (i = 0; i < len; ++i)
	356	dest[needed + 1 + i] =
	357	l_data->weights[weight_idx + i];
	358	}
	359	needed += 1 + len;
	360	#else
	361	buflen = utf8_encode (buf, val);
	362	if (needed + buflen + len < n)
	363	{
	364	for (i = 0; i < buflen; ++i)
	365	dest[needed + i] = buf[i];
	366	for (i = 0; i < len; ++i)
	367	dest[needed + buflen + i] =
	368	l_data->weights[weight_idx + i];
	369	}
	370	needed += buflen + len;
	371	#endif
	372	val = 1;
	373	}
	374	else
	375	++val;
	376	}
	377	}
	378	}
	379
	380	/* Finally store the byte to separate the passes or terminate
	381	the string. */
	382	if (needed < n)
	383	dest[needed] = pass + 1 < l_data->nrules ? L('\1') : L('\0');
	384	++needed;
	385	}
	386
	387	/* This is a little optimization: many collation specifications have
	388	a `position' rule at the end and if no non-ignored character
	389	is found the last \1 byte is immediately followed by a \0 byte
	390	signalling this. We can avoid the \1 byte(s). */
	391	if (needed > 2 && needed == last_needed + 1)
	392	{
	393	/* Remove the \1 byte. */
	394	if (--needed <= n)
	395	dest[needed - 1] = L('\0');
	396	}
	397
	398	/* Return the number of bytes/words we need, but don't count the NUL
	399	byte/word at the end. */
	400	return needed - 1;
	401	}
	402
	403	/* Do the transformation using weight-index and rule cache. */
	404	static size_t
	405	do_xfrm_cached (STRING_TYPE dest, size_t n, const locale_data_t l_data,
	406	size_t idxmax, int32_t idxarr, const unsigned char rulearr)
	407	{
	408	uint_fast32_t nrules = l_data->nrules;
	409	unsigned char *rulesets = l_data->rulesets;
	410	USTRING_TYPE *weights = l_data->weights;
	411	uint_fast32_t pass;
	412	size_t needed = 0;
	413	size_t last_needed;
	414	size_t idxcnt;
	415
	416	/* Now the passes over the weights. */
	417	for (pass = 0; pass < nrules; ++pass)
	418	{
	419	size_t backw_stop = ~0ul;
	420	int rule = rulesets[rulearr[0] * nrules + pass];
	421	/* We assume that if a rule has defined `position' in one section
	422	this is true for all of them. */
	423	int position = rule & sort_position;
	424
	425	last_needed = needed;
	426	if (position == 0)
	427	{
	428	for (idxcnt = 0; idxcnt < idxmax; ++idxcnt)
	429	{
	430	if ((rule & sort_forward) != 0)
	431	{
	432	size_t len;
	433
	434	if (backw_stop != ~0ul)
	435	{
	436	/* Handle the pushed elements now. */
	437	size_t backw;
	438
	439	for (backw = idxcnt; backw > backw_stop; )
	440	{
	441	--backw;
	442	len = weights[idxarr[backw]++];
	443
	444	if (needed + len < n)
	445	while (len-- > 0)
	446	dest[needed++] = weights[idxarr[backw]++];
	447	else
	448	{
	449	/* No more characters fit into the buffer. */
	450	needed += len;
	451	idxarr[backw] += len;
	452	}
	453	}
	454
	455	backw_stop = ~0ul;
	456	}
	457
	458	/* Now handle the forward element. */
	459	len = weights[idxarr[idxcnt]++];
	460	if (needed + len < n)
	461	while (len-- > 0)
	462	dest[needed++] = weights[idxarr[idxcnt]++];
	463	else
	464	{
	465	/* No more characters fit into the buffer. */
	466	needed += len;
	467	idxarr[idxcnt] += len;
	468	}
	469	}
	470	else
	471	{
	472	/* Remember where the backwards series started. */
	473	if (backw_stop == ~0ul)
	474	backw_stop = idxcnt;
	475	}
	476
	477	rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
	478	}
	479
	480
	481	if (backw_stop != ~0ul)
	482	{
	483	/* Handle the pushed elements now. */
	484	size_t backw;
	485
	486	backw = idxcnt;
	487	while (backw > backw_stop)
	488	{
	489	size_t len = weights[idxarr[--backw]++];
	490
	491	if (needed + len < n)
	492	while (len-- > 0)
	493	dest[needed++] = weights[idxarr[backw]++];
	494	else
	495	{
	496	/* No more characters fit into the buffer. */
	497	needed += len;
	498	idxarr[backw] += len;
	499	}
	500	}
	501	}
	502	}
	503	else
	504	{
	505	int val = 1;
	506	#ifndef WIDE_CHAR_VERSION
	507	char buf[7];
	508	size_t buflen;
	509	#endif
	510	size_t i;
	511
	512	for (idxcnt = 0; idxcnt < idxmax; ++idxcnt)
	513	{
	514	if ((rule & sort_forward) != 0)
	515	{
	516	size_t len;
	517
	518	if (backw_stop != ~0ul)
	519	{
	520	/* Handle the pushed elements now. */
	521	size_t backw;
	522
	523	for (backw = idxcnt; backw > backw_stop; )
	524	{
	525	--backw;
	526	len = weights[idxarr[backw]++];
	527	if (len != 0)
	528	{
	529	#ifdef WIDE_CHAR_VERSION
	530	if (needed + 1 + len < n)
	531	{
	532	dest[needed] = val;
	533	for (i = 0; i < len; ++i)
	534	dest[needed + 1 + i] =
	535	weights[idxarr[backw] + i];
	536	}
	537	needed += 1 + len;
	538	#else
	539	buflen = utf8_encode (buf, val);
	540	if (needed + buflen + len < n)
	541	{
	542	for (i = 0; i < buflen; ++i)
	543	dest[needed + i] = buf[i];
	544	for (i = 0; i < len; ++i)
	545	dest[needed + buflen + i] =
	546	weights[idxarr[backw] + i];
	547	}
	548	needed += buflen + len;
	549	#endif
	550	idxarr[backw] += len;
	551	val = 1;
	552	}
	553	else
	554	++val;
	555	}
	556
	557	backw_stop = ~0ul;
	558	}
	559
	560	/* Now handle the forward element. */
	561	len = weights[idxarr[idxcnt]++];
	562	if (len != 0)
	563	{
	564	#ifdef WIDE_CHAR_VERSION
	565	if (needed + 1+ len < n)
	566	{
	567	dest[needed] = val;
	568	for (i = 0; i < len; ++i)
	569	dest[needed + 1 + i] =
	570	weights[idxarr[idxcnt] + i];
	571	}
	572	needed += 1 + len;
	573	#else
	574	buflen = utf8_encode (buf, val);
	575	if (needed + buflen + len < n)
	576	{
	577	for (i = 0; i < buflen; ++i)
	578	dest[needed + i] = buf[i];
	579	for (i = 0; i < len; ++i)
	580	dest[needed + buflen + i] =
	581	weights[idxarr[idxcnt] + i];
	582	}
	583	needed += buflen + len;
	584	#endif
	585	idxarr[idxcnt] += len;
	586	val = 1;
	587	}
	588	else
	589	/* Note that we don't have to increment `idxarr[idxcnt]'
	590	since the length is zero. */
	591	++val;
	592	}
	593	else
	594	{
	595	/* Remember where the backwards series started. */
	596	if (backw_stop == ~0ul)
	597	backw_stop = idxcnt;
	598	}
	599
	600	rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
	601	}
	602
	603	if (backw_stop != ~0ul)
	604	{
	605	/* Handle the pushed elements now. */
	606	size_t backw;
	607
	608	backw = idxmax - 1;
	609	while (backw > backw_stop)
	610	{
	611	size_t len = weights[idxarr[--backw]++];
	612	if (len != 0)
	613	{
	614	#ifdef WIDE_CHAR_VERSION
	615	if (needed + 1 + len < n)
	616	{
	617	dest[needed] = val;
	618	for (i = 0; i < len; ++i)
	619	dest[needed + 1 + i] =
	620	weights[idxarr[backw] + i];
	621	}
	622	needed += 1 + len;
	623	#else
	624	buflen = utf8_encode (buf, val);
	625	if (needed + buflen + len < n)
	626	{
	627	for (i = 0; i < buflen; ++i)
	628	dest[needed + i] = buf[i];
	629	for (i = 0; i < len; ++i)
	630	dest[needed + buflen + i] =
	631	weights[idxarr[backw] + i];
	632	}
	633	needed += buflen + len;
	634	#endif
	635	idxarr[backw] += len;
	636	val = 1;
	637	}
	638	else
	639	++val;
	640	}
	641	}
	642	}
	643
	644	/* Finally store the byte to separate the passes or terminate
	645	the string. */
	646	if (needed < n)
	647	dest[needed] = pass + 1 < nrules ? L('\1') : L('\0');
	648	++needed;
	649	}
	650
	651	/* This is a little optimization: many collation specifications have
	652	a `position' rule at the end and if no non-ignored character
	653	is found the last \1 byte is immediately followed by a \0 byte
	654	signalling this. We can avoid the \1 byte(s). */
	655	if (needed > 2 && needed == last_needed + 1)
	656	{
	657	/* Remove the \1 byte. */
	658	if (--needed <= n)
	659	dest[needed - 1] = L('\0');
	660	}
	661
	662	/* Return the number of bytes/words we need, but don't count the NUL
	663	byte/word at the end. */
	664	return needed - 1;
	665	}
	666
	667	size_t
	668	STRXFRM (STRING_TYPE dest, const STRING_TYPE src, size_t n, locale_t l)
	669	{
	670	locale_data_t l_data;
	671	struct __locale_data *current = l->__locales[LC_COLLATE];
	672	l_data.nrules = current->values[_NL_ITEM_INDEX (_NL_COLLATE_NRULES)].word;
	673
	674	/* Handle byte comparison case. */
	675	if (l_data.nrules == 0)
	676	{
	677	size_t srclen = STRLEN (src);
	678
	679	if (n != 0)
	680	STPNCPY (dest, src, MIN (srclen + 1, n));
	681
	682	return srclen;
	683	}
	684
	685	/* Handle an empty string, code hereafter relies on strlen (src) > 0. */
	686	if (*src == L('\0'))
	687	{
	688	if (n != 0)
	689	*dest = L('\0');
	690	return 0;
	691	}
	692
	693	/* Get the locale data. */
	694	l_data.rulesets = (unsigned char *)
	695	current->values[_NL_ITEM_INDEX (_NL_COLLATE_RULESETS)].string;
	696	l_data.table = (int32_t *)
	697	current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_TABLE,SUFFIX))].string;
	698	l_data.weights = (USTRING_TYPE *)
	699	current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_WEIGHT,SUFFIX))].string;
	700	l_data.extra = (USTRING_TYPE *)
	701	current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_EXTRA,SUFFIX))].string;
	702	l_data.indirect = (int32_t *)
	703	current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_INDIRECT,SUFFIX))].string;
	704
	705	assert (((uintptr_t) l_data.table) % __alignof__ (l_data.table[0]) == 0);
	706	assert (((uintptr_t) l_data.weights) % __alignof__ (l_data.weights[0]) == 0);
	707	assert (((uintptr_t) l_data.extra) % __alignof__ (l_data.extra[0]) == 0);
	708	assert (((uintptr_t) l_data.indirect) % __alignof__ (l_data.indirect[0]) == 0);
	709
	710	/* We need the elements of the string as unsigned values since they
	711	are used as indeces. */
	712	const USTRING_TYPE usrc = (const USTRING_TYPE ) src;
	713
	714	/* Allocate cache for small strings on the stack and fill it with weight and
	715	rule indices. If the cache size is not sufficient, continue with the
	716	uncached xfrm version. */
	717	size_t idxmax = 0;
	718	const USTRING_TYPE *cur = usrc;
	719	int32_t idxarr = alloca (SMALL_STR_SIZE sizeof (int32_t));
	720	unsigned char *rulearr = alloca (SMALL_STR_SIZE + 1);
	721
	722	do
	723	{
	724	int32_t tmp = findidx (l_data.table, l_data.indirect, l_data.extra, &cur,
	725	-1);
	726	rulearr[idxmax] = tmp >> 24;
	727	idxarr[idxmax] = tmp & 0xffffff;
	728
	729	++idxmax;
	730	}
	731	while (*cur != L('\0') && idxmax < SMALL_STR_SIZE);
	732
	733	/* This element is only read, the value never used but to determine
	734	another value which then is ignored. */
	735	rulearr[idxmax] = '\0';
	736
	737	/* Do the transformation. */
	738	if (*cur == L('\0'))
	739	return do_xfrm_cached (dest, n, &l_data, idxmax, idxarr, rulearr);
	740	else
	741	return do_xfrm (usrc, dest, n, &l_data);
	742	}
	743	libc_hidden_def (STRXFRM)
	744
	745	#ifndef WIDE_CHAR_VERSION
	746	weak_alias (__strxfrm_l, strxfrm_l)
	747	#endif