[thirdparty/bash.git] / lib / sh / casemod.c

/* casemod.c -- functions to change case of strings */

/* Copyright (C) 2008,2009 Free Software Foundation, Inc.

   This file is part of GNU Bash, the Bourne Again SHell.

   Bash is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.

   Bash is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with Bash.  If not, see <http://www.gnu.org/licenses/>.
*/

#if defined (HAVE_CONFIG_H)
#  include <config.h>
#endif

#if defined (HAVE_UNISTD_H)
#  include <unistd.h>
#endif /* HAVE_UNISTD_H */

#include <stdc.h>

#include <bashansi.h>
#include <bashintl.h>
#include <bashtypes.h>

#include <stdio.h>
#include <ctype.h>
#include <xmalloc.h>

#include <shmbchar.h>
#include <shmbutil.h>
#include <chartypes.h>
#include <typemax.h>

#include <glob/strmatch.h>

/* Wide-character case conversions that only touch characters of the
   opposite case: _to_wupper changes WC only if iswlower (WC) is true,
   _to_wlower only if iswupper (WC) is true.  Anything else is returned
   unchanged.  Note that WC is evaluated more than once. */
#define _to_wupper(wc)	(iswlower (wc) ? towupper (wc) : (wc))
#define _to_wlower(wc)	(iswupper (wc) ? towlower (wc) : (wc))

#if !defined (HANDLE_MULTIBYTE)
/* Single-byte build: the `character' at offset I of S is just that byte.
   Convert it to unsigned char so that bytes with the high bit set are
   seen as values in the range 0..UCHAR_MAX rather than as negative numbers
   on platforms where plain char is signed; the <ctype.h> functions this
   value is handed to are only defined for EOF and unsigned char values. */
#  define cval(s, i)	((unsigned char)(s)[(i)])
/* Map the wide-character classification used by sh_modcase onto <ctype.h>. */
#  define iswalnum(c)	(isalnum(c))
/* Flip the case of X: upper case becomes lower case, anything else goes
   through TOUPPER. */
#  define TOGGLE(x)	(ISUPPER (x) ? tolower (x) : (TOUPPER (x)))
#else
/* Flip the case of the wide character X: upper case becomes lower case,
   anything else goes through _to_wupper. */
#  define TOGGLE(x)	(iswupper (x) ? towlower (x) : (_to_wupper(x)))
#endif

/* Case-modification operations accepted in the FLAGS argument of
   sh_modcase().  These must agree with the defines in externs.h

   As implemented by sh_modcase() in this file:
     CASE_NOOP		leave every character unchanged
     CASE_LOWER		convert every character to lower case
     CASE_UPPER		convert every character to upper case
     CASE_CAPITALIZE	upper-case the first character, lower-case the rest
     CASE_UNCAP		lower-case the first character, upper-case the rest
     CASE_TOGGLE	toggle the case of the first character of each word,
			leave the rest alone
     CASE_TOGGLEALL	toggle the case of every character
     CASE_UPFIRST	upper-case the first character, leave the rest alone
     CASE_LOWFIRST	lower-case the first character, leave the rest alone

   sh_modcase() compares FLAGS (minus CASE_USEWORDS) against these values
   with `==', so they are used as distinct operations, not combined. */
#define CASE_NOOP	0x0000
#define CASE_LOWER	0x0001
#define CASE_UPPER	0x0002
#define CASE_CAPITALIZE	0x0004
#define CASE_UNCAP	0x0008
#define CASE_TOGGLE	0x0010
#define CASE_TOGGLEALL	0x0020
#define CASE_UPFIRST	0x0040
#define CASE_LOWFIRST	0x0080

/* Modifier bit or'ed into one of the operations above; sh_modcase() strips
   it from FLAGS before selecting the operation.  When set, `first
   character' for CASE_CAPITALIZE, CASE_UNCAP, CASE_UPFIRST and CASE_LOWFIRST
   means the first character of each word instead of the first character of
   the whole string. */
#define CASE_USEWORDS	0x1000		/* modify behavior to act on words in passed string */

/* Defined outside this file.  sh_modcase() calls it as
   substring (str, start, next) to obtain the single character starting at
   byte offset START for pattern matching, and passes the result to free(),
   so it presumably returns newly-allocated storage -- confirm against the
   definition. */
extern char *substring __P((char *, int, int));

/* Fallback for environments where none of the headers included above
   defined UCHAR_MAX.  TYPE_MAXIMUM is not defined in this file; presumably
   it comes from <typemax.h>. */
#ifndef UCHAR_MAX
#  define UCHAR_MAX	TYPE_MAXIMUM(unsigned char)
#endif

#if defined (HANDLE_MULTIBYTE)
/* Return the character that starts at byte offset I of S as a wchar_t.

   The byte S[I] itself (converted to wchar_t) is returned when no
   multibyte decoding is needed or possible:
     - the locale is single-byte (MB_CUR_MAX == 1) or is_basic (S[I]) is true;
     - I is at or beyond the last byte of S, so nothing can follow it;
     - mbrtowc() rejects the sequence or produces a null wide character.
   Otherwise the wide character decoded by mbrtowc(), starting from the
   initial conversion state, is returned. */
static wchar_t
cval (s, i)
     char *s;
     int i;
{
  mbstate_t ps;
  wchar_t decoded;
  size_t nconv;
  int slen;

  if (MB_CUR_MAX != 1 && is_basic (s[i]) == 0)
    {
      slen = strlen (s);
      /* Only try to decode when at least one more byte follows S[I]. */
      if (i < (slen - 1))
	{
	  memset (&ps, 0, sizeof (mbstate_t));
	  nconv = mbrtowc (&decoded, s + i, slen - i, &ps);
	  if (MB_INVALIDCH (nconv) == 0 && MB_NULLWCH (nconv) == 0)
	    return decoded;
	}
    }

  /* Every other case falls back to the raw byte. */
  return ((wchar_t)s[i]);
}
#endif

/* Modify the case of characters in STRING matching PAT based on the value of
   FLAGS.  If PAT is null, modify the case of each character.

   STRING is the text to convert; it is not modified.
   PAT, if non-null, is matched (strmatch with FNM_EXTMATCH) against each
   character of STRING individually; characters that do not match are
   copied through unchanged.
   FLAGS is one of the CASE_* operations defined above, optionally or'ed
   with CASE_USEWORDS.  A `word' is a run of characters for which
   iswalnum() is true.

   Returns a new string obtained from xmalloc(); a null or empty STRING
   produces an empty string.

   The result is built in a separate buffer instead of being edited in
   place, because the upper- and lower-case forms of a multibyte character
   need not have the same length in bytes.  A character whose converted
   form cannot be encoded by wcrtomb() is copied through unchanged. */
char *
sh_modcase (string, pat, flags)
     const char *string;
     char *pat;
     int flags;
{
  int start, next, end;
  int inword, nc, nop, match, usewords;
  size_t retind;
  char *ret, *s;
  wchar_t wc;
#if defined (HANDLE_MULTIBYTE)
  wchar_t nwc;
  char mb[MB_LEN_MAX+1];
  size_t m, mlen;
  mbstate_t state, saved_state;
#endif

  if (string == 0 || *string == 0)
    {
      ret = (char *)xmalloc (1);
      ret[0] = '\0';
      return ret;
    }

#if defined (HANDLE_MULTIBYTE)
  memset (&state, 0, sizeof (mbstate_t));
#endif

  start = 0;
  end = strlen (string);

  /* Every pass through the loop below consumes at least one byte of STRING
     and appends either the bytes it consumed or the output of a single
     wcrtomb() call, which is at most MB_CUR_MAX bytes.  END * MB_CUR_MAX
     bytes plus the terminating null is therefore always enough. */
  ret = (char *)xmalloc (((size_t)end * MB_CUR_MAX) + 1);
  retind = 0;

  /* See if we are supposed to split on alphanumerics and operate on each word */
  usewords = (flags & CASE_USEWORDS);
  flags &= ~CASE_USEWORDS;

  inword = 0;
  while (start < end)
    {
      wc = cval ((char *)string, start);

      /* A non-alphanumeric character ends the current word. */
      if (iswalnum (wc) == 0)
	inword = 0;

      /* NEXT is the offset of the character following the current one. */
      next = start;
      ADVANCE_CHAR (string, end, next);

      if (pat)
	{
	  s = substring ((char *)string, start, next);
	  match = strmatch (pat, s, FNM_EXTMATCH) != FNM_NOMATCH;
	  free (s);
	  if (match == 0)
	    {
	      /* Characters that don't match PAT are passed through as-is. */
	      memcpy (ret + retind, string + start, next - start);
	      retind += next - start;
	      start = next;
	      inword = 1;
	      continue;
	    }
	}

      /* XXX - for now, the toggling operators work on the individual
	 words in the string, breaking on alphanumerics.  Should I
	 leave the capitalization operators to do that also? */
      if (flags == CASE_CAPITALIZE)
	{
	  if (usewords)
	    nop = inword ? CASE_LOWER : CASE_UPPER;
	  else
	    nop = (start > 0) ? CASE_LOWER : CASE_UPPER;
	  inword = 1;
	}
      else if (flags == CASE_UNCAP)
	{
	  if (usewords)
	    nop = inword ? CASE_UPPER : CASE_LOWER;
	  else
	    nop = (start > 0) ? CASE_UPPER : CASE_LOWER;
	  inword = 1;
	}
      else if (flags == CASE_UPFIRST)
	{
	  if (usewords)
	    nop = inword ? CASE_NOOP : CASE_UPPER;
	  else
	    nop = (start > 0) ? CASE_NOOP : CASE_UPPER;
	  inword = 1;
	}
      else if (flags == CASE_LOWFIRST)
	{
	  if (usewords)
	    nop = inword ? CASE_NOOP : CASE_LOWER;
	  else
	    nop = (start > 0) ? CASE_NOOP : CASE_LOWER;
	  inword = 1;
	}
      else if (flags == CASE_TOGGLE)
	{
	  nop = inword ? CASE_NOOP : CASE_TOGGLE;
	  inword = 1;
	}
      else
	nop = flags;

      /* Need to check UCHAR_MAX since wc may have already been converted to a
	 wide character by cval() */
      if (MB_CUR_MAX == 1 || (wc <= UCHAR_MAX && is_basic ((int)wc)))
	{
singlebyte:
	  switch (nop)
	  {
	  default:
	  case CASE_NOOP:  nc = wc; break;
	  case CASE_UPPER:  nc = TOUPPER (wc); break;
	  case CASE_LOWER:  nc = TOLOWER (wc); break;
	  case CASE_TOGGLEALL:
	  case CASE_TOGGLE: nc = TOGGLE (wc); break;
	  }
	  ret[retind++] = nc;
	  /* Only the first byte is converted here.  If the character
	     occupies more bytes than that, carry the others over untouched. */
	  if (next - start > 1)
	    {
	      memcpy (ret + retind, string + start + 1, next - start - 1);
	      retind += next - start - 1;
	    }
	}
      else
	{
#if defined (HANDLE_MULTIBYTE)
	  saved_state = state;
	  m = mbrtowc (&wc, string + start, end - start, &state);
	  if (MB_INVALIDCH (m))
	    {
	      /* The conversion state is unspecified after a failed
		 conversion; go back to the state we had before it. */
	      state = saved_state;
	      wc = (unsigned char)string[start];
	      goto singlebyte;
	    }
	  else if (MB_NULLWCH (m))
	    wc = L'\0';
	  switch (nop)
	  {
	  default:
	  case CASE_NOOP:  nwc = wc; break;
	  case CASE_UPPER:  nwc = _to_wupper (wc); break;
	  case CASE_LOWER:  nwc = _to_wlower (wc); break;
	  case CASE_TOGGLEALL:
	  case CASE_TOGGLE: nwc = TOGGLE (wc); break;
	  }

	  mlen = 0;
	  if (nwc != wc)	/*  just skip unchanged characters */
	    {
	      saved_state = state;
	      mlen = wcrtomb (mb, nwc, &state);
	      if (mlen == (size_t)-1)
		{
		  /* NWC has no representation in this locale. */
		  state = saved_state;
		  mlen = 0;
		}
	    }

	  if (mlen > 0)
	    {
	      /* Don't assume the same width as the original character. */
	      memcpy (ret + retind, mb, mlen);
	      retind += mlen;
	    }
	  else
	    {
	      /* Unchanged or unconvertible: keep the original bytes. */
	      memcpy (ret + retind, string + start, next - start);
	      retind += next - start;
	    }
#else
	  /* No multibyte support compiled in: pass the bytes through. */
	  memcpy (ret + retind, string + start, next - start);
	  retind += next - start;
#endif
	}

      start = next;
    }

  ret[retind] = '\0';
  return ret;
}
Commit	Line	Data
29d25b54 CR	1	/* casemod.c -- functions to change case of strings */
29d25b54 CR	2
012bac39	3	/* Copyright (C) 2008,2009 Free Software Foundation, Inc.
29d25b54 CR	4
	5	This file is part of GNU Bash, the Bourne Again SHell.
	6
2e4498b3 CR	7	Bash is free software: you can redistribute it and/or modify
	8	it under the terms of the GNU General Public License as published by
	9	the Free Software Foundation, either version 3 of the License, or
	10	(at your option) any later version.
	11
	12	Bash is distributed in the hope that it will be useful,
	13	but WITHOUT ANY WARRANTY; without even the implied warranty of
	14	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	15	GNU General Public License for more details.
	16
	17	You should have received a copy of the GNU General Public License
	18	along with Bash. If not, see <http://www.gnu.org/licenses/>.
	19	*/
29d25b54 CR	20
	21	#if defined (HAVE_CONFIG_H)
	22	# include <config.h>
	23	#endif
	24
	25	#if defined (HAVE_UNISTD_H)
	26	# include <unistd.h>
	27	#endif /* HAVE_UNISTD_H */
	28
	29	#include <stdc.h>
	30
	31	#include <bashansi.h>
	32	#include <bashintl.h>
	33	#include <bashtypes.h>
	34
	35	#include <stdio.h>
	36	#include <ctype.h>
	37	#include <xmalloc.h>
	38
49cf7828	39	#include <shmbchar.h>
29d25b54 CR	40	#include <shmbutil.h>
29d25b54 CR	41	#include <chartypes.h>
1442f67c	42	#include <typemax.h>
29d25b54 CR	43
	44	#include <glob/strmatch.h>
	45
	46	#define _to_wupper(wc) (iswlower (wc) ? towupper (wc) : (wc))
	47	#define _to_wlower(wc) (iswupper (wc) ? towlower (wc) : (wc))
	48
	49	#if !defined (HANDLE_MULTIBYTE)
	50	# define cval(s, i) ((s)[(i)])
	51	# define iswalnum(c) (isalnum(c))
	52	# define TOGGLE(x) (ISUPPER (x) ? tolower (x) : (TOUPPER (x)))
	53	#else
	54	# define TOGGLE(x) (iswupper (x) ? towlower (x) : (_to_wupper(x)))
	55	#endif
	56
	57	/* These must agree with the defines in externs.h */
e141c35a CR	58	#define CASE_NOOP 0x0000
	59	#define CASE_LOWER 0x0001
	60	#define CASE_UPPER 0x0002
	61	#define CASE_CAPITALIZE 0x0004
	62	#define CASE_UNCAP 0x0008
	63	#define CASE_TOGGLE 0x0010
	64	#define CASE_TOGGLEALL 0x0020
	65	#define CASE_UPFIRST 0x0040
	66	#define CASE_LOWFIRST 0x0080
	67
	68	#define CASE_USEWORDS 0x1000 /* modify behavior to act on words in passed string */
29d25b54 CR	69
	70	extern char substring __P((char , int, int));
	71
1442f67c CR	72	#ifndef UCHAR_MAX
	73	# define UCHAR_MAX TYPE_MAXIMUM(unsigned char)
	74	#endif
	75
29d25b54 CR	76	#if defined (HANDLE_MULTIBYTE)
	77	static wchar_t
	78	cval (s, i)
	79	char *s;
	80	int i;
	81	{
	82	size_t tmp;
	83	wchar_t wc;
	84	int l;
	85	mbstate_t mps;
	86
49cf7828	87	if (MB_CUR_MAX == 1 \|\| is_basic (s[i]))
29d25b54 CR	88	return ((wchar_t)s[i]);
	89	l = strlen (s);
	90	if (i >= (l - 1))
	91	return ((wchar_t)s[i]);
	92	memset (&mps, 0, sizeof (mbstate_t));
	93	tmp = mbrtowc (&wc, s + i, l - i, &mps);
	94	if (MB_INVALIDCH (tmp) \|\| MB_NULLWCH (tmp))
	95	return ((wchar_t)s[i]);
	96	return wc;
	97	}
	98	#endif
	99
	100	/* Modify the case of characters in STRING matching PAT based on the value of
	101	FLAGS. If PAT is null, modify the case of each character */
	102	char *
	103	sh_modcase (string, pat, flags)
	104	const char *string;
	105	char *pat;
	106	int flags;
	107	{
	108	int start, next, end;
e141c35a	109	int inword, c, nc, nop, match, usewords;
29d25b54 CR	110	char ret, s;
	111	wchar_t wc;
	112	#if defined (HANDLE_MULTIBYTE)
	113	wchar_t nwc;
	114	char mb[MB_LEN_MAX+1];
	115	int mlen;
bf6bd355	116	size_t m;
29d25b54 CR	117	mbstate_t state;
	118	#endif
	119
5f8cde23 CR	120	if (string == 0 \|\| *string == 0)
	121	{
	122	ret = (char *)xmalloc (1);
	123	ret[0] = '\0';
	124	return ret;
	125	}
	126
29d25b54 CR	127	#if defined (HANDLE_MULTIBYTE)
	128	memset (&state, 0, sizeof (mbstate_t));
	129	#endif
	130
	131	start = 0;
	132	end = strlen (string);
	133
	134	ret = (char *)xmalloc (end + 1);
	135	strcpy (ret, string);
	136
e141c35a CR	137	/* See if we are supposed to split on alphanumerics and operate on each word */
	138	usewords = (flags & CASE_USEWORDS);
	139	flags &= ~CASE_USEWORDS;
	140
29d25b54 CR	141	inword = 0;
	142	while (start < end)
	143	{
	144	wc = cval (ret, start);
	145
	146	if (iswalnum (wc) == 0)
	147	{
	148	inword = 0;
1442f67c	149	#if 0
29d25b54 CR	150	ADVANCE_CHAR (ret, end, start);
29d25b54 CR	151	continue;
1442f67c	152	#endif
29d25b54 CR	153	}
	154
	155	if (pat)
	156	{
	157	next = start;
	158	ADVANCE_CHAR (ret, end, next);
	159	s = substring (ret, start, next);
	160	match = strmatch (pat, s, FNM_EXTMATCH) != FNM_NOMATCH;
	161	free (s);
	162	if (match == 0)
	163	{
	164	start = next;
	165	inword = 1;
	166	continue;
	167	}
	168	}
	169
e141c35a CR	170	/* XXX - for now, the toggling operators work on the individual
	171	words in the string, breaking on alphanumerics. Should I
	172	leave the capitalization operators to do that also? */
29d25b54 CR	173	if (flags == CASE_CAPITALIZE)
29d25b54 CR	174	{
e141c35a CR	175	if (usewords)
	176	nop = inword ? CASE_LOWER : CASE_UPPER;
	177	else
	178	nop = (start > 0) ? CASE_LOWER : CASE_UPPER;
29d25b54 CR	179	inword = 1;
	180	}
	181	else if (flags == CASE_UNCAP)
	182	{
e141c35a CR	183	if (usewords)
	184	nop = inword ? CASE_UPPER : CASE_LOWER;
	185	else
	186	nop = (start > 0) ? CASE_UPPER : CASE_LOWER;
29d25b54 CR	187	inword = 1;
29d25b54 CR	188	}
e141c35a CR	189	else if (flags == CASE_UPFIRST)
	190	{
	191	if (usewords)
	192	nop = inword ? CASE_NOOP : CASE_UPPER;
	193	else
	194	nop = (start > 0) ? CASE_NOOP : CASE_UPPER;
	195	inword = 1;
	196	}
	197	else if (flags == CASE_LOWFIRST)
	198	{
	199	if (usewords)
	200	nop = inword ? CASE_NOOP : CASE_LOWER;
	201	else
	202	nop = (start > 0) ? CASE_NOOP : CASE_LOWER;
	203	inword = 1;
	204	}
29d25b54 CR	205	else if (flags == CASE_TOGGLE)
	206	{
	207	nop = inword ? CASE_NOOP : CASE_TOGGLE;
	208	inword = 1;
	209	}
	210	else
	211	nop = flags;
	212
1442f67c CR	213	/* Need to check UCHAR_MAX since wc may have already been converted to a
	214	wide character by cval() */
	215	if (MB_CUR_MAX == 1 \|\| (wc <= UCHAR_MAX && is_basic ((int)wc)))
29d25b54	216	{
1442f67c	217	singlebyte:
29d25b54 CR	218	switch (nop)
29d25b54 CR	219	{
dc9f44b3	220	default:
29d25b54 CR	221	case CASE_NOOP: nc = wc; break;
	222	case CASE_UPPER: nc = TOUPPER (wc); break;
	223	case CASE_LOWER: nc = TOLOWER (wc); break;
	224	case CASE_TOGGLEALL:
	225	case CASE_TOGGLE: nc = TOGGLE (wc); break;
	226	}
	227	ret[start] = nc;
	228	}
	229	#if defined (HANDLE_MULTIBYTE)
	230	else
	231	{
bf6bd355 CR	232	m = mbrtowc (&wc, string + start, end - start, &state);
bf6bd355 CR	233	if (MB_INVALIDCH (m))
1442f67c CR	234	{
	235	wc = (unsigned char)string[start];
	236	goto singlebyte;
	237	}
bf6bd355 CR	238	else if (MB_NULLWCH (m))
bf6bd355 CR	239	wc = L'\0';
29d25b54 CR	240	switch (nop)
29d25b54 CR	241	{
dc9f44b3	242	default:
29d25b54	243	case CASE_NOOP: nwc = wc; break;
49cf7828 CR	244	case CASE_UPPER: nwc = _to_wupper (wc); break;
49cf7828 CR	245	case CASE_LOWER: nwc = _to_wlower (wc); break;
29d25b54 CR	246	case CASE_TOGGLEALL:
	247	case CASE_TOGGLE: nwc = TOGGLE (wc); break;
	248	}
	249	if (nwc != wc) /* just skip unchanged characters */
	250	{
	251	mlen = wcrtomb (mb, nwc, &state);
	252	if (mlen > 0)
	253	mb[mlen] = '\0';
	254	/* Assume the same width */
	255	strncpy (ret + start, mb, mlen);
	256	}
	257	}
	258	#endif
	259
	260	/* This assumes that the upper and lower case versions are the same width. */
	261	ADVANCE_CHAR (ret, end, start);
	262	}
	263
	264	return ret;
	265	}