[thirdparty/util-linux.git] / libblkid / src / encode.c


/*
 * encode.c - string conversion routines (mostly for compatibility with
 *            udev/volume_id)
 *
 * Copyright (C) 2008 Kay Sievers <kay.sievers@vrfy.org>
 * Copyright (C) 2009 Karel Zak <kzak@redhat.com>
 *
 * This file may be redistributed under the terms of the
 * GNU Lesser General Public License.
 */
#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>
#include <unistd.h>
#include <errno.h>
#include <string.h>
#include <ctype.h>

#include "blkidP.h"

/*
 * Additional characters that blkid_safe_string() hands to replace_chars()
 * as its whitelist, i.e. characters kept unchanged on top of the built-in
 * set accepted by is_whitelisted().
 */
#define UDEV_ALLOWED_CHARS_INPUT               "/ $%?,"

/**
 * SECTION: encode
 * @title: Encoding utils
 * @short_description: encode strings to safe udev-compatible formats
 *
 */

/*
 * Number of bytes the UTF-8 sequence starting at str[0] claims to occupy,
 * derived only from the lead byte: the count of leading 1-bits selects the
 * length. Returns 1 for ASCII (including NUL), 2..6 for a multi-byte lead
 * byte and 0 when the byte cannot start a sequence (continuation byte,
 * 0xfe or 0xff).
 */
static int utf8_encoded_expected_len(const char *str)
{
	const unsigned char lead = (unsigned char) *str;
	unsigned int bit;
	int ones = 0;

	/* count consecutive set bits starting from the most significant one */
	for (bit = 0x80; bit != 0 && (lead & bit) != 0; bit >>= 1)
		ones++;

	switch (ones) {
	case 0:
		/* 0xxxxxxx */
		return 1;
	case 2:
	case 3:
	case 4:
	case 5:
	case 6:
		/* 110xxxxx .. 1111110x */
		return ones;
	default:
		/* 10xxxxxx (continuation), 0xfe and 0xff */
		return 0;
	}
}

/*
 * Decode the UTF-8 sequence starting at str[0] into its numeric value.
 *
 * The sequence length is taken from the lead byte. A single-byte sequence
 * is returned as is. Returns -1 when the lead byte is not a valid start
 * byte or when one of the following bytes is not of the form 10xxxxxx.
 */
static int utf8_encoded_to_unichar(const char *str)
{
	/* payload bits of the lead byte, indexed by sequence length */
	static const int lead_payload[] = {
		0, 0, 0x1f, 0x0f, 0x07, 0x03, 0x01
	};
	const int seqlen = utf8_encoded_expected_len(str);
	int value;
	int pos;

	if (seqlen == 1)
		return (int)str[0];
	if (seqlen < 2 || seqlen > 6)
		return -1;

	value = (int)str[0] & lead_payload[seqlen];

	for (pos = 1; pos < seqlen; pos++) {
		const int byte = (int)str[pos];

		/* every trailing byte must be a continuation byte */
		if ((byte & 0xc0) != 0x80)
			return -1;
		/* append its six payload bits */
		value = (value << 6) | (byte & 0x3f);
	}

	return value;
}

/*
 * Number of bytes (1..6) needed to encode the value @unichar in UTF-8:
 * the position of the first exclusive upper bound the value stays below,
 * or 6 when it is at or above all of them.
 */
static int utf8_unichar_to_encoded_len(int unichar)
{
	/* exclusive upper bounds for 1-, 2-, 3-, 4- and 5-byte encodings */
	static const int upper_bound[] = {
		0x80, 0x800, 0x10000, 0x200000, 0x4000000
	};
	size_t n;

	for (n = 0; n < sizeof(upper_bound) / sizeof(upper_bound[0]); n++) {
		if (unichar < upper_bound[n])
			return (int)n + 1;
	}
	return 6;
}

/*
 * Tell whether @unichar lies in an accepted numeric range.
 *
 * Returns 0 for values above 0x10ffff, values in 0xd800..0xdfff, values in
 * 0xfdd0..0xfdef and values whose low 16 bits are all set; returns 1 for
 * everything else.
 */
static int utf8_unichar_valid_range(int unichar)
{
	const int too_large = unichar > 0x10ffff;
	const int in_d800_block = unichar >= 0xd800 && unichar <= 0xdfff;
	const int in_fdd0_block = unichar >= 0xfdd0 && unichar <= 0xfdef;
	const int low_word_all_ones = (unichar & 0xffff) == 0xffff;

	return !(too_large || in_d800_block || in_fdd0_block ||
		 low_word_all_ones);
}

/*
 * Validate the UTF-8 sequence starting at str[0].
 *
 * Returns the sequence length in bytes (1 for ASCII) when it is valid, or
 * -1 when the lead byte is invalid, the string ends early, a trailing byte
 * is malformed, the value is encoded with more bytes than necessary, or
 * the decoded value is outside the accepted range.
 */
static int utf8_encoded_valid_unichar(const char *str)
{
	const int seqlen = utf8_encoded_expected_len(str);
	int codepoint;
	int pos;

	switch (seqlen) {
	case 0:
		/* not a start byte */
		return -1;
	case 1:
		/* ascii is always fine */
		return 1;
	default:
		break;
	}

	/*
	 * All bytes of the sequence need the high bit set; this also stops
	 * at the terminating NUL of a truncated string before anything
	 * behind it is read.
	 */
	for (pos = 0; pos < seqlen; pos++) {
		if (!(str[pos] & 0x80))
			return -1;
	}

	codepoint = utf8_encoded_to_unichar(str);

	/* reject over-long encodings and values outside the valid range */
	if (utf8_unichar_to_encoded_len(codepoint) != seqlen ||
	    !utf8_unichar_valid_range(codepoint))
		return -1;

	return seqlen;
}

/*
 * Copy @str to @to while normalising whitespace: leading and trailing
 * whitespace is dropped and every inner run of whitespace becomes a
 * single '_'.
 *
 * @str: NUL-terminated input string
 * @to:  output buffer
 * @len: size of @to in bytes
 *
 * At most @len - 1 bytes of @str are considered, so the result including
 * its terminating NUL always fits into @to. Nothing is written when @len
 * is zero.
 *
 * Returns 0.
 */
static int replace_whitespace(const char *str, char *to, size_t len)
{
	const char *nul;
	size_t i, j;

	if (len == 0)
		return 0;

	/*
	 * Bounded length of the input, leaving one byte of @to for the
	 * terminator; memchr() stops at the first NUL it finds.
	 */
	nul = memchr(str, '\0', len - 1);
	len = nul != NULL ? (size_t)(nul - str) : len - 1;

	/*
	 * strip trailing whitespace; the cast keeps bytes >= 0x80 inside the
	 * value range isspace() is defined for
	 */
	while (len && isspace((unsigned char)str[len-1]))
		len--;

	/* strip leading whitespace */
	i = 0;
	while ((i < len) && isspace((unsigned char)str[i]))
		i++;

	j = 0;
	while (i < len) {
		/* substitute multiple whitespace with a single '_' */
		if (isspace((unsigned char)str[i])) {
			/*
			 * cannot run past len: str[len-1] is known to be
			 * non-whitespace after the trailing strip above
			 */
			while (isspace((unsigned char)str[i]))
				i++;
			to[j++] = '_';
		}
		to[j++] = str[i++];
	}
	to[j] = '\0';
	return 0;
}

/*
 * Tell whether @c may be kept unchanged.
 *
 * @c:     character to test
 * @white: optional NUL-terminated set of additional accepted characters,
 *         may be NULL
 *
 * Accepted are ASCII digits and letters, the characters "#+-.:=@_" and
 * everything listed in @white. The NUL character is never accepted.
 *
 * Returns 1 when @c is accepted, 0 otherwise.
 */
static int is_whitelisted(char c, const char *white)
{
	/*
	 * strchr() treats the terminating NUL as part of the string it
	 * searches, so '\0' would otherwise be reported as a member of
	 * both sets below.
	 */
	if (c == '\0')
		return 0;

	if ((c >= '0' && c <= '9') ||
	    (c >= 'A' && c <= 'Z') ||
	    (c >= 'a' && c <= 'z') ||
	    strchr("#+-.:=@_", c) != NULL ||
	    (white != NULL && strchr(white, c) != NULL))
		return 1;
	return 0;
}

/*
 * Sanitise @str in place.
 *
 * @str:   NUL-terminated string to modify
 * @white: optional set of additional accepted characters, may be NULL
 *
 * Left untouched are characters accepted by is_whitelisted(), the two
 * characters of a "\x" hex-escape prefix and valid multi-byte UTF-8
 * sequences. Any other byte is overwritten: with ' ' when it is whitespace
 * and @white contains a space, with '_' otherwise. The string length does
 * not change.
 *
 * Returns the number of bytes that were overwritten.
 */
static int replace_chars(char *str, const char *white)
{
	/* loop invariant: does the caller permit ordinary spaces? */
	const int space_allowed = white != NULL && strchr(white, ' ') != NULL;
	size_t i = 0;
	int replaced = 0;

	while (str[i] != '\0') {
		int len;

		if (is_whitelisted(str[i], white)) {
			i++;
			continue;
		}

		/* accept hex encoding */
		if (str[i] == '\\' && str[i+1] == 'x') {
			i += 2;
			continue;
		}

		/* accept valid utf8 */
		len = utf8_encoded_valid_unichar(&str[i]);
		if (len > 1) {
			i += len;
			continue;
		}

		/*
		 * If space is allowed, replace whitespace with ordinary
		 * space; everything else is replaced with '_'. Bytes of
		 * invalid UTF-8 reach this point and may be negative as
		 * plain char, hence the cast before calling isspace().
		 */
		if (space_allowed && isspace((unsigned char)str[i]))
			str[i] = ' ';
		else
			str[i] = '_';
		i++;
		replaced++;
	}
	return replaced;
}

/**
 * blkid_encode_string:
 * @str: input string to be encoded
 * @str_enc: output string to store the encoded input string
 * @len: maximum size of the output string, which may be
 *       four times as long as the input string
 *
 * Encode all potentially unsafe characters of a string to the
 * corresponding hex value prefixed by '\x'. Valid multi-byte UTF-8
 * sequences and whitelisted ASCII characters are copied unchanged; the
 * backslash itself is always encoded.
 *
 * The function fails unless more than three bytes of @str_enc remain
 * unused after each encoded character. On failure the content of @str_enc
 * is unspecified and not necessarily NUL-terminated.
 *
 * Returns: 0 if the entire string was copied, non-zero otherwise.
 **/
int blkid_encode_string(const char *str, char *str_enc, size_t len)
{
	static const char hexdigits[] = "0123456789abcdef";
	size_t i, j;

	if (!str || !str_enc || !len)
		return -1;

	for (i = 0, j = 0; str[i] != '\0'; i++) {
		int seqlen;

		seqlen = utf8_encoded_valid_unichar(&str[i]);
		if (seqlen > 1) {
			/* valid multi-byte UTF-8 is copied verbatim */
			if (len-j < (size_t)seqlen)
				goto err;
			memcpy(&str_enc[j], &str[i], seqlen);
			j += seqlen;
			i += (seqlen-1);
		} else if (str[i] == '\\' || !is_whitelisted(str[i], NULL)) {
			const unsigned char byte = (unsigned char) str[i];

			if (len-j < 4)
				goto err;
			/*
			 * Emit exactly the four bytes "\xNN". A formatted
			 * print would append a NUL as a fifth byte, which
			 * the check above does not reserve room for.
			 */
			str_enc[j++] = '\\';
			str_enc[j++] = 'x';
			str_enc[j++] = hexdigits[byte >> 4];
			str_enc[j++] = hexdigits[byte & 0x0f];
		} else {
			if (len-j < 1)
				goto err;
			str_enc[j] = str[i];
			j++;
		}
		/* keep room for a worst-case next character */
		if (j+3 >= len)
			goto err;
	}
	if (len-j < 1)
		goto err;
	str_enc[j] = '\0';
	return 0;
err:
	return -1;
}

/**
 * blkid_safe_string:
 * @str: input string
 * @str_safe: output string
 * @len: size of output string
 *
 * Produces a sanitised copy of @str in @str_safe. Leading and trailing
 * whitespace is removed and inner runs of whitespace are replaced with a
 * single '_'. Afterwards whitelisted ASCII characters, hex-escaping and
 * valid utf8 are kept, while every other character is replaced.
 *
 * Returns: 0 on success or -1 in case of error (a NULL argument or a
 * zero @len).
 */
int blkid_safe_string(const char *str, char *str_safe, size_t len)
{
	if (str == NULL || str_safe == NULL || len == 0)
		return -1;

	/* first pass: copy with whitespace normalised */
	replace_whitespace(str, str_safe, len);

	/* second pass: neutralise remaining unsafe characters in place */
	replace_chars(str_safe, UDEV_ALLOWED_CHARS_INPUT);

	return 0;
}
Commit	Line	Data
dd20a05a KZ	1
dd20a05a KZ	2	/*
455fe9a0	3	* encode.c - string conversion routines (mostly for compatibility with
dd20a05a KZ	4	* udev/volume_id)
	5	*
	6	* Copyright (C) 2008 Kay Sievers <kay.sievers@vrfy.org>
	7	* Copyright (C) 2009 Karel Zak <kzak@redhat.com>
	8	*
	9	* This file may be redistributed under the terms of the
	10	* GNU Lesser General Public License.
	11	*/
	12	#include <stdio.h>
	13	#include <stdlib.h>
	14	#include <stddef.h>
	15	#include <unistd.h>
	16	#include <errno.h>
	17	#include <string.h>
	18	#include <ctype.h>
	19
dd20a05a KZ	20	#include "blkidP.h"
	21
	22	#define UDEV_ALLOWED_CHARS_INPUT "/ $%?,"
	23
488e52be KZ	24	/**
	25	* SECTION: encode
	26	* @title: Encoding utils
	27	* @short_description: encode strings to safe udev-compatible formats
	28	*
	29	*/
	30
dd20a05a KZ	31	/* count of characters used to encode one unicode char */
	32	static int utf8_encoded_expected_len(const char *str)
	33	{
	34	unsigned char c = (unsigned char)str[0];
	35
	36	if (c < 0x80)
	37	return 1;
	38	if ((c & 0xe0) == 0xc0)
	39	return 2;
	40	if ((c & 0xf0) == 0xe0)
	41	return 3;
	42	if ((c & 0xf8) == 0xf0)
	43	return 4;
	44	if ((c & 0xfc) == 0xf8)
	45	return 5;
	46	if ((c & 0xfe) == 0xfc)
	47	return 6;
	48	return 0;
	49	}
	50
	51	/* decode one unicode char */
	52	static int utf8_encoded_to_unichar(const char *str)
	53	{
	54	int unichar;
	55	int len;
	56	int i;
	57
	58	len = utf8_encoded_expected_len(str);
	59	switch (len) {
	60	case 1:
	61	return (int)str[0];
	62	case 2:
	63	unichar = str[0] & 0x1f;
	64	break;
	65	case 3:
	66	unichar = (int)str[0] & 0x0f;
	67	break;
	68	case 4:
	69	unichar = (int)str[0] & 0x07;
	70	break;
	71	case 5:
	72	unichar = (int)str[0] & 0x03;
	73	break;
	74	case 6:
	75	unichar = (int)str[0] & 0x01;
	76	break;
	77	default:
	78	return -1;
	79	}
	80
	81	for (i = 1; i < len; i++) {
	82	if (((int)str[i] & 0xc0) != 0x80)
	83	return -1;
	84	unichar <<= 6;
	85	unichar \|= (int)str[i] & 0x3f;
	86	}
	87
	88	return unichar;
	89	}
	90
	91	/* expected size used to encode one unicode char */
	92	static int utf8_unichar_to_encoded_len(int unichar)
	93	{
	94	if (unichar < 0x80)
95	return 1;
96	if (unichar < 0x800)
97	return 2;
98	if (unichar < 0x10000)
99	return 3;
100	if (unichar < 0x200000)
101	return 4;
102	if (unichar < 0x4000000)
103	return 5;
104	return 6;
105	}
106
107	/* check if unicode char has a valid numeric range */
108	static int utf8_unichar_valid_range(int unichar)
109	{
110	if (unichar > 0x10ffff)
111	return 0;
112	if ((unichar & 0xfffff800) == 0xd800)
113	return 0;
114	if ((unichar > 0xfdcf) && (unichar < 0xfdf0))
115	return 0;
116	if ((unichar & 0xffff) == 0xffff)
117	return 0;
118	return 1;
119	}
120
121	/* validate one encoded unicode char and return its length */
122	static int utf8_encoded_valid_unichar(const char *str)
123	{
124	int len;
125	int unichar;
126	int i;
127
128	len = utf8_encoded_expected_len(str);
129	if (len == 0)
130	return -1;
131
132	/* ascii is valid */
133	if (len == 1)
134	return 1;
135
136	/* check if expected encoded chars are available */
137	for (i = 0; i < len; i++)
138	if ((str[i] & 0x80) != 0x80)
139	return -1;
140
141	unichar = utf8_encoded_to_unichar(str);
142
143	/* check if encoded length matches encoded value */
144	if (utf8_unichar_to_encoded_len(unichar) != len)
145	return -1;
146
147	/* check if value has valid range */
148	if (!utf8_unichar_valid_range(unichar))
149	return -1;
150
151	return len;
152	}
153
154	static int replace_whitespace(const char str, char to, size_t len)
155	{
156	size_t i, j;
157
158	/* strip trailing whitespace */
159	len = strnlen(str, len);
160	while (len && isspace(str[len-1]))
161	len--;
162
163	/* strip leading whitespace */
164	i = 0;
0cd7ecef	165	while ((i < len) && isspace(str[i]))
dd20a05a KZ	166	i++;
	167
	168	j = 0;
	169	while (i < len) {
	170	/* substitute multiple whitespace with a single '_' */
	171	if (isspace(str[i])) {
	172	while (isspace(str[i]))
	173	i++;
	174	to[j++] = '_';
	175	}
	176	to[j++] = str[i++];
	177	}
	178	to[j] = '\0';
	179	return 0;
	180	}
	181
	182	static int is_whitelisted(char c, const char *white)
	183	{
	184	if ((c >= '0' && c <= '9') \|\|
	185	(c >= 'A' && c <= 'Z') \|\|
	186	(c >= 'a' && c <= 'z') \|\|
	187	strchr("#+-.:=@_", c) != NULL \|\|
	188	(white != NULL && strchr(white, c) != NULL))
	189	return 1;
	190	return 0;
	191	}
	192
	193	/* allow chars in whitelist, plain ascii, hex-escaping and valid utf8 */
	194	static int replace_chars(char str, const char white)
	195	{
	196	size_t i = 0;
	197	int replaced = 0;
	198
	199	while (str[i] != '\0') {
	200	int len;
	201
	202	if (is_whitelisted(str[i], white)) {
	203	i++;
	204	continue;
	205	}
	206
	207	/* accept hex encoding */
	208	if (str[i] == '\\' && str[i+1] == 'x') {
	209	i += 2;
	210	continue;
	211	}
	212
	213	/* accept valid utf8 */
	214	len = utf8_encoded_valid_unichar(&str[i]);
	215	if (len > 1) {
	216	i += len;
	217	continue;
	218	}
	219
	220	/* if space is allowed, replace whitespace with ordinary space */
	221	if (isspace(str[i]) && white != NULL && strchr(white, ' ') != NULL) {
	222	str[i] = ' ';
	223	i++;
	224	replaced++;
	225	continue;
	226	}
	227
	228	/* everything else is replaced with '_' */
	229	str[i] = '_';
230	i++;
231	replaced++;
232	}
233	return replaced;
234	}
235
236	/**
237	* blkid_encode_string:
238	* @str: input string to be encoded
239	* @str_enc: output string to store the encoded input string
240	* @len: maximum size of the output string, which may be
241	* four times as long as the input string
242	*
243	* Encode all potentially unsafe characters of a string to the
244	* corresponding hex value prefixed by '\x'.
245	*
246	* Returns: 0 if the entire string was copied, non-zero otherwise.
247	**/
248	int blkid_encode_string(const char str, char str_enc, size_t len)
249	{
250	size_t i, j;
251
e3436956	252	if (!str \|\| !str_enc \|\| !len)
dd20a05a KZ	253	return -1;
dd20a05a KZ	254
dd20a05a KZ	255	for (i = 0, j = 0; str[i] != '\0'; i++) {
	256	int seqlen;
	257
	258	seqlen = utf8_encoded_valid_unichar(&str[i]);
	259	if (seqlen > 1) {
3096d61a FZ	260	if (len-j < (size_t)seqlen)
3096d61a FZ	261	goto err;
dd20a05a KZ	262	memcpy(&str_enc[j], &str[i], seqlen);
	263	j += seqlen;
	264	i += (seqlen-1);
	265	} else if (str[i] == '\\' \|\| !is_whitelisted(str[i], NULL)) {
3096d61a FZ	266	if (len-j < 4)
3096d61a FZ	267	goto err;
dd20a05a KZ	268	sprintf(&str_enc[j], "\\x%02x", (unsigned char) str[i]);
	269	j += 4;
	270	} else {
3096d61a FZ	271	if (len-j < 1)
3096d61a FZ	272	goto err;
dd20a05a KZ	273	str_enc[j] = str[i];
	274	j++;
	275	}
	276	if (j+3 >= len)
	277	goto err;
	278	}
3096d61a FZ	279	if (len-j < 1)
3096d61a FZ	280	goto err;
dd20a05a KZ	281	str_enc[j] = '\0';
	282	return 0;
	283	err:
	284	return -1;
	285	}
	286
	287	/**
	288	* blkid_safe_string:
	289	* @str: input string
	290	* @str_safe: output string
	291	* @len: size of output string
	292	*
	293	* Allows plain ascii, hex-escaping and valid utf8. Replaces all whitespaces
	294	* with '_'.
488e52be KZ	295	*
488e52be KZ	296	* Returns: 0 on success or -1 in case of error.
dd20a05a KZ	297	*/
	298	int blkid_safe_string(const char str, char str_safe, size_t len)
	299	{
e3436956 KZ	300	if (!str \|\| !str_safe \|\| !len)
e3436956 KZ	301	return -1;
17d6fe2e	302	replace_whitespace(str, str_safe, len);
dd20a05a KZ	303	replace_chars(str_safe, UDEV_ALLOWED_CHARS_INPUT);
	304	return 0;
	305	}