[thirdparty/systemd.git] / src / shared / utf8.c

/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/

/***
  This file is part of systemd.

  Copyright 2008-2011 Kay Sievers
  Copyright 2012 Lennart Poettering

  systemd is free software; you can redistribute it and/or modify it
  under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.

  systemd is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/

/* Parts of this file are based on the GLIB utf8 validation functions. The
 * original license text follows. */

/* gutf8.c - Operations on UTF-8 strings.
 *
 * Copyright (C) 1999 Tom Tromey
 * Copyright (C) 2000 Red Hat, Inc.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Library General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Library General Public License for more details.
 *
 * You should have received a copy of the GNU Library General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 */

#include <errno.h>
#include <stdlib.h>
#include <inttypes.h>
#include <string.h>
#include <stdbool.h>

#include "utf8.h"
#include "util.h"

/* Tells whether ch is acceptable as a decoded character value.
 *
 * Returns false for values at or above 0x110000, for the UTF-16
 * surrogate range 0xD800..0xDFFF, for the reserved range
 * 0xFDD0..0xFDEF, and for any value whose low 16 bits are 0xFFFE or
 * 0xFFFF. Everything else yields true. */
static inline bool is_unicode_valid(uint32_t ch) {
        const bool past_end = ch > 0x10FFFF;
        const bool utf16_surrogate = ch >= 0xD800 && ch <= 0xDFFF;
        const bool reserved = ch >= 0xFDD0 && ch <= 0xFDEF;
        /* xxFFFE and xxFFFF in every plane */
        const bool plane_tail = (ch & 0xFFFE) == 0xFFFE;

        return !(past_end || utf16_surrogate || reserved || plane_tail);
}

/* Tells whether ch is treated as a control character.
 *
 * Everything below ' ' (the C0 range) counts as control, with the
 * exception of '\t' and '\n', which are let through. DEL (0x7F) and
 * the C1 range that follows it, up to and including 0x9F, count as
 * control as well. */
static bool is_unicode_control(uint32_t ch) {

        /* The two C0 characters that are explicitly tolerated */
        if (ch == '\t' || ch == '\n')
                return false;

        /* Remainder of C0 */
        if (ch < ' ')
                return true;

        /* DEL and C1 */
        return ch >= 0x7F && ch <= 0x9F;
}

/* Number of bytes the sequence starting at str claims to occupy,
 * judged from its first byte only.
 *
 * Returns 1 for a byte below 0x80, 2..6 for a lead byte with that many
 * leading one bits followed by a zero bit, and 0 for anything else
 * (a 10xxxxxx continuation byte, 0xFE or 0xFF). */
static int utf8_encoded_expected_len(const char *str) {
        const unsigned char lead = (unsigned char) *str;
        unsigned mask;
        int ones = 0;

        /* Count the run of one bits at the top of the lead byte */
        for (mask = 0x80; mask != 0 && (lead & mask) != 0; mask >>= 1)
                ones++;

        if (ones == 0)
                return 1;

        /* A single leading one is a continuation byte; seven or eight
         * leading ones do not start any sequence. */
        if (ones == 1 || ones > 6)
                return 0;

        return ones;
}

/* Decodes the single character that starts at str.
 *
 * The sequence length is taken from the lead byte via
 * utf8_encoded_expected_len(). Returns the decoded value, or -1 if the
 * lead byte does not start a sequence or if one of the following bytes
 * is not of the form 10xxxxxx. No check for overlong encodings or for
 * the validity of the resulting value is made here. */
int utf8_encoded_to_unichar(const char *str) {
        const int len = utf8_encoded_expected_len(str);
        const char *cont;
        int unichar;

        if (len < 1 || len > 6)
                return -1;

        if (len == 1)
                return (int)str[0];

        /* A lead byte of an n-byte sequence carries its payload in the
         * low (7 - n) bits: 0x1f for 2 bytes down to 0x01 for 6. */
        unichar = (int)str[0] & (0x7f >> len);

        for (cont = str + 1; cont < str + len; cont++) {
                const int byte = (int)*cont;

                /* Also catches a NUL that cuts the sequence short */
                if ((byte & 0xc0) != 0x80)
                        return -1;

                unichar = (unichar << 6) | (byte & 0x3f);
        }

        return unichar;
}

/* Checks that the first length bytes of str consist solely of valid
 * UTF-8 sequences, none of which decodes to a control character as
 * defined by is_unicode_control().
 *
 * Returns true if so (including for length == 0), false on the first
 * invalid sequence, control character, or sequence that would extend
 * beyond the given length. */
bool utf8_is_printable(const char* str, size_t length) {
        const uint8_t *p;

        assert(str);

        for (p = (const uint8_t*) str; length > 0;) {
                int encoded_len, val;

                encoded_len = utf8_encoded_valid_unichar((const char*) p);

                /* A sequence that claims more bytes than are left must be
                 * rejected: subtracting it from the unsigned length below
                 * would wrap around and send the loop far beyond the
                 * caller's buffer.
                 * NOTE(review): utf8_encoded_valid_unichar() is not told
                 * the remaining length; it stops at the first byte
                 * without the high bit set, so presumably callers pass
                 * NUL-terminated buffers -- confirm. */
                if (encoded_len < 0 || (size_t) encoded_len > length)
                        return false;

                val = utf8_encoded_to_unichar((const char*) p);
                if (val < 0 || is_unicode_control(val))
                        return false;

                length -= encoded_len;
                p += encoded_len;
        }

        return true;
}

/* Validates a NUL-terminated string as UTF-8.
 *
 * Walks the string one encoded character at a time using
 * utf8_encoded_valid_unichar(). Returns str itself if every sequence is
 * accepted, NULL as soon as one is rejected. */
const char *utf8_is_valid(const char *str) {
        const char *cursor;

        assert(str);

        cursor = str;
        while (*cursor != '\0') {
                const int step = utf8_encoded_valid_unichar(cursor);

                if (step < 0)
                        return NULL;

                cursor += step;
        }

        return str;
}

/* Produces a copy of str in which every byte that does not start a
 * valid UTF-8 sequence is replaced by UTF8_REPLACEMENT_CHARACTER.
 *
 * Valid sequences are copied through unchanged. The result is
 * allocated with malloc() and NUL-terminated; NULL is returned if the
 * allocation fails. */
char *utf8_escape_invalid(const char *str) {
        const char *in;
        char *result, *out;

        assert(str);

        /* Four output bytes are reserved per input byte, plus the NUL */
        result = malloc(strlen(str) * 4 + 1);
        if (!result)
                return NULL;

        out = result;
        for (in = str; *in; ) {
                const int len = utf8_encoded_valid_unichar(in);

                if (len > 0) {
                        out = mempcpy(out, in, len);
                        in += len;
                        continue;
                }

                /* Skip exactly one offending byte and retry from the next */
                out = mempcpy(out, UTF8_REPLACEMENT_CHARACTER, strlen(UTF8_REPLACEMENT_CHARACTER));
                in++;
        }
        *out = '\0';

        return result;
}

/* Checks that a NUL-terminated string contains 7-bit characters only.
 *
 * Returns str (with const cast away) if no byte has its high bit set,
 * NULL otherwise. */
char *ascii_is_valid(const char *str) {
        const unsigned char *byte;

        assert(str);

        byte = (const unsigned char*) str;
        while (*byte != 0) {
                if (*byte & 0x80)
                        return NULL;
                byte++;
        }

        return (char*) str;
}

/* Converts little-endian UTF-16 data to a newly allocated,
 * NUL-terminated UTF-8 string.
 *
 * s points to the input, length is its size in bytes. Conversion stops
 * at the end of the input or at the first 16-bit unit that is zero,
 * whichever comes first. If length is odd, the dangling last byte does
 * not form a complete unit and is ignored.
 *
 * Every 16-bit unit is encoded on its own into one, two or three
 * bytes; units in the surrogate range are not combined into pairs.
 *
 * Returns the string (to be released with free()), or NULL if memory
 * could not be allocated or length is too large to size the buffer. */
char *utf16_to_utf8(const void *s, size_t length) {
        const uint8_t *f = s;
        uint8_t *t;
        char *r;
        size_t i;

        /* Refuse lengths for which the buffer size below would wrap */
        if (length > (SIZE_MAX - 1) / 3)
                return NULL;

        /* At most three output bytes per two input bytes, plus NUL */
        r = malloc((length*3+1)/2 + 1);
        if (!r)
                return NULL;

        t = (uint8_t*) r;

        /* Only look at complete units: both f[i] and f[i+1] must lie
         * within the input. */
        for (i = 0; i + 1 < length; i += 2) {
                uint16_t c;

                c = (uint16_t) ((f[i+1] << 8) | f[i]);

                if (c == 0)
                        break;

                if (c < 0x80) {
                        *(t++) = (uint8_t) c;
                } else if (c < 0x800) {
                        *(t++) = (uint8_t) (0xc0 | (c >> 6));
                        *(t++) = (uint8_t) (0x80 | (c & 0x3f));
                } else {
                        *(t++) = (uint8_t) (0xe0 | (c >> 12));
                        *(t++) = (uint8_t) (0x80 | ((c >> 6) & 0x3f));
                        *(t++) = (uint8_t) (0x80 | (c & 0x3f));
                }
        }

        *t = 0;

        return r;
}

/* Number of bytes the shortest encoding of unichar takes.
 *
 * Returns 1 for values below 0x80 (negative values included), then 2,
 * 3, 4 and 5 for values below 0x800, 0x10000, 0x200000 and 0x4000000
 * respectively, and 6 for everything at or above that. */
static int utf8_unichar_to_encoded_len(int unichar) {
        /* limit[n] is the first value that no longer fits in n+1 bytes */
        static const int limit[] = { 0x80, 0x800, 0x10000, 0x200000, 0x4000000 };
        int n;

        for (n = 0; n < (int) (sizeof(limit) / sizeof(limit[0])); n++)
                if (unichar < limit[n])
                        return n + 1;

        return 6;
}

/* Validates the single encoded character that starts at str.
 *
 * Returns the number of bytes the character occupies (1 for a byte
 * below 0x80), or -1 if the lead byte does not start a sequence, the
 * sequence is cut short, a continuation byte is malformed, the value
 * was not encoded in its shortest form, or the decoded value is
 * rejected by is_unicode_valid(). */
int utf8_encoded_valid_unichar(const char *str) {
        const int len = utf8_encoded_expected_len(str);
        int unichar;
        int i;

        switch (len) {
        case 0:
                /* not a lead byte */
                return -1;
        case 1:
                /* ascii is valid */
                return 1;
        default:
                break;
        }

        /* All bytes of the sequence must have the high bit set. A NUL
         * fails this test, so the scan ends at the terminator of a
         * truncated string instead of running past it. */
        for (i = 0; i < len; i++)
                if (!(str[i] & 0x80))
                        return -1;

        unichar = utf8_encoded_to_unichar(str);

        /* Reject overlong forms first, then out-of-range values. A
         * decoding failure (-1) is caught by the length comparison too:
         * it maps to length 1, which never equals len at this point. */
        if (utf8_unichar_to_encoded_len(unichar) != len ||
            !is_unicode_valid(unichar))
                return -1;

        return len;
}
Commit	Line	Data
7f110ff9 LP	1	/-- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil --/
	2
	3	/***
	4	This file is part of systemd.
	5
036ae95a	6	Copyright 2008-2011 Kay Sievers
7f110ff9 LP	7	Copyright 2012 Lennart Poettering
	8
	9	systemd is free software; you can redistribute it and/or modify it
5430f7f2 LP	10	under the terms of the GNU Lesser General Public License as published by
5430f7f2 LP	11	the Free Software Foundation; either version 2.1 of the License, or
7f110ff9 LP	12	(at your option) any later version.
	13
	14	systemd is distributed in the hope that it will be useful, but
	15	WITHOUT ANY WARRANTY; without even the implied warranty of
	16	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2	17	Lesser General Public License for more details.
7f110ff9	18
5430f7f2	19	You should have received a copy of the GNU Lesser General Public License
7f110ff9 LP	20	along with systemd; If not, see <http://www.gnu.org/licenses/>.
	21	***/
	22
036ae95a	23	/* Parts of this file are based on the GLIB utf8 validation functions. The
7f110ff9 LP	24	* original license text follows. */
	25
	26	/* gutf8.c - Operations on UTF-8 strings.
	27	*
	28	* Copyright (C) 1999 Tom Tromey
	29	* Copyright (C) 2000 Red Hat, Inc.
	30	*
	31	* This library is free software; you can redistribute it and/or
23757887	32	* modify it under the terms of the GNU Library General Public
7f110ff9 LP	33	* License as published by the Free Software Foundation; either
	34	* version 2 of the License, or (at your option) any later version.
	35	*
	36	* This library is distributed in the hope that it will be useful,
	37	* but WITHOUT ANY WARRANTY; without even the implied warranty of
23757887 SK	38	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23757887 SK	39	* Library General Public License for more details.
7f110ff9	40	*
23757887 SK	41	* You should have received a copy of the GNU Library General Public
	42	* License along with this library; if not, write to the Free Software
	43	* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
7f110ff9 LP	44	*/
	45
	46	#include <errno.h>
	47	#include <stdlib.h>
	48	#include <inttypes.h>
	49	#include <string.h>
	50	#include <stdbool.h>
	51
	52	#include "utf8.h"
2e3d0692	53	#include "util.h"
7f110ff9	54
7f110ff9 LP	55	static inline bool is_unicode_valid(uint32_t ch) {
	56
	57	if (ch >= 0x110000) /* End of unicode space */
	58	return false;
	59	if ((ch & 0xFFFFF800) == 0xD800) /* Reserved area for UTF-16 */
	60	return false;
	61	if ((ch >= 0xFDD0) && (ch <= 0xFDEF)) /* Reserved */
	62	return false;
	63	if ((ch & 0xFFFE) == 0xFFFE) /* BOM (Byte Order Mark) */
	64	return false;
	65
	66	return true;
	67	}
	68
ba961854 ZJS	69	static bool is_unicode_control(uint32_t ch) {
	70
	71	/*
	72	0 to ' '-1 is the C0 range.
	73	DEL=0x7F, and DEL+1 to 0x9F is C1 range.
	74	'\t' is in C0 range, but more or less harmless and commonly used.
	75	*/
	76
31f7bf19	77	return (ch < ' ' && ch != '\t' && ch != '\n') \|\|
ba961854 ZJS	78	(0x7F <= ch && ch <= 0x9F);
	79	}
	80
7991ac34 DR	81	/* count of characters used to encode one unicode char */
	82	static int utf8_encoded_expected_len(const char *str) {
	83	unsigned char c = (unsigned char)str[0];
ba961854	84
7991ac34 DR	85	if (c < 0x80)
	86	return 1;
	87	if ((c & 0xe0) == 0xc0)
	88	return 2;
	89	if ((c & 0xf0) == 0xe0)
	90	return 3;
	91	if ((c & 0xf8) == 0xf0)
	92	return 4;
	93	if ((c & 0xfc) == 0xf8)
	94	return 5;
	95	if ((c & 0xfe) == 0xfc)
	96	return 6;
	97	return 0;
	98	}
ba961854	99
7991ac34	100	/* decode one unicode char */
f405e86d	101	int utf8_encoded_to_unichar(const char *str) {
7991ac34 DR	102	int unichar;
	103	int len;
	104	int i;
ba961854	105
7991ac34 DR	106	len = utf8_encoded_expected_len(str);
	107	switch (len) {
	108	case 1:
	109	return (int)str[0];
	110	case 2:
	111	unichar = str[0] & 0x1f;
	112	break;
	113	case 3:
	114	unichar = (int)str[0] & 0x0f;
	115	break;
	116	case 4:
	117	unichar = (int)str[0] & 0x07;
	118	break;
	119	case 5:
	120	unichar = (int)str[0] & 0x03;
	121	break;
	122	case 6:
	123	unichar = (int)str[0] & 0x01;
	124	break;
	125	default:
	126	return -1;
ba961854 ZJS	127	}
ba961854 ZJS	128
7991ac34 DR	129	for (i = 1; i < len; i++) {
	130	if (((int)str[i] & 0xc0) != 0x80)
	131	return -1;
	132	unichar <<= 6;
	133	unichar \|= (int)str[i] & 0x3f;
	134	}
	135
	136	return unichar;
ba961854 ZJS	137	}
ba961854 ZJS	138
7991ac34 DR	139	bool utf8_is_printable(const char* str, size_t length) {
7991ac34 DR	140	const uint8_t *p;
7f110ff9 LP	141
	142	assert(str);
	143
a7176505	144	for (p = (const uint8_t*) str; length;) {
7991ac34	145	int encoded_len = utf8_encoded_valid_unichar((const char *)p);
a7176505	146	int val = utf8_encoded_to_unichar((const char*)p);
7f110ff9	147
7991ac34 DR	148	if (encoded_len < 0 \|\| val < 0 \|\| is_unicode_control(val))
7991ac34 DR	149	return false;
7f110ff9	150
7991ac34	151	length -= encoded_len;
a7176505	152	p += encoded_len;
7f110ff9 LP	153	}
7f110ff9 LP	154
7991ac34	155	return true;
7f110ff9 LP	156	}
7f110ff9 LP	157
7991ac34 DR	158	const char utf8_is_valid(const char str) {
7991ac34 DR	159	const uint8_t *p;
7f110ff9 LP	160
	161	assert(str);
	162
7991ac34	163	for (p = (const uint8_t) str; p; ) {
faaa5728 LP	164	int len;
	165
	166	len = utf8_encoded_valid_unichar((const char *)p);
7991ac34 DR	167
	168	if (len < 0)
	169	return NULL;
	170
	171	p += len;
	172	}
7f110ff9	173
7991ac34	174	return str;
7f110ff9 LP	175	}
7f110ff9 LP	176
550a40ec ZJS	177	char utf8_escape_invalid(const char str) {
	178	char p, s;
	179
	180	assert(str);
	181
	182	p = s = malloc(strlen(str) * 4 + 1);
	183	if (!p)
	184	return NULL;
	185
	186	while (*str) {
	187	int len;
	188
	189	len = utf8_encoded_valid_unichar(str);
	190	if (len > 0) {
	191	s = mempcpy(s, str, len);
	192	str += len;
	193	} else {
	194	s = mempcpy(s, UTF8_REPLACEMENT_CHARACTER, strlen(UTF8_REPLACEMENT_CHARACTER));
	195	str += 1;
	196	}
	197	}
	198	*s = '\0';
	199
	200	return p;
	201	}
	202
7f110ff9 LP	203	char ascii_is_valid(const char str) {
	204	const char *p;
	205
	206	assert(str);
	207
	208	for (p = str; *p; p++)
	209	if ((unsigned char) *p >= 128)
	210	return NULL;
	211
	212	return (char*) str;
	213	}
	214
2e3d0692 LP	215	char utf16_to_utf8(const void s, size_t length) {
	216	char *r;
	217	const uint8_t *f;
	218	uint8_t *t;
	219
	220	r = new(char, (length*3+1)/2 + 1);
	221	if (!r)
	222	return NULL;
	223
	224	t = (uint8_t*) r;
	225
	226	for (f = s; f < (const uint8_t*) s + length; f += 2) {
	227	uint16_t c;
	228
	229	c = (f[1] << 8) \| f[0];
	230
	231	if (c == 0) {
	232	*t = 0;
	233	return r;
	234	} else if (c < 0x80) {
	235	*(t++) = (uint8_t) c;
	236	} else if (c < 0x800) {
	237	*(t++) = (uint8_t) (0xc0 \| (c >> 6));
	238	*(t++) = (uint8_t) (0x80 \| (c & 0x3f));
	239	} else {
	240	*(t++) = (uint8_t) (0xe0 \| (c >> 12));
	241	*(t++) = (uint8_t) (0x80 \| ((c >> 6) & 0x3f));
	242	*(t++) = (uint8_t) (0x80 \| (c & 0x3f));
	243	}
	244	}
	245
	246	*t = 0;
2e3d0692	247
7b4d7cc0	248	return r;
2e3d0692	249	}
02a36bc9	250
02a36bc9 DR	251	/* expected size used to encode one unicode char */
	252	static int utf8_unichar_to_encoded_len(int unichar) {
	253	if (unichar < 0x80)
	254	return 1;
	255	if (unichar < 0x800)
	256	return 2;
	257	if (unichar < 0x10000)
	258	return 3;
	259	if (unichar < 0x200000)
	260	return 4;
	261	if (unichar < 0x4000000)
	262	return 5;
	263	return 6;
	264	}
	265
	266	/* validate one encoded unicode char and return its length */
	267	int utf8_encoded_valid_unichar(const char *str) {
	268	int len;
	269	int unichar;
	270	int i;
	271
	272	len = utf8_encoded_expected_len(str);
	273	if (len == 0)
	274	return -1;
	275
	276	/* ascii is valid */
	277	if (len == 1)
	278	return 1;
	279
	280	/* check if expected encoded chars are available */
	281	for (i = 0; i < len; i++)
	282	if ((str[i] & 0x80) != 0x80)
	283	return -1;
	284
	285	unichar = utf8_encoded_to_unichar(str);
	286
	287	/* check if encoded length matches encoded value */
	288	if (utf8_unichar_to_encoded_len(unichar) != len)
	289	return -1;
	290
	291	/* check if value has valid range */
	292	if (!is_unicode_valid(unichar))
	293	return -1;
	294
	295	return len;
	296	}