[thirdparty/systemd.git] / src / basic / utf8.c

/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/

/***
  This file is part of systemd.

  Copyright 2008-2011 Kay Sievers
  Copyright 2012 Lennart Poettering

  systemd is free software; you can redistribute it and/or modify it
  under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.

  systemd is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/

/* Parts of this file are based on the GLIB utf8 validation functions. The
 * original license text follows. */

/* gutf8.c - Operations on UTF-8 strings.
 *
 * Copyright (C) 1999 Tom Tromey
 * Copyright (C) 2000 Red Hat, Inc.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Library General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Library General Public License for more details.
 *
 * You should have received a copy of the GNU Library General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 */

#include <errno.h>
#include <inttypes.h>
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>

#include "alloc-util.h"
#include "hexdecoct.h"
#include "utf8.h"
#include "util.h"

/* Reports whether ch is a code point this file is willing to accept: it has
 * to lie inside the Unicode code space and must not hit any of the excluded
 * ranges tested below. */
bool unichar_is_valid(uint32_t ch) {
        /* At or past the end of the Unicode code space */
        const bool out_of_range = ch >= 0x110000;
        /* 0xD800..0xDFFF, the area reserved for UTF-16 surrogates */
        const bool utf16_reserved = (ch & 0xFFFFF800) == 0xD800;
        /* 0xFDD0..0xFDEF, a reserved block */
        const bool reserved_block = ch >= 0xFDD0 && ch <= 0xFDEF;
        /* Low 16 bits are 0xFFFE or 0xFFFF (includes the byte-swapped BOM value) */
        const bool fffe_or_ffff = (ch & 0xFFFE) == 0xFFFE;

        return !(out_of_range || utf16_reserved || reserved_block || fffe_or_ffff);
}

/* Tells whether ch counts as a control character for the printability check. */
static bool unichar_is_control(uint32_t ch) {

        /*
          Everything below ' ' is the C0 range; tab and newline live there
          too, but both are deliberately not reported as control characters.
          DEL (0x7F) and the C1 range that follows it up to 0x9F always are.
        */

        if (ch == '\t' || ch == '\n')
                return false;

        if (ch < ' ')
                return true;

        return ch >= 0x7F && ch <= 0x9F;
}

/* Number of bytes the UTF-8 sequence starting at str claims to occupy, judged
 * from its first byte only. Returns 0 if that byte cannot start a sequence
 * (a continuation byte, 0xFE or 0xFF). */
static int utf8_encoded_expected_len(const char *str) {
        /* Lead byte prefixes for sequences of 2, 3, 4, 5 and 6 bytes, in that order */
        static const struct {
                unsigned char mask;
                unsigned char prefix;
        } lead_bytes[] = {
                { 0xe0, 0xc0 },
                { 0xf0, 0xe0 },
                { 0xf8, 0xf0 },
                { 0xfc, 0xf8 },
                { 0xfe, 0xfc },
        };
        unsigned char first;
        int k;

        assert(str);

        first = (unsigned char) str[0];

        /* plain ASCII (including NUL) is a one byte sequence */
        if (first < 0x80)
                return 1;

        for (k = 0; k < 5; k++)
                if ((first & lead_bytes[k].mask) == lead_bytes[k].prefix)
                        return k + 2;

        return 0;
}

/* Decodes the UTF-8 sequence at the start of str into its numeric value.
 *
 * Returns the decoded value, or -EINVAL if the first byte cannot start a
 * sequence or one of the following bytes is not of the form 10xxxxxx. Only
 * the encoding structure is checked here; overlong forms and out-of-range
 * values are not rejected by this function. */
int utf8_encoded_to_unichar(const char *str) {
        /* Payload bits carried by the first byte, indexed by sequence length */
        static const int first_byte_mask[] = { 0, 0, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
        int value, n, k;

        assert(str);

        n = utf8_encoded_expected_len(str);

        /* single byte: the byte is the value */
        if (n == 1)
                return (int)str[0];

        if (n < 2 || n > 6)
                return -EINVAL;

        value = (int)str[0] & first_byte_mask[n];

        /* every continuation byte contributes its low six bits */
        k = 1;
        while (k < n) {
                int byte = (int)str[k];

                if ((byte & 0xc0) != 0x80)
                        return -EINVAL;

                value = (value << 6) | (byte & 0x3f);
                k++;
        }

        return value;
}

/* Checks whether the first length bytes of str are valid UTF-8 consisting
 * only of printable characters.
 *
 * str:     buffer to inspect; it does not have to be NUL-terminated
 * length:  number of bytes of str to look at
 * newline: if false, '\n' is rejected as well
 *
 * Returns true if every sequence in the buffer is valid, lies completely
 * inside the buffer and is not a control character as defined by
 * unichar_is_control(); false otherwise. An empty buffer yields true. */
bool utf8_is_printable_newline(const char* str, size_t length, bool newline) {
        const char *p;

        assert(str);

        for (p = str; length;) {
                int encoded_len, val;

                /* Look at the lead byte alone first. The validator below reads
                 * every byte the sequence claims to have, so a sequence that is
                 * cut off by the end of the buffer must be rejected before it is
                 * called, otherwise it would read past str + length. */
                encoded_len = utf8_encoded_expected_len(p);
                if (encoded_len <= 0 ||
                    (size_t) encoded_len > length)
                        return false;

                /* All claimed bytes are inside the buffer now, validate them */
                encoded_len = utf8_encoded_valid_unichar(p);
                if (encoded_len < 0)
                        return false;

                val = utf8_encoded_to_unichar(p);
                if (val < 0 ||
                    unichar_is_control(val) ||
                    (!newline && val == '\n'))
                        return false;

                length -= encoded_len;
                p += encoded_len;
        }

        return true;
}

/* Validates a NUL-terminated string as UTF-8.
 *
 * Returns str itself if every sequence up to the terminating NUL is accepted
 * by utf8_encoded_valid_unichar(), NULL as soon as one is rejected. */
const char *utf8_is_valid(const char *str) {
        const uint8_t *cursor;

        assert(str);

        cursor = (const uint8_t*) str;
        while (*cursor != 0) {
                int n;

                n = utf8_encoded_valid_unichar((const char *)cursor);
                if (n < 0)
                        return NULL;

                /* skip over the sequence just validated */
                cursor += n;
        }

        return str;
}

/* Returns a newly allocated copy of str in which every byte that does not
 * start a valid UTF-8 sequence is replaced by UTF8_REPLACEMENT_CHARACTER.
 * Valid sequences are copied unchanged.
 *
 * The result comes from malloc(); NULL is returned if the allocation fails. */
char *utf8_escape_invalid(const char *str) {
        const char *in;
        char *result, *out;

        assert(str);

        /* Sized for up to four output bytes per input byte, plus the NUL
         * (presumably enough for the replacement string -- its definition is
         * not visible in this file) */
        result = malloc(strlen(str) * 4 + 1);
        if (!result)
                return NULL;

        out = result;
        for (in = str; *in; ) {
                int n;

                n = utf8_encoded_valid_unichar(in);
                if (n <= 0) {
                        /* not a valid sequence: substitute and resync one byte later */
                        out = stpcpy(out, UTF8_REPLACEMENT_CHARACTER);
                        in += 1;
                        continue;
                }

                out = mempcpy(out, in, n);
                in += n;
        }

        *out = '\0';

        return result;
}

/* Returns a newly allocated copy of str that is safe to print:
 *
 *  - valid sequences for which utf8_is_printable() holds are copied as they are,
 *  - valid sequences for which it does not hold are written byte by byte as
 *    "\x" followed by two hex digits,
 *  - bytes that do not start a valid sequence are replaced by
 *    UTF8_REPLACEMENT_CHARACTER.
 *
 * The result comes from malloc(); NULL is returned if the allocation fails. */
char *utf8_escape_non_printable(const char *str) {
        char *result, *out;

        assert(str);

        /* An escaped byte takes four output bytes ("\xHH"), hence the factor */
        result = malloc(strlen(str) * 4 + 1);
        if (!result)
                return NULL;

        out = result;
        while (*str) {
                int n, k;

                n = utf8_encoded_valid_unichar(str);
                if (n <= 0) {
                        /* not a valid sequence: substitute and resync one byte later */
                        out = stpcpy(out, UTF8_REPLACEMENT_CHARACTER);
                        str += 1;
                        continue;
                }

                if (utf8_is_printable(str, n)) {
                        out = mempcpy(out, str, n);
                        str += n;
                        continue;
                }

                /* valid but not printable: hex-escape each byte of the sequence */
                for (k = 0; k < n; k++, str++) {
                        *(out++) = '\\';
                        *(out++) = 'x';
                        *(out++) = hexchar((int) *str >> 4);
                        *(out++) = hexchar((int) *str);
                }
        }

        *out = '\0';

        return result;
}

/* Checks that a NUL-terminated string consists of 7-bit ASCII only.
 *
 * Returns str (with the const qualifier dropped) if no byte has its high bit
 * set, NULL otherwise. */
char *ascii_is_valid(const char *str) {
        size_t i;

        assert(str);

        for (i = 0; str[i] != '\0'; i++)
                if ((unsigned char) str[i] & 0x80)
                        return NULL;

        return (char*) str;
}

/**
 * utf8_encode_unichar() - Encode single UCS-4 character as UTF-8
 * @out_utf8: output buffer of at least 4 bytes or NULL
 * @g: UCS-4 character to encode
 *
 * Writes the UTF-8 representation of @g into @out_utf8 without appending a
 * terminating zero byte. If @out_utf8 is NULL nothing is written and only the
 * length is computed. No validity check is done on @g beyond its magnitude.
 *
 * Returns: The number of bytes (1 to 4) the UTF-8 representation does or
 *          would occupy, or 0 if @g is 2^21 or larger and hence cannot be
 *          encoded in four bytes.
 */
size_t utf8_encode_unichar(char *out_utf8, uint32_t g) {
        size_t n;

        /* work out the length first ... */
        if (g < 0x80)
                n = 1;
        else if (g < 0x800)
                n = 2;
        else if (g < 0x10000)
                n = 3;
        else if (g < 0x200000)
                n = 4;
        else
                return 0;

        if (!out_utf8)
                return n;

        /* ... then emit the lead byte followed by six payload bits per byte */
        switch (n) {
        case 1:
                out_utf8[0] = g & 0x7f;
                break;
        case 2:
                out_utf8[0] = 0xc0 | ((g >> 6) & 0x1f);
                out_utf8[1] = 0x80 | (g & 0x3f);
                break;
        case 3:
                out_utf8[0] = 0xe0 | ((g >> 12) & 0x0f);
                out_utf8[1] = 0x80 | ((g >> 6) & 0x3f);
                out_utf8[2] = 0x80 | (g & 0x3f);
                break;
        default:
                out_utf8[0] = 0xf0 | ((g >> 18) & 0x07);
                out_utf8[1] = 0x80 | ((g >> 12) & 0x3f);
                out_utf8[2] = 0x80 | ((g >> 6) & 0x3f);
                out_utf8[3] = 0x80 | (g & 0x3f);
                break;
        }

        return n;
}

/* Converts a buffer of little-endian UTF-16 into a newly allocated,
 * NUL-terminated UTF-8 string.
 *
 * s:      input buffer
 * length: size of the input buffer in bytes (not in 16-bit units)
 *
 * Only complete 16-bit units are consumed; if length is odd the final byte is
 * ignored. A trailing surrogate without a preceding leading one is skipped. A
 * leading surrogate that is not followed by a trailing one is dropped and the
 * unit after it is processed on its own; a leading surrogate at the very end
 * of the input is dropped as well.
 *
 * Returns the converted string, which the caller owns, or NULL if the
 * allocation fails. */
char *utf16_to_utf8(const void *s, size_t length) {
        const uint8_t *f, *end;
        char *r, *t;

        r = new(char, (length * 4 + 1) / 2 + 1);
        if (!r)
                return NULL;

        f = s;
        end = f + length;
        t = r;

        /* Each iteration reads f[0] and f[1], so two bytes have to be left.
         * Testing only f < end would read one byte past the buffer whenever
         * length is odd. */
        while (end - f >= 2) {
                uint16_t w1, w2;

                /* see RFC 2781 section 2.2 */

                w1 = f[1] << 8 | f[0];
                f += 2;

                if (!utf16_is_surrogate(w1)) {
                        t += utf8_encode_unichar(t, w1);

                        continue;
                }

                if (utf16_is_trailing_surrogate(w1))
                        continue; /* trailing surrogate without a leading one, skip it */
                else if (end - f < 2)
                        break; /* no complete second unit left to pair with */

                w2 = f[1] << 8 | f[0];
                f += 2;

                if (!utf16_is_trailing_surrogate(w2)) {
                        /* drop the unpaired leading surrogate and look at w2 again
                         * as the start of the next character */
                        f -= 2;
                        continue;
                }

                t += utf8_encode_unichar(t, utf16_surrogate_pair_to_unichar(w1, w2));
        }

        *t = 0;
        return r;
}

/* Number of bytes (1 to 6) the shortest UTF-8 style encoding of unichar takes.
 * Negative values fall below the first limit and therefore yield 1. */
static int utf8_unichar_to_encoded_len(int unichar) {
        /* First value that no longer fits into 1, 2, 3, 4 and 5 bytes respectively */
        static const int limits[] = { 0x80, 0x800, 0x10000, 0x200000, 0x4000000 };
        int n;

        for (n = 0; n < 5; n++)
                if (unichar < limits[n])
                        return n + 1;

        return 6;
}

/* Validates the single UTF-8 encoded character at the start of str.
 *
 * Returns the length of the sequence in bytes if it is well-formed, encoded
 * in its shortest form and decodes to a value accepted by unichar_is_valid();
 * -EINVAL otherwise. Up to as many bytes as the first byte announces are read
 * from str; reading stops at the first byte without the high bit set, which
 * includes a terminating NUL. */
int utf8_encoded_valid_unichar(const char *str) {
        int expected, value, k;

        assert(str);

        expected = utf8_encoded_expected_len(str);
        switch (expected) {
        case 0:
                /* first byte cannot start a sequence */
                return -EINVAL;
        case 1:
                /* ascii is valid */
                return 1;
        default:
                break;
        }

        /* every byte of a multi-byte sequence must have the high bit set */
        for (k = 0; k < expected; k++)
                if (!(str[k] & 0x80))
                        return -EINVAL;

        value = utf8_encoded_to_unichar(str);

        /* Reject overlong encodings: the shortest encoding of the value has to
         * be as long as the sequence found. A negative error from the decoder
         * is caught here as well, since it maps to length 1 while expected is
         * at least 2. Then make sure the value itself is in a valid range. */
        if (utf8_unichar_to_encoded_len(value) != expected ||
            !unichar_is_valid(value))
                return -EINVAL;

        return expected;
}
Commit	Line	Data
7f110ff9 LP	1	/-- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil --/
	2
	3	/***
	4	This file is part of systemd.
	5
036ae95a	6	Copyright 2008-2011 Kay Sievers
7f110ff9 LP	7	Copyright 2012 Lennart Poettering
	8
	9	systemd is free software; you can redistribute it and/or modify it
5430f7f2 LP	10	under the terms of the GNU Lesser General Public License as published by
5430f7f2 LP	11	the Free Software Foundation; either version 2.1 of the License, or
7f110ff9 LP	12	(at your option) any later version.
	13
	14	systemd is distributed in the hope that it will be useful, but
	15	WITHOUT ANY WARRANTY; without even the implied warranty of
	16	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2	17	Lesser General Public License for more details.
7f110ff9	18
5430f7f2	19	You should have received a copy of the GNU Lesser General Public License
7f110ff9 LP	20	along with systemd; If not, see <http://www.gnu.org/licenses/>.
	21	***/
	22
036ae95a	23	/* Parts of this file are based on the GLIB utf8 validation functions. The
7f110ff9 LP	24	* original license text follows. */
	25
	26	/* gutf8.c - Operations on UTF-8 strings.
	27	*
	28	* Copyright (C) 1999 Tom Tromey
	29	* Copyright (C) 2000 Red Hat, Inc.
	30	*
	31	* This library is free software; you can redistribute it and/or
23757887	32	* modify it under the terms of the GNU Library General Public
7f110ff9 LP	33	* License as published by the Free Software Foundation; either
	34	* version 2 of the License, or (at your option) any later version.
	35	*
	36	* This library is distributed in the hope that it will be useful,
	37	* but WITHOUT ANY WARRANTY; without even the implied warranty of
23757887 SK	38	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23757887 SK	39	* Library General Public License for more details.
7f110ff9	40	*
23757887 SK	41	* You should have received a copy of the GNU Library General Public
	42	* License along with this library; if not, write to the Free Software
	43	* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
7f110ff9 LP	44	*/
	45
	46	#include <errno.h>
7f110ff9	47	#include <inttypes.h>
7f110ff9	48	#include <stdbool.h>
cf0fbc49 TA	49	#include <stdlib.h>
cf0fbc49 TA	50	#include <string.h>
7f110ff9	51
b5efdb8a	52	#include "alloc-util.h"
e4e73a63	53	#include "hexdecoct.h"
7f110ff9	54	#include "utf8.h"
2e3d0692	55	#include "util.h"
7f110ff9	56
dcd12626	57	bool unichar_is_valid(uint32_t ch) {
7f110ff9 LP	58
	59	if (ch >= 0x110000) /* End of unicode space */
	60	return false;
	61	if ((ch & 0xFFFFF800) == 0xD800) /* Reserved area for UTF-16 */
	62	return false;
	63	if ((ch >= 0xFDD0) && (ch <= 0xFDEF)) /* Reserved */
	64	return false;
	65	if ((ch & 0xFFFE) == 0xFFFE) /* BOM (Byte Order Mark) */
	66	return false;
	67
	68	return true;
	69	}
	70
dcd12626	71	static bool unichar_is_control(uint32_t ch) {
ba961854 ZJS	72
	73	/*
	74	0 to ' '-1 is the C0 range.
	75	DEL=0x7F, and DEL+1 to 0x9F is C1 range.
	76	'\t' is in C0 range, but more or less harmless and commonly used.
	77	*/
	78
31f7bf19	79	return (ch < ' ' && ch != '\t' && ch != '\n') \|\|
ba961854 ZJS	80	(0x7F <= ch && ch <= 0x9F);
	81	}
	82
7991ac34 DR	83	/* count of characters used to encode one unicode char */
7991ac34 DR	84	static int utf8_encoded_expected_len(const char *str) {
7e8185ef	85	unsigned char c;
ba961854	86
7e8185ef LP	87	assert(str);
	88
	89	c = (unsigned char) str[0];
7991ac34 DR	90	if (c < 0x80)
	91	return 1;
	92	if ((c & 0xe0) == 0xc0)
	93	return 2;
	94	if ((c & 0xf0) == 0xe0)
	95	return 3;
	96	if ((c & 0xf8) == 0xf0)
	97	return 4;
	98	if ((c & 0xfc) == 0xf8)
	99	return 5;
	100	if ((c & 0xfe) == 0xfc)
	101	return 6;
7e8185ef	102
7991ac34 DR	103	return 0;
7991ac34 DR	104	}
ba961854	105
7991ac34	106	/* decode one unicode char */
dcd12626 LP	107	int utf8_encoded_to_unichar(const char *str) {
dcd12626 LP	108	int unichar, len, i;
7e8185ef LP	109
7e8185ef LP	110	assert(str);
ba961854	111
7991ac34	112	len = utf8_encoded_expected_len(str);
7e8185ef	113
7991ac34 DR	114	switch (len) {
7991ac34 DR	115	case 1:
dcd12626	116	return (int)str[0];
7991ac34 DR	117	case 2:
	118	unichar = str[0] & 0x1f;
	119	break;
	120	case 3:
dcd12626	121	unichar = (int)str[0] & 0x0f;
7991ac34 DR	122	break;
7991ac34 DR	123	case 4:
dcd12626	124	unichar = (int)str[0] & 0x07;
7991ac34 DR	125	break;
7991ac34 DR	126	case 5:
dcd12626	127	unichar = (int)str[0] & 0x03;
7991ac34 DR	128	break;
7991ac34 DR	129	case 6:
dcd12626	130	unichar = (int)str[0] & 0x01;
7991ac34 DR	131	break;
7991ac34 DR	132	default:
7e8185ef	133	return -EINVAL;
ba961854 ZJS	134	}
ba961854 ZJS	135
7991ac34	136	for (i = 1; i < len; i++) {
dcd12626	137	if (((int)str[i] & 0xc0) != 0x80)
7e8185ef	138	return -EINVAL;
7991ac34	139	unichar <<= 6;
dcd12626	140	unichar \|= (int)str[i] & 0x3f;
7991ac34 DR	141	}
	142
	143	return unichar;
ba961854 ZJS	144	}
ba961854 ZJS	145
0ade5ffe	146	bool utf8_is_printable_newline(const char* str, size_t length, bool newline) {
6ed62be0	147	const char *p;
7f110ff9 LP	148
	149	assert(str);
	150
6ed62be0	151	for (p = str; length;) {
dcd12626	152	int encoded_len, val;
7e8185ef	153
6ed62be0	154	encoded_len = utf8_encoded_valid_unichar(p);
7e8185ef	155	if (encoded_len < 0 \|\|
144b3d9e LP	156	(size_t) encoded_len > length)
	157	return false;
	158
6ed62be0	159	val = utf8_encoded_to_unichar(p);
144b3d9e	160	if (val < 0 \|\|
f3ee6297	161	unichar_is_control(val) \|\|
0ade5ffe	162	(!newline && val == '\n'))
7991ac34	163	return false;
7f110ff9	164
7991ac34	165	length -= encoded_len;
a7176505	166	p += encoded_len;
7f110ff9 LP	167	}
7f110ff9 LP	168
7991ac34	169	return true;
7f110ff9 LP	170	}
7f110ff9 LP	171
7991ac34 DR	172	const char utf8_is_valid(const char str) {
7991ac34 DR	173	const uint8_t *p;
7f110ff9 LP	174
	175	assert(str);
	176
7991ac34	177	for (p = (const uint8_t) str; p; ) {
faaa5728 LP	178	int len;
	179
	180	len = utf8_encoded_valid_unichar((const char *)p);
7991ac34 DR	181	if (len < 0)
	182	return NULL;
	183
	184	p += len;
	185	}
7f110ff9	186
7991ac34	187	return str;
7f110ff9 LP	188	}
7f110ff9 LP	189
550a40ec ZJS	190	char utf8_escape_invalid(const char str) {
	191	char p, s;
	192
	193	assert(str);
	194
	195	p = s = malloc(strlen(str) * 4 + 1);
	196	if (!p)
	197	return NULL;
	198
	199	while (*str) {
	200	int len;
	201
	202	len = utf8_encoded_valid_unichar(str);
	203	if (len > 0) {
	204	s = mempcpy(s, str, len);
	205	str += len;
	206	} else {
3c6d3052	207	s = stpcpy(s, UTF8_REPLACEMENT_CHARACTER);
550a40ec ZJS	208	str += 1;
	209	}
	210	}
7e8185ef	211
550a40ec ZJS	212	*s = '\0';
	213
	214	return p;
	215	}
	216
fec84576 WC	217	char utf8_escape_non_printable(const char str) {
	218	char p, s;
	219
	220	assert(str);
	221
	222	p = s = malloc(strlen(str) * 4 + 1);
	223	if (!p)
	224	return NULL;
	225
	226	while (*str) {
	227	int len;
	228
	229	len = utf8_encoded_valid_unichar(str);
	230	if (len > 0) {
	231	if (utf8_is_printable(str, len)) {
	232	s = mempcpy(s, str, len);
	233	str += len;
	234	} else {
3c6d3052	235	while (len > 0) {
fec84576 WC	236	*(s++) = '\\';
	237	*(s++) = 'x';
	238	(s++) = hexchar((int) str >> 4);
	239	(s++) = hexchar((int) str);
fec84576	240
3c6d3052 LP	241	str += 1;
	242	len --;
	243	}
fec84576 WC	244	}
fec84576 WC	245	} else {
3c6d3052	246	s = stpcpy(s, UTF8_REPLACEMENT_CHARACTER);
fec84576 WC	247	str += 1;
	248	}
	249	}
	250
	251	*s = '\0';
	252
	253	return p;
	254	}
	255
7f110ff9 LP	256	char ascii_is_valid(const char str) {
	257	const char *p;
	258
	259	assert(str);
	260
	261	for (p = str; *p; p++)
	262	if ((unsigned char) *p >= 128)
	263	return NULL;
	264
	265	return (char*) str;
	266	}
	267
2bb4c7e3 TG	268	/**
	269	* utf8_encode_unichar() - Encode single UCS-4 character as UTF-8
	270	* @out_utf8: output buffer of at least 4 bytes or NULL
	271	* @g: UCS-4 character to encode
	272	*
	273	* This encodes a single UCS-4 character as UTF-8 and writes it into @out_utf8.
	274	* The length of the character is returned. It is not zero-terminated! If the
	275	* output buffer is NULL, only the length is returned.
	276	*
	277	* Returns: The length in bytes that the UTF-8 representation does or would
	278	* occupy.
	279	*/
dcd12626	280	size_t utf8_encode_unichar(char *out_utf8, uint32_t g) {
f3ee6297	281
2bb4c7e3 TG	282	if (g < (1 << 7)) {
	283	if (out_utf8)
	284	out_utf8[0] = g & 0x7f;
e7eebcfc	285	return 1;
2bb4c7e3 TG	286	} else if (g < (1 << 11)) {
	287	if (out_utf8) {
	288	out_utf8[0] = 0xc0 \| ((g >> 6) & 0x1f);
	289	out_utf8[1] = 0x80 \| (g & 0x3f);
	290	}
e7eebcfc	291	return 2;
2bb4c7e3 TG	292	} else if (g < (1 << 16)) {
	293	if (out_utf8) {
	294	out_utf8[0] = 0xe0 \| ((g >> 12) & 0x0f);
	295	out_utf8[1] = 0x80 \| ((g >> 6) & 0x3f);
	296	out_utf8[2] = 0x80 \| (g & 0x3f);
	297	}
e7eebcfc	298	return 3;
2bb4c7e3 TG	299	} else if (g < (1 << 21)) {
	300	if (out_utf8) {
	301	out_utf8[0] = 0xf0 \| ((g >> 18) & 0x07);
	302	out_utf8[1] = 0x80 \| ((g >> 12) & 0x3f);
	303	out_utf8[2] = 0x80 \| ((g >> 6) & 0x3f);
	304	out_utf8[3] = 0x80 \| (g & 0x3f);
	305	}
	306	return 4;
e7eebcfc	307	}
f3ee6297 LP	308
f3ee6297 LP	309	return 0;
e7eebcfc LP	310	}
e7eebcfc LP	311
2e3d0692	312	char utf16_to_utf8(const void s, size_t length) {
2e3d0692	313	const uint8_t *f;
e7eebcfc	314	char r, t;
2e3d0692	315
04166cb7	316	r = new(char, (length * 4 + 1) / 2 + 1);
2e3d0692 LP	317	if (!r)
	318	return NULL;
	319
04166cb7 TG	320	f = s;
	321	t = r;
	322
	323	while (f < (const uint8_t*) s + length) {
dcd12626	324	uint16_t w1, w2;
04166cb7 TG	325
	326	/* see RFC 2781 section 2.2 */
	327
	328	w1 = f[1] << 8 \| f[0];
	329	f += 2;
	330
	331	if (!utf16_is_surrogate(w1)) {
dcd12626	332	t += utf8_encode_unichar(t, w1);
04166cb7 TG	333
	334	continue;
	335	}
	336
	337	if (utf16_is_trailing_surrogate(w1))
	338	continue;
	339	else if (f >= (const uint8_t*) s + length)
	340	break;
	341
	342	w2 = f[1] << 8 \| f[0];
	343	f += 2;
	344
	345	if (!utf16_is_trailing_surrogate(w2)) {
	346	f -= 2;
	347	continue;
	348	}
	349
	350	t += utf8_encode_unichar(t, utf16_surrogate_pair_to_unichar(w1, w2));
	351	}
2e3d0692 LP	352
2e3d0692 LP	353	*t = 0;
7b4d7cc0	354	return r;
2e3d0692	355	}
02a36bc9	356
02a36bc9 DR	357	/* expected size used to encode one unicode char */
02a36bc9 DR	358	static int utf8_unichar_to_encoded_len(int unichar) {
7e8185ef	359
02a36bc9 DR	360	if (unichar < 0x80)
	361	return 1;
	362	if (unichar < 0x800)
	363	return 2;
	364	if (unichar < 0x10000)
	365	return 3;
	366	if (unichar < 0x200000)
	367	return 4;
	368	if (unichar < 0x4000000)
	369	return 5;
7e8185ef	370
02a36bc9 DR	371	return 6;
	372	}
	373
	374	/* validate one encoded unicode char and return its length */
	375	int utf8_encoded_valid_unichar(const char *str) {
dcd12626	376	int len, unichar, i;
7e8185ef LP	377
7e8185ef LP	378	assert(str);
02a36bc9 DR	379
	380	len = utf8_encoded_expected_len(str);
	381	if (len == 0)
7e8185ef	382	return -EINVAL;
02a36bc9 DR	383
	384	/* ascii is valid */
	385	if (len == 1)
	386	return 1;
	387
	388	/* check if expected encoded chars are available */
	389	for (i = 0; i < len; i++)
	390	if ((str[i] & 0x80) != 0x80)
7e8185ef	391	return -EINVAL;
02a36bc9 DR	392
	393	unichar = utf8_encoded_to_unichar(str);
	394
	395	/* check if encoded length matches encoded value */
	396	if (utf8_unichar_to_encoded_len(unichar) != len)
7e8185ef	397	return -EINVAL;
02a36bc9 DR	398
02a36bc9 DR	399	/* check if value has valid range */
f3ee6297	400	if (!unichar_is_valid(unichar))
7e8185ef	401	return -EINVAL;
02a36bc9 DR	402
	403	return len;
	404	}