From: Douglas Bagnall <douglas.bagnall@catalyst.net.nz>
Date: Thu, 6 Jul 2023 03:31:52 +0000 (+1200)
Subject: util/str: helper to check for utf-8 validity
X-Git-Tag: tevent-0.16.0~402
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=e4da279b1c06711c27e2aa1a4e36f35b674eaca4;p=thirdparty%2Fsamba.git

util/str: helper to check for utf-8 validity

Signed-off-by: Douglas Bagnall <douglas.bagnall@catalyst.net.nz>
Reviewed-by: Andrew Bartlett <abartlet@samba.org>
---

diff --git a/lib/util/charset/charset.h b/lib/util/charset/charset.h
index 941a8be2146..391430b0cf5 100644
--- a/lib/util/charset/charset.h
+++ b/lib/util/charset/charset.h
@@ -167,6 +167,11 @@ char *strrchr_m(const char *s, char c);
 char *strchr_m(const char *s, char c);
 char *strstr_m(const char *src, const char *findstr);
 
+bool utf8_check(const char *input, size_t maxlen,
+		size_t *byte_len,
+		size_t *char_len,
+		size_t *utf16_len);
+
 bool push_ascii_talloc(TALLOC_CTX *ctx, char **dest, const char *src, size_t *converted_size);
 bool push_ucs2_talloc(TALLOC_CTX *ctx, smb_ucs2_t **dest, const char *src, size_t *converted_size);
 bool push_utf8_talloc(TALLOC_CTX *ctx, char **dest, const char *src, size_t *converted_size);
diff --git a/lib/util/charset/util_unistr.c b/lib/util/charset/util_unistr.c
index f8cadf28507..ac80f902362 100644
--- a/lib/util/charset/util_unistr.c
+++ b/lib/util/charset/util_unistr.c
@@ -222,6 +222,127 @@ size_t utf16_len_n(const void *src, size_t n)
 	return len;
 }
 
+
+/**
+ * Determine the length and validity of a utf-8 string.
+ *
+ * @param input the string pointer
+ * @param maxlen maximum size of the string
+ * @param byte_len receives the length of the valid section
+ * @param char_len receives the number of unicode characters in the valid section
+ * @param utf16_len receives the number of bytes the string would need in UTF16 encoding.
+ *
+ * @return true if the input is valid up to maxlen, or a '\0' byte, otherwise false.
+ */
+bool utf8_check(const char *input, size_t maxlen,
+		size_t *byte_len,
+		size_t *char_len,
+		size_t *utf16_len)
+{
+	const uint8_t *s = (const uint8_t *)input;
+	size_t i;
+	size_t chars = 0;
+	size_t long_chars = 0;
+	uint32_t codepoint;
+	uint8_t a, b, c, d;
+	for (i = 0; i < maxlen; i++, chars++) {
+		if (s[i] == 0) {
+			break;
+		}
+		if (s[i] < 0x80) {
+			continue;
+		}
+		if ((s[i] & 0xe0) == 0xc0) {
+			/* 110xxxxx 10xxxxxx */
+			a = s[i];
+			if (maxlen - i < 2) {
+				goto error;
+			}
+			b = s[i + 1];
+			if ((b & 0xc0) != 0x80) {
+				goto error;
+			}
+			codepoint = (a & 31) << 6 | (b & 63);
+			if (codepoint < 0x80) {
+				goto error;
+			}
+			i++;
+			continue;
+		}
+		if ((s[i] & 0xf0) == 0xe0) {
+			/* 1110xxxx 10xxxxxx 10xxxxxx */
+			if (maxlen - i < 3) {
+				goto error;
+			}
+			a = s[i];
+			b = s[i + 1];
+			c = s[i + 2];
+			if ((b & 0xc0) != 0x80 || (c & 0xc0) != 0x80) {
+				goto error;
+			}
+			codepoint = (c & 63) | (b & 63) << 6 | (a & 15) << 12;
+
+			if (codepoint < 0x800) {
+				goto error;
+			}
+			if (codepoint >= 0xd800 && codepoint <= 0xdfff) {
+				/*
+				 * This is an invalid codepoint, per
+				 * RFC3629, as it encodes part of a
+				 * UTF-16 surrogate pair for a
+				 * character over U+10000, which ought
+				 * to have been encoded as a four byte
+				 * utf-8 sequence.
+				 */
+				goto error;
+			}
+			i += 2;
+			continue;
+		}
+
+		if ((s[i] & 0xf8) == 0xf0) {
+			/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
+			if (maxlen - i < 4) {
+				goto error;
+			}
+			a = s[i];
+			b = s[i + 1];
+			c = s[i + 2];
+			d = s[i + 3];
+
+			if ((b & 0xc0) != 0x80 ||
+			    (c & 0xc0) != 0x80 ||
+			    (d & 0xc0) != 0x80) {
+				goto error;
+			}
+			codepoint = (d & 63) | (c & 63) << 6 | (b & 63) << 12 | (a & 7) << 18;
+
+			if (codepoint < 0x10000 || codepoint > 0x10ffff) {
+				goto error;
+			}
+			/* this one will need two UTF16 characters */
+			long_chars++;
+			i += 3;
+			continue;
+		}
+		/*
+		 * If it wasn't handled yet, it's wrong.
+		 */
+		goto error;
+	}
+	*byte_len = i;
+	*char_len = chars;
+	*utf16_len = chars + long_chars;
+	return true;
+
+error:
+	*byte_len = i;
+	*char_len = chars;
+	*utf16_len = chars + long_chars;
+	return false;
+}
+
+
 /**
  * Copy a string from a char* unix src to a dos codepage string destination.
  *