util/str: helper to check for utf-8 validity

author Douglas Bagnall <douglas.bagnall@catalyst.net.nz>

Thu, 6 Jul 2023 03:31:52 +0000 (15:31 +1200)

committer Andrew Bartlett <abartlet@samba.org>

Tue, 26 Sep 2023 23:45:36 +0000 (23:45 +0000)
author Douglas Bagnall <douglas.bagnall@catalyst.net.nz>
Thu, 6 Jul 2023 03:31:52 +0000 (15:31 +1200)
committer Andrew Bartlett <abartlet@samba.org>
Tue, 26 Sep 2023 23:45:36 +0000 (23:45 +0000)
diff --git a/lib/util/charset/charset.h b/lib/util/charset/charset.h

index 941a8be214663b568d3edb7b11b8e73c5e078377..391430b0cf5780a4edd89f565ee43545b1f4c78f 100644 (file)
--- a/lib/util/charset/charset.h
+++ b/lib/util/charset/charset.h
@@ -167,6 +167,11 @@ char *strrchr_m(const char *s, char c);
  char *strchr_m(const char *s, char c);
  char *strstr_m(const char *src, const char *findstr);
  
+bool utf8_check(const char *input, size_t maxlen,
+               size_t *byte_len,
+               size_t *char_len,
+               size_t *utf16_len);
+
  bool push_ascii_talloc(TALLOC_CTX *ctx, char **dest, const char *src, size_t *converted_size);
  bool push_ucs2_talloc(TALLOC_CTX *ctx, smb_ucs2_t **dest, const char *src, size_t *converted_size);
  bool push_utf8_talloc(TALLOC_CTX *ctx, char **dest, const char *src, size_t *converted_size);
diff --git a/lib/util/charset/util_unistr.c b/lib/util/charset/util_unistr.c

index f8cadf28507f08b93b7ab7222c970a884c7a42d4..ac80f902362043537f9f9f53b50b145e2261518e 100644 (file)
--- a/lib/util/charset/util_unistr.c
+++ b/lib/util/charset/util_unistr.c
@@ -222,6 +222,127 @@ size_t utf16_len_n(const void *src, size_t n)
         return len;
  }
  
+
+/**
+ * Determine the length and validity of a utf-8 string.
+ *
+ * @param input the string pointer
+ * @param maxlen maximum size of the string
+ * @param byte_len receives the length of the valid section
+ * @param char_len receives the number of unicode characters in the valid section
+ * @param utf16_len receives the number of bytes the string would need in UTF16 encoding.
+ *
+ * @return true if the input is valid up to maxlen, or a '\0' byte, otherwise false.
+ */
+bool utf8_check(const char *input, size_t maxlen,
+               size_t *byte_len,
+               size_t *char_len,
+               size_t *utf16_len)
+{
+       const uint8_t *s = (const uint8_t *)input;
+       size_t i;
+       size_t chars = 0;
+       size_t long_chars = 0;
+       uint32_t codepoint;
+       uint8_t a, b, c, d;
+       for (i = 0; i < maxlen; i++, chars++) {
+               if (s[i] == 0) {
+                       break;
+               }
+               if (s[i] < 0x80) {
+                       continue;
+               }
+               if ((s[i] & 0xe0) == 0xc0) {
+                       /* 110xxxxx 10xxxxxx */
+                       a = s[i];
+                       if (maxlen - i < 2) {
+                               goto error;
+                       }
+                       b = s[i + 1];
+                       if ((b & 0xc0) != 0x80) {
+                               goto error;
+                       }
+                       codepoint = (a & 31) << 6 | (b & 63);
+                       if (codepoint < 0x80) {
+                               goto error;
+                       }
+                       i++;
+                       continue;
+               }
+               if ((s[i] & 0xf0) == 0xe0) {
+                       /* 1110xxxx 10xxxxxx 10xxxxxx */
+                       if (maxlen - i < 3) {
+                               goto error;
+                       }
+                       a = s[i];
+                       b = s[i + 1];
+                       c = s[i + 2];
+                       if ((b & 0xc0) != 0x80 || (c & 0xc0) != 0x80) {
+                               goto error;
+                       }
+                       codepoint = (c & 63) | (b & 63) << 6 | (a & 15) << 12;
+
+                       if (codepoint < 0x800) {
+                               goto error;
+                       }
+                       if (codepoint >= 0xd800 && codepoint <= 0xdfff) {
+                               /*
+                                * This is an invalid codepoint, per
+                                * RFC3629, as it encodes part of a
+                                * UTF-16 surrogate pair for a
+                                * character over U+10000, which ought
+                                * to have been encoded as a four byte
+                                * utf-8 sequence.
+                                */
+                               goto error;
+                       }
+                       i += 2;
+                       continue;
+               }
+
+               if ((s[i] & 0xf8) == 0xf0) {
+                       /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
+                       if (maxlen - i < 4) {
+                               goto error;
+                       }
+                       a = s[i];
+                       b = s[i + 1];
+                       c = s[i + 2];
+                       d = s[i + 3];
+
+                       if ((b & 0xc0) != 0x80 ||
+                           (c & 0xc0) != 0x80 ||
+                           (d & 0xc0) != 0x80) {
+                               goto error;
+                       }
+                       codepoint = (d & 63) | (c & 63) << 6 | (b & 63) << 12 | (a & 7) << 18;
+
+                       if (codepoint < 0x10000 || codepoint > 0x10ffff) {
+                               goto error;
+                       }
+                       /* this one will need two UTF16 characters */
+                       long_chars++;
+                       i += 3;
+                       continue;
+               }
+               /*
+                * If it wasn't handled yet, it's wrong.
+                */
+               goto error;
+       }
+       *byte_len = i;
+       *char_len = chars;
+       *utf16_len = chars + long_chars;
+       return true;
+
+error:
+       *byte_len = i;
+       *char_len = chars;
+       *utf16_len = chars + long_chars;
+       return false;
+}
+
+
  /**
   * Copy a string from a char* unix src to a dos codepage string destination.
   *
author	Douglas Bagnall <douglas.bagnall@catalyst.net.nz>
	Thu, 6 Jul 2023 03:31:52 +0000 (15:31 +1200)
committer	Andrew Bartlett <abartlet@samba.org>
	Tue, 26 Sep 2023 23:45:36 +0000 (23:45 +0000)
lib/util/charset/charset.h		patch \| blob \| blame \| history
lib/util/charset/util_unistr.c		patch \| blob \| blame \| history