ksmbd: add support for surrogate pair conversion

author     Namjae Jeon <linkinjeon@kernel.org>
           Sat, 21 Oct 2023 12:01:20 +0000 (21:01 +0900)
committer  Steve French <stfrench@microsoft.com>
           Mon, 23 Oct 2023 00:06:27 +0000 (19:06 -0500)

diff --git a/fs/smb/server/unicode.c b/fs/smb/server/unicode.c

index 393dd4a7432b659ee897eaefb21e8e593540d983..43ed29ee44ead6ef691dcecb0dd5fe9674bade30 100644 (file)
--- a/fs/smb/server/unicode.c
+++ b/fs/smb/server/unicode.c
@@ -13,46 +13,10 @@
  #include "unicode.h"
  #include "smb_common.h"
  
-/*
- * smb_utf16_bytes() - how long will a string be after conversion?
- * @from:      pointer to input string
- * @maxbytes:  don't go past this many bytes of input string
- * @codepage:  destination codepage
- *
- * Walk a utf16le string and return the number of bytes that the string will
- * be after being converted to the given charset, not including any null
- * termination required. Don't walk past maxbytes in the source buffer.
- *
- * Return:     string length after conversion
- */
-static int smb_utf16_bytes(const __le16 *from, int maxbytes,
-                          const struct nls_table *codepage)
-{
-       int i;
-       int charlen, outlen = 0;
-       int maxwords = maxbytes / 2;
-       char tmp[NLS_MAX_CHARSET_SIZE];
-       __u16 ftmp;
-
-       for (i = 0; i < maxwords; i++) {
-               ftmp = get_unaligned_le16(&from[i]);
-               if (ftmp == 0)
-                       break;
-
-               charlen = codepage->uni2char(ftmp, tmp, NLS_MAX_CHARSET_SIZE);
-               if (charlen > 0)
-                       outlen += charlen;
-               else
-                       outlen++;
-       }
-
-       return outlen;
-}
-
  /*
   * cifs_mapchar() - convert a host-endian char to proper char in codepage
   * @target:    where converted character should be copied
- * @src_char:  2 byte host-endian source character
+ * @from:      host-endian source string
   * @cp:                codepage to which character should be converted
   * @mapchar:   should character be mapped according to mapchars mount option?
   *
@@ -63,10 +27,13 @@ static int smb_utf16_bytes(const __le16 *from, int maxbytes,
   * Return:     string length after conversion
   */
  static int
-cifs_mapchar(char *target, const __u16 src_char, const struct nls_table *cp,
+cifs_mapchar(char *target, const __u16 *from, const struct nls_table *cp,
              bool mapchar)
  {
         int len = 1;
+       __u16 src_char;
+
+       src_char = *from;
  
         if (!mapchar)
                 goto cp_convert;
@@ -104,12 +71,66 @@ out:
  
  cp_convert:
         len = cp->uni2char(src_char, target, NLS_MAX_CHARSET_SIZE);
-       if (len <= 0) {
-               *target = '?';
-               len = 1;
-       }
+       if (len <= 0)
+               goto surrogate_pair;
  
         goto out;
+
+surrogate_pair:
+       /* convert SURROGATE_PAIR and IVS */
+       if (strcmp(cp->charset, "utf8"))
+               goto unknown;
+       len = utf16s_to_utf8s(from, 3, UTF16_LITTLE_ENDIAN, target, 6);
+       if (len <= 0)
+               goto unknown;
+       return len;
+
+unknown:
+       *target = '?';
+       len = 1;
+       goto out;
+}
+
+/*
+ * smb_utf16_bytes() - compute converted string length
+ * @from:      pointer to input string
+ * @maxbytes:  input string length
+ * @codepage:  destination codepage
+ *
+ * Walk a utf16le string and return the number of bytes that the string will
+ * be after being converted to the given charset, not including any null
+ * termination required. Don't walk past maxbytes in the source buffer.
+ *
+ * Return:     string length after conversion
+ */
+static int smb_utf16_bytes(const __le16 *from, int maxbytes,
+                          const struct nls_table *codepage)
+{
+       int i, j;
+       int charlen, outlen = 0;
+       int maxwords = maxbytes / 2;
+       char tmp[NLS_MAX_CHARSET_SIZE];
+       __u16 ftmp[3];
+
+       for (i = 0; i < maxwords; i++) {
+               ftmp[0] = get_unaligned_le16(&from[i]);
+               if (ftmp[0] == 0)
+                       break;
+               for (j = 1; j <= 2; j++) {
+                       if (i + j < maxwords)
+                               ftmp[j] = get_unaligned_le16(&from[i + j]);
+                       else
+                               ftmp[j] = 0;
+               }
+
+               charlen = cifs_mapchar(tmp, ftmp, codepage, 0);
+               if (charlen > 0)
+                       outlen += charlen;
+               else
+                       outlen++;
+       }
+
+       return outlen;
  }
  
  /*
@@ -139,12 +160,12 @@ cp_convert:
  static int smb_from_utf16(char *to, const __le16 *from, int tolen, int fromlen,
                           const struct nls_table *codepage, bool mapchar)
  {
-       int i, charlen, safelen;
+       int i, j, charlen, safelen;
         int outlen = 0;
         int nullsize = nls_nullsize(codepage);
         int fromwords = fromlen / 2;
         char tmp[NLS_MAX_CHARSET_SIZE];
-       __u16 ftmp;
+       __u16 ftmp[3];  /* ftmp[3] = 3array x 2bytes = 6bytes UTF-16 */
  
         /*
          * because the chars can be of varying widths, we need to take care
@@ -155,9 +176,15 @@ static int smb_from_utf16(char *to, const __le16 *from, int tolen, int fromlen,
         safelen = tolen - (NLS_MAX_CHARSET_SIZE + nullsize);
  
         for (i = 0; i < fromwords; i++) {
-               ftmp = get_unaligned_le16(&from[i]);
-               if (ftmp == 0)
+               ftmp[0] = get_unaligned_le16(&from[i]);
+               if (ftmp[0] == 0)
                         break;
+               for (j = 1; j <= 2; j++) {
+                       if (i + j < fromwords)
+                               ftmp[j] = get_unaligned_le16(&from[i + j]);
+                       else
+                               ftmp[j] = 0;
+               }
  
                 /*
                  * check to see if converting this character might make the
@@ -172,6 +199,19 @@ static int smb_from_utf16(char *to, const __le16 *from, int tolen, int fromlen,
                 /* put converted char into 'to' buffer */
                 charlen = cifs_mapchar(&to[outlen], ftmp, codepage, mapchar);
                 outlen += charlen;
+
+               /*
+                * charlen (=bytes of UTF-8 for 1 character)
+                * 4bytes UTF-8(surrogate pair) is charlen=4
+                * (4bytes UTF-16 code)
+                * 7-8bytes UTF-8(IVS) is charlen=3+4 or 4+4
+                * (2 UTF-8 pairs divided to 2 UTF-16 pairs)
+                */
+               if (charlen == 4)
+                       i++;
+               else if (charlen >= 5)
+                       /* 5-6bytes UTF-8 */
+                       i += 2;
         }
  
         /* properly null-terminate string */
@@ -306,6 +346,9 @@ int smbConvertToUTF16(__le16 *target, const char *source, int srclen,
         char src_char;
         __le16 dst_char;
         wchar_t tmp;
+       wchar_t wchar_to[6];    /* UTF-16 */
+       int ret;
+       unicode_t u;
  
         if (!mapchars)
                 return smb_strtoUTF16(target, source, srclen, cp);
@@ -348,11 +391,57 @@ int smbConvertToUTF16(__le16 *target, const char *source, int srclen,
                          * if no match, use question mark, which at least in
                          * some cases serves as wild card
                          */
-                       if (charlen < 1) {
-                               dst_char = cpu_to_le16(0x003f);
-                               charlen = 1;
+                       if (charlen > 0)
+                               goto ctoUTF16;
+
+                       /* convert SURROGATE_PAIR */
+                       if (strcmp(cp->charset, "utf8"))
+                               goto unknown;
+                       if (*(source + i) & 0x80) {
+                               charlen = utf8_to_utf32(source + i, 6, &u);
+                               if (charlen < 0)
+                                       goto unknown;
+                       } else
+                               goto unknown;
+                       ret  = utf8s_to_utf16s(source + i, charlen,
+                                       UTF16_LITTLE_ENDIAN,
+                                       wchar_to, 6);
+                       if (ret < 0)
+                               goto unknown;
+
+                       i += charlen;
+                       dst_char = cpu_to_le16(*wchar_to);
+                       if (charlen <= 3)
+                               /* 1-3bytes UTF-8 to 2bytes UTF-16 */
+                               put_unaligned(dst_char, &target[j]);
+                       else if (charlen == 4) {
+                               /*
+                                * 4bytes UTF-8(surrogate pair) to 4bytes UTF-16
+                                * 7-8bytes UTF-8(IVS) divided to 2 UTF-16
+                                * (charlen=3+4 or 4+4)
+                                */
+                               put_unaligned(dst_char, &target[j]);
+                               dst_char = cpu_to_le16(*(wchar_to + 1));
+                               j++;
+                               put_unaligned(dst_char, &target[j]);
+                       } else if (charlen >= 5) {
+                               /* 5-6bytes UTF-8 to 6bytes UTF-16 */
+                               put_unaligned(dst_char, &target[j]);
+                               dst_char = cpu_to_le16(*(wchar_to + 1));
+                               j++;
+                               put_unaligned(dst_char, &target[j]);
+                               dst_char = cpu_to_le16(*(wchar_to + 2));
+                               j++;
+                               put_unaligned(dst_char, &target[j]);
                         }
+                       continue;
+
+unknown:
+                       dst_char = cpu_to_le16(0x003f);
+                       charlen = 1;
                 }
+
+ctoUTF16:
                 /*
                  * character may take more than one byte in the source string,
                  * but will take exactly two bytes in the target string
author	Namjae Jeon <linkinjeon@kernel.org>
	Sat, 21 Oct 2023 12:01:20 +0000 (21:01 +0900)
committer	Steve French <stfrench@microsoft.com>
	Mon, 23 Oct 2023 00:06:27 +0000 (19:06 -0500)