From: Ralph Boehme <slow@samba.org>
Date: Tue, 9 Apr 2019 09:21:57 +0000 (+0200)
Subject: charset: add support for Unicode normalisation with libicu
X-Git-Tag: tdb-1.4.2~273
X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=107020793c7ea44e5e776a1401bbf4f8ccb9bd85;p=thirdparty%2Fsamba.git

charset: add support for Unicode normalisation with libicu

This adds a direct conversion hook using libicu to perform NFC <-> NFD
conversion on UTF8 strings. The defined charset strings are "UTF8-NFC" and
"UTF8-NFD". To convert from one to the other, the caller calls smb_iconv_open()
with the desired target and source charsets (in that order), e.g.

  smb_iconv_open("UTF8-NFD", "UTF8-NFC");

for converting from NFC to NFD.

Signed-off-by: Ralph Boehme <slow@samba.org>
Reviewed-by: Andrew Bartlett <abartlet@samba.org>
---

diff --git a/lib/util/charset/iconv.c b/lib/util/charset/iconv.c
index 4fae09fda52..14a1f8652e3 100644
--- a/lib/util/charset/iconv.c
+++ b/lib/util/charset/iconv.c
@@ -28,6 +28,11 @@
 #include "libcli/util/ntstatus.h"
 #include "lib/util/util_str_hex.h"
 
+#ifdef HAVE_ICU_I18N
+#include <unicode/ustring.h>
+#include <unicode/utrans.h>
+#endif
+
 #ifdef strcasecmp
 #undef strcasecmp
 #endif
@@ -165,6 +170,109 @@ static size_t sys_iconv(void *cd,
 }
 #endif
 
+#ifdef HAVE_ICU_I18N
+/**
+ * Direct conversion hook: normalise a UTF-8 string with an ICU transliterator.
+ *
+ * The input is converted to UTF-16, transliterated in place and converted
+ * back to UTF-8. Conversion is all-or-nothing: on success the whole input
+ * has been consumed, on failure the pointers and counters are left untouched.
+ *
+ * @param cd            UTransliterator handle to use
+ * @param inbuf         in/out: UTF-8 input, advanced past the consumed bytes
+ * @param inbytesleft   in/out: input bytes left, reduced by bytes consumed
+ * @param outbuf        in/out: output buffer, advanced past written bytes
+ * @param outbytesleft  in/out: output space left, reduced by bytes written
+ * @return number of UTF-8 bytes written, or (size_t)-1 with errno set
+ */
+static size_t sys_uconv(void *cd,
+			const char **inbuf,
+			size_t *inbytesleft,
+			char **outbuf,
+			size_t *outbytesleft)
+{
+	UTransliterator *t = (UTransliterator *)cd;
+	/* +1 so the VLA below is never zero-sized, even for empty input */
+	size_t bufsize = *inbytesleft * 2 + 1;
+	UChar ustr[bufsize];
+	UChar *up = NULL;
+	char *p = NULL;
+	int32_t ustrlen;
+	int32_t limit;
+	int32_t converted_len;
+	size_t inbuf_consumed;
+	size_t outbuf_consumed;
+	UErrorCode ue;
+
+	/* Convert from UTF-8 to UTF-16 */
+	ue = U_ZERO_ERROR;
+	up = u_strFromUTF8(ustr,           /* dst */
+			   bufsize,        /* dst buflen */
+			   &converted_len, /* dst written */
+			   *inbuf,         /* src */
+			   *inbytesleft,   /* src length */
+			   &ue);
+	if (up == NULL || U_FAILURE(ue)) {
+		/* Most likely malformed UTF-8; ustr[] should never overflow */
+		errno = (ue == U_BUFFER_OVERFLOW_ERROR) ? EOVERFLOW : EILSEQ;
+		return -1;
+	}
+
+	/*
+	 * u_strFromUTF8() converted the complete input, so all of it counts as
+	 * consumed. converted_len is a count of UTF-16 code units, not of input
+	 * bytes, and must not be used for advancing inbuf.
+	 */
+	inbuf_consumed = *inbytesleft;
+
+	/*
+	 * The following transliteration function takes two parameters, the
+	 * length of the text to be converted (converted_len) and a limit which
+	 * may be smaller than converted_len. We just set limit to converted_len
+	 * and also ignore the value returned in limit.
+	 */
+	limit = converted_len;
+
+	/* Inplace transliteration */
+	utrans_transUChars(t,
+			   ustr,           /* text */
+			   &converted_len, /* text length */
+			   bufsize,        /* text buflen */
+			   0,              /* start */
+			   &limit,         /* limit */
+			   &ue);
+	if (U_FAILURE(ue)) {
+		/* ustr[] too small for the result, or a transliterator error */
+		errno = (ue == U_BUFFER_OVERFLOW_ERROR) ? EOVERFLOW : EINVAL;
+		return -1;
+	}
+	ustrlen = converted_len;
+
+	/* Convert from UTF-16 back to UTF-8 */
+	ue = U_ZERO_ERROR;
+	p = u_strToUTF8(*outbuf,        /* dst */
+			*outbytesleft,  /* dst buflen */
+			&converted_len, /* dst required length */
+			ustr,           /* src */
+			ustrlen,        /* src length */
+			&ue);
+	if (p == NULL || U_FAILURE(ue)) {
+		/* The caller's result buffer is too small, or bad data */
+		errno = (ue == U_BUFFER_OVERFLOW_ERROR) ? E2BIG : EILSEQ;
+		return -1;
+	}
+	/* Overflow was rejected above, so converted_len fits into outbuf */
+	outbuf_consumed = converted_len;
+
+	*inbuf += inbuf_consumed;
+	*inbytesleft -= inbuf_consumed;
+	*outbuf += outbuf_consumed;
+	*outbytesleft -= outbuf_consumed;
+
+	return converted_len;
+}
+#endif
+
 /**
  * This is a simple portable iconv() implementaion.
  *
@@ -228,6 +336,16 @@ static bool is_utf16(const char *name)
 
 static int smb_iconv_t_destructor(smb_iconv_t hwd)
 {
+#ifdef HAVE_ICU_I18N
+	/*
+	 * This has to come first, as the cd_direct member won't be an iconv
+	 * handle and must not be passed to iconv_close().
+	 */
+	if (hwd->direct == sys_uconv) {
+		utrans_close(hwd->cd_direct);
+		return 0;
+	}
+#endif
 #ifdef HAVE_NATIVE_ICONV
 	if (hwd->cd_pull != NULL && hwd->cd_pull != (iconv_t)-1)
 		iconv_close(hwd->cd_pull);
@@ -302,6 +420,52 @@ _PUBLIC_ smb_iconv_t smb_iconv_open_ex(TALLOC_CTX *mem_ctx, const char *tocode,
 	}
 #endif
 
+#ifdef HAVE_ICU_I18N
+	if (strcasecmp(fromcode, "UTF8-NFD") == 0 &&
+	    strcasecmp(tocode, "UTF8-NFC") == 0)
+	{
+		U_STRING_DECL(t, "any-nfc", 7);
+		UErrorCode ue = 0;
+
+		U_STRING_INIT(t, "any-nfc", 7);
+
+		ret->cd_direct = utrans_openU(t,
+					      strlen("any-nfc"),
+					      UTRANS_FORWARD,
+					      NULL,
+					      0,
+					      NULL,
+					      &ue);
+		if (U_FAILURE(ue)) {
+			return (smb_iconv_t)-1;
+		}
+		ret->direct = sys_uconv;
+		return ret;
+	}
+
+	if (strcasecmp(fromcode, "UTF8-NFC") == 0 &&
+	    strcasecmp(tocode, "UTF8-NFD") == 0)
+	{
+		U_STRING_DECL(tname, "any-nfd", 7);
+		UErrorCode ue = 0;
+
+		U_STRING_INIT(tname, "any-nfd", 7);
+
+		ret->cd_direct = utrans_openU(tname,
+					      7,
+					      UTRANS_FORWARD,
+					      NULL,
+					      0,
+					      NULL,
+					      &ue);
+		if (U_FAILURE(ue)) {
+			return (smb_iconv_t)-1;
+		}
+		ret->direct = sys_uconv;
+		return ret;
+	}
+#endif
+
 	if (ret->pull == NULL && from == NULL) {
 		goto failed;
 	}
diff --git a/lib/util/charset/wscript_build b/lib/util/charset/wscript_build
index a3728f6a4bd..8fed718e7dc 100644
--- a/lib/util/charset/wscript_build
+++ b/lib/util/charset/wscript_build
@@ -2,7 +2,7 @@
 
 bld.SAMBA_SUBSYSTEM('ICONV_WRAPPER',
                     source='iconv.c',
-                    public_deps='iconv replace talloc')
+                    public_deps='iconv replace talloc ' +  bld.env['icu-libs'])
 
 bld.SAMBA_SUBSYSTEM('charset',
                     public_headers='charset.h',
diff --git a/lib/util/charset/wscript_configure b/lib/util/charset/wscript_configure
index d5ac5d0100f..9c27fc664f0 100644
--- a/lib/util/charset/wscript_configure
+++ b/lib/util/charset/wscript_configure
@@ -36,3 +36,16 @@ conf.CHECK_CODE('''
                 msg='Checking errno of iconv for illegal multibyte sequence',
                 lib='iconv',
                 headers='errno.h iconv.h')
+
+if conf.CHECK_CFG(package='icu-i18n',
+               args='--cflags --libs',
+               msg='Checking for icu-i18n',
+               uselib_store='ICU_I18N'):
+    for lib in conf.env['LIB_ICU_I18N']:
+        conf.CHECK_LIB(lib, shlib=True, mandatory=True)
+    conf.env['icu-libs'] = ' '.join(conf.env['LIB_ICU_I18N'])
+    if not conf.CHECK_HEADERS('unicode/ustring.h'):
+        conf.fatal('Found libicu, but unicode/ustring.h is missing')
+    conf.DEFINE('HAVE_UTF8_NORMALISATION', 1)
+else:
+    conf.env['icu-libs'] = ''