/*
 * Provenance (taken from the patch header this code was extracted from):
 *
 *   From: Ralph Boehme
 *   Date: Tue, 9 Apr 2019 09:21:57 +0000 (+0200)
 *   Subject: charset: add support for Unicode normalisation with libicu
 *   X-Git-Tag: tdb-1.4.2~273
 *   X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=107020793c7ea44e5e776a1401bbf4f8ccb9bd85;p=thirdparty%2Fsamba.git
 *   File: lib/util/charset/iconv.c
 *
 * This adds a direct conversion hook using libicu to perform NFC <-> NFD
 * conversion on UTF8 strings. The defined charset strings are "UTF8-NFC"
 * and "UTF8-NFD"; to convert from one to the other the caller calls
 * smb_iconv_open() with the desired source and target charsets, e.g.
 *
 *   smb_iconv_open("UTF8-NFD", "UTF8-NFC");
 *
 * for converting from NFC to NFD.
 *
 *   Signed-off-by: Ralph Boehme
 *   Reviewed-by: Andrew Bartlett
 */

#ifdef HAVE_ICU_I18N
#include <unicode/ustring.h>
#include <unicode/utrans.h>
#endif

#ifdef HAVE_ICU_I18N
/**
 * Translate an ICU status into the errno value reported by sys_uconv().
 *
 * @param ue              ICU status of the failed call.
 * @param overflow_errno  errno to report for U_BUFFER_OVERFLOW_ERROR; the
 *                        caller picks it depending on which buffer was too
 *                        small (internal scratch buffer vs. caller's output).
 *
 * @return the errno value to set.
 */
static int sys_uconv_errno(UErrorCode ue, int overflow_errno)
{
	switch (ue) {
	case U_BUFFER_OVERFLOW_ERROR:
		return overflow_errno;
	case U_INVALID_CHAR_FOUND:
		/* Ill-formed UTF-8 in the input. */
		return EILSEQ;
	case U_MEMORY_ALLOCATION_ERROR:
		return ENOMEM;
	default:
		return EINVAL;
	}
}

/**
 * Run one complete UTF-8 -> UTF-16 -> transliterate -> UTF-8 round trip.
 *
 * @param t       ICU transliterator performing the normalisation.
 * @param src     UTF-8 input, not necessarily NUL terminated.
 * @param srclen  Input length in bytes; the caller guarantees
 *                0 < srclen <= INT32_MAX / 2.
 * @param dst     Output buffer.
 * @param dstcap  Capacity of dst in bytes.
 *
 * @return number of bytes written to dst, or (size_t)-1 with errno set.
 */
static size_t sys_uconv_normalise(UTransliterator *t,
				  const char *src,
				  int32_t srclen,
				  char *dst,
				  int32_t dstcap)
{
	/*
	 * Scratch buffer for the UTF-16 form. UTF-8 input of n bytes never
	 * needs more than n UTF-16 units; the factor of two leaves headroom
	 * for the text growing during transliteration.
	 *
	 * NOTE(review): this is a stack VLA sized by the input length, so
	 * very large inputs can exhaust the stack -- consider a heap buffer
	 * if callers may pass more than short strings.
	 */
	const int32_t ucap = srclen * 2;
	UChar ustr[ucap];
	UChar *up = NULL;
	char *p = NULL;
	int32_t ulen = 0;
	int32_t limit = 0;
	int32_t dstlen = 0;
	UErrorCode ue = U_ZERO_ERROR;

	/* Convert from UTF-8 to UTF-16 */
	up = u_strFromUTF8(ustr,   /* dst */
			   ucap,   /* dst buflen */
			   &ulen,  /* dst written */
			   src,    /* src */
			   srclen, /* src length */
			   &ue);
	if (up == NULL || U_FAILURE(ue)) {
		errno = sys_uconv_errno(ue, EOVERFLOW);
		return (size_t)-1;
	}
	if (ulen < 0 || ulen > ucap) {
		/*
		 * u_strFromUTF8() returns the required size in ulen. This
		 * should never trigger given how ustr[] is sized, but guard
		 * against it before the length is used as an index limit.
		 */
		errno = EOVERFLOW;
		return (size_t)-1;
	}

	/*
	 * The transliteration function takes two parameters, the length of
	 * the text to be converted (ulen) and a limit which may be smaller
	 * than ulen. We transliterate everything, so limit starts out equal
	 * to ulen and the value returned in limit is ignored.
	 */
	limit = ulen;
	ue = U_ZERO_ERROR;

	/* In-place transliteration */
	utrans_transUChars(t,
			   ustr,   /* text */
			   &ulen,  /* text length, updated */
			   ucap,   /* text buflen */
			   0,      /* start */
			   &limit, /* limit */
			   &ue);
	if (U_FAILURE(ue)) {
		errno = sys_uconv_errno(ue, EOVERFLOW);
		return (size_t)-1;
	}
	if (ulen < 0 || ulen > ucap) {
		errno = EOVERFLOW;
		return (size_t)-1;
	}

	/* Convert from UTF-16 back to UTF-8 */
	ue = U_ZERO_ERROR;
	p = u_strToUTF8(dst,     /* dst */
			dstcap,  /* dst buflen */
			&dstlen, /* dst required length */
			ustr,    /* src */
			ulen,    /* src length */
			&ue);
	if (p == NULL || U_FAILURE(ue)) {
		/*
		 * ICU reports a too small destination as a failure
		 * (U_BUFFER_OVERFLOW_ERROR); hand that to the caller as
		 * E2BIG, the iconv() convention for "output buffer full".
		 */
		errno = sys_uconv_errno(ue, E2BIG);
		return (size_t)-1;
	}
	if (dstlen < 0 || dstlen > dstcap) {
		errno = E2BIG;
		return (size_t)-1;
	}

	return (size_t)dstlen;
}

/**
 * Direct conversion hook: normalise a UTF-8 string with an ICU
 * transliterator (used for the "UTF8-NFC" <-> "UTF8-NFD" conversions).
 *
 * The interface mirrors iconv(): the buffer pointers and remaining byte
 * counts are updated in place. The conversion is all-or-nothing; on
 * failure none of the four in/out arguments are modified.
 *
 * @param cd            The UTransliterator handle.
 * @param inbuf         In/out: UTF-8 input; advanced past the consumed
 *                      bytes on success.
 * @param inbytesleft   In/out: bytes available in *inbuf.
 * @param outbuf        In/out: output buffer; advanced past the written
 *                      bytes on success.
 * @param outbytesleft  In/out: bytes available in *outbuf.
 *
 * @return the number of bytes written to the output buffer, or
 *         (size_t)-1 with errno set: EILSEQ for ill-formed input, E2BIG
 *         if the output buffer is too small, EOVERFLOW if the input is too
 *         large for the ICU API or the internal buffer, ENOMEM or EINVAL
 *         for other ICU failures.
 */
static size_t sys_uconv(void *cd,
			const char **inbuf,
			size_t *inbytesleft,
			char **outbuf,
			size_t *outbytesleft)
{
	UTransliterator *t = (UTransliterator *)cd;
	const size_t inlen = *inbytesleft;
	size_t outlen;
	int32_t outcap;

	if (inlen == 0) {
		/* Nothing to convert; also avoids a zero-length VLA. */
		return 0;
	}

	if (inlen > (size_t)(INT32_MAX / 2)) {
		/*
		 * ICU lengths are int32_t and the scratch buffer needs
		 * twice the input length.
		 */
		errno = EOVERFLOW;
		return (size_t)-1;
	}

	/* ICU capacities are int32_t as well, so clamp the output size. */
	outcap = INT32_MAX;
	if (*outbytesleft < (size_t)INT32_MAX) {
		outcap = (int32_t)*outbytesleft;
	}

	outlen = sys_uconv_normalise(t, *inbuf, (int32_t)inlen,
				     *outbuf, outcap);
	if (outlen == (size_t)-1) {
		return (size_t)-1;
	}

	/*
	 * The whole input was converted, so the number of input *bytes*
	 * consumed is inlen -- not the number of UTF-16 units produced by
	 * the first conversion step, which differs for non-ASCII text.
	 */
	*inbuf += inlen;
	*inbytesleft -= inlen;
	*outbuf += outlen;
	*outbytesleft -= outlen;

	return outlen;
}
#endif
* @@ -228,6 +336,16 @@ static bool is_utf16(const char *name) static int smb_iconv_t_destructor(smb_iconv_t hwd) { +#ifdef HAVE_ICU_I18N + /* + * This has to come first, as the cd_direct member won't be an iconv + * handle and must not be passed to iconv_close(). + */ + if (hwd->direct == sys_uconv) { + utrans_close(hwd->cd_direct); + return 0; + } +#endif #ifdef HAVE_NATIVE_ICONV if (hwd->cd_pull != NULL && hwd->cd_pull != (iconv_t)-1) iconv_close(hwd->cd_pull); @@ -302,6 +420,52 @@ _PUBLIC_ smb_iconv_t smb_iconv_open_ex(TALLOC_CTX *mem_ctx, const char *tocode, } #endif +#ifdef HAVE_ICU_I18N + if (strcasecmp(fromcode, "UTF8-NFD") == 0 && + strcasecmp(tocode, "UTF8-NFC") == 0) + { + U_STRING_DECL(t, "any-nfc", 7); + UErrorCode ue = 0; + + U_STRING_INIT(t, "any-nfc", 7); + + ret->cd_direct = utrans_openU(t, + strlen("any-nfc"), + UTRANS_FORWARD, + NULL, + 0, + NULL, + &ue); + if (U_FAILURE(ue)) { + return (smb_iconv_t)-1; + } + ret->direct = sys_uconv; + return ret; + } + + if (strcasecmp(fromcode, "UTF8-NFC") == 0 && + strcasecmp(tocode, "UTF8-NFD") == 0) + { + U_STRING_DECL(tname, "any-nfd", 7); + UErrorCode ue = 0; + + U_STRING_INIT(tname, "any-nfd", 7); + + ret->cd_direct = utrans_openU(tname, + 7, + UTRANS_FORWARD, + NULL, + 0, + NULL, + &ue); + if (U_FAILURE(ue)) { + return (smb_iconv_t)-1; + } + ret->direct = sys_uconv; + return ret; + } +#endif + if (ret->pull == NULL && from == NULL) { goto failed; } diff --git a/lib/util/charset/wscript_build b/lib/util/charset/wscript_build index a3728f6a4bd..8fed718e7dc 100644 --- a/lib/util/charset/wscript_build +++ b/lib/util/charset/wscript_build @@ -2,7 +2,7 @@ bld.SAMBA_SUBSYSTEM('ICONV_WRAPPER', source='iconv.c', - public_deps='iconv replace talloc') + public_deps='iconv replace talloc ' + bld.env['icu-libs']) bld.SAMBA_SUBSYSTEM('charset', public_headers='charset.h', diff --git a/lib/util/charset/wscript_configure b/lib/util/charset/wscript_configure index d5ac5d0100f..9c27fc664f0 100644 --- 
a/lib/util/charset/wscript_configure +++ b/lib/util/charset/wscript_configure @@ -36,3 +36,16 @@ conf.CHECK_CODE(''' msg='Checking errno of iconv for illegal multibyte sequence', lib='iconv', headers='errno.h iconv.h') + +if conf.CHECK_CFG(package='icu-i18n', + args='--cflags --libs', + msg='Checking for icu-i18n', + uselib_store='ICU_I18N'): + for lib in conf.env['LIB_ICU_I18N']: + conf.CHECK_LIB(lib, shlib=True, mandatory=True) + conf.env['icu-libs'] = ' '.join(conf.env['LIB_ICU_I18N']) + if not conf.CHECK_HEADERS('unicode/ustring.h'): + conf.fatal('Found libicu, but unicode/ustring.h is missing') + conf.DEFINE('HAVE_UTF8_NORMALISATION', 1) +else: + conf.env['icu-libs'] = ''