From: Miecio Za Date: Mon, 18 Mar 2019 18:56:57 +0000 (+0100) Subject: [Minor] Add util.is_utf_mixed_script to lua X-Git-Tag: 1.9.1~38^2~1 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=7824318dca65d223df397329b3eabde86c3fde02;p=thirdparty%2Frspamd.git [Minor] Add util.is_utf_mixed_script to lua Add new function which implements PoC for checking mixed scripts in a UTF string. Behaviour is similar to single string spoof detection in libicu before version 58 --- diff --git a/src/lua/lua_util.c b/src/lua/lua_util.c index 1a37eaef6a..881257ed37 100644 --- a/src/lua/lua_util.c +++ b/src/lua/lua_util.c @@ -29,6 +29,7 @@ #include #include "unicode/uspoof.h" +#include "unicode/uscript.h" /*** * @module rspamd_util @@ -393,6 +394,14 @@ LUA_FUNCTION_DEF (util, normalize_prob); */ LUA_FUNCTION_DEF (util, is_utf_spoofed); +/** +* @function util.is_utf_mixed_script(str) +* Returns true if a string contains mixed unicode scripts +* @param {string} String to check +* @return {boolean} true if a string contains chars with mixed unicode script +*/ +LUA_FUNCTION_DEF (util, is_utf_mixed_script); + /** * @function util.is_utf_outside_range(str, range_start, range_end) * Returns true if a string contains chars outside range @@ -633,6 +642,7 @@ static const struct luaL_reg utillib_f[] = { LUA_INTERFACE_DEF (util, caseless_hash), LUA_INTERFACE_DEF (util, caseless_hash_fast), LUA_INTERFACE_DEF (util, is_utf_spoofed), + LUA_INTERFACE_DEF (util, is_utf_mixed_script), LUA_INTERFACE_DEF (util, is_utf_outside_range), LUA_INTERFACE_DEF (util, get_string_stats), LUA_INTERFACE_DEF (util, is_valid_utf8), @@ -2498,6 +2508,47 @@ lua_util_is_utf_spoofed (lua_State *L) return nres; } +static gint +lua_util_is_utf_mixed_script(lua_State *L) +{ + LUA_TRACE_POINT; + gsize len_of_string; + const gchar *end, *string_to_check = lua_tolstring (L, 1, &len_of_string); + UScriptCode last_script_code = USCRIPT_INVALID_CODE; + UErrorCode uc_err = U_ZERO_ERROR; + + if (string_to_check && 
g_utf8_validate (string_to_check, len_of_string, &end)) { + len_of_string = g_utf8_strlen (string_to_check, len_of_string); + + for(; *string_to_check; string_to_check = g_utf8_next_char(string_to_check)){ + gunichar char_to_check = g_utf8_get_char(string_to_check); + UScriptCode current_script_code = uscript_getScript(char_to_check, &uc_err); + if (uc_err != U_ZERO_ERROR){ + msg_err ("cannot get unicode script for character, error: %s", u_errorName (uc_err)); + lua_pushboolean (L, false); + return 1; + } + if ( current_script_code != USCRIPT_COMMON && current_script_code != USCRIPT_INHERITED ){ + if (last_script_code == USCRIPT_INVALID_CODE ){ + last_script_code = current_script_code; + } else { + if ( last_script_code != current_script_code ){ + lua_pushboolean (L, true); + return 1; + } + } + } + } + } + else { + return luaL_error (L, "invalid arguments"); + } + + lua_pushboolean (L, false); + + return 1; +} + static gint lua_util_get_string_stats (lua_State *L) {