From: Vsevolod Stakhov Date: Mon, 15 Dec 2025 12:25:56 +0000 (+0000) Subject: [Fix] Fix get_text_quality bugs and clean up verbose comments X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=744fdbe742197253b8babd41d5293d3b0414bd8b;p=thirdparty%2Frspamd.git [Fix] Fix get_text_quality bugs and clean up verbose comments Fix two bugs in lua_util_get_text_quality: - Pass valid UErrorCode pointer to uscript_getScript() (was returning -1) - Move emoji check after digit check (0-9 have UCHAR_EMOJI property) Clean up verbose comments in pdf.lua config and ligature handling. --- diff --git a/lualib/lua_content/pdf.lua b/lualib/lua_content/pdf.lua index 2685779333..70aa820211 100644 --- a/lualib/lua_content/pdf.lua +++ b/lualib/lua_content/pdf.lua @@ -104,14 +104,10 @@ local config = { max_pdf_objects = 10000, -- Maximum number of objects to be considered max_pdf_trailer = 10 * 1024 * 1024, -- Maximum trailer size (to avoid abuse) max_pdf_trailer_lines = 100, -- Maximum number of lines in pdf trailer - -- Timeout for PDF processing in seconds. If exceeded, text extraction is skipped - -- as partial results would be incorrect. Can be overridden via - -- pdf.pdf_process_timeout in configuration. - pdf_process_timeout = 2.0, - -- Text quality filtering options for garbage detection - text_quality_threshold = 0.4, -- Minimum confidence score (0.0-1.0) to accept extracted text + pdf_process_timeout = 2.0, -- Timeout in seconds for processing + text_quality_threshold = 0.4, -- Minimum confidence to accept extracted text text_quality_min_length = 10, -- Minimum text length to apply quality filtering - text_quality_enabled = true, -- Enable/disable text quality filtering + text_quality_enabled = true, -- Enable text quality filtering } -- Used to process patterns found in PDF @@ -188,15 +184,7 @@ local function generic_grammar_elts() end if res then - -- Ligature fix for StandardEncoding (common in simple PDFs) - -- 0xAB (171) -> ff - -- 0xAC (172) -> ffi - -- 0xAD (173) -> ffl - -- 0xAE (174) -> fi - -- 0xAF (175) -> fl - -- MacRomanEncoding - -- 0xDE (222) -> fi - -- 0xDF (223) -> fl + -- StandardEncoding/MacRomanEncoding ligature substitutions res = res:gsub('\171', 'ff') res = res:gsub('\172', 'ffi') res = res:gsub('\173', 'ffl') @@ -228,15 +216,7 @@ local function generic_grammar_elts() end s = s:gsub('\\%d%d?%d?', ue_octal) - -- Ligature fix for StandardEncoding (common in simple PDFs) - -- 0xAB (171) -> ff - -- 0xAC (172) -> ffi - -- 0xAD (173) -> ffl - -- 0xAE (174) -> fi - -- 0xAF (175) -> fl - -- MacRomanEncoding - -- 0xDE (222) -> fi - -- 0xDF (223) -> fl + -- StandardEncoding/MacRomanEncoding ligature substitutions s = s:gsub('\171', 'ff') s = s:gsub('\172', 'ffi') s = s:gsub('\173', 'ffl') diff --git a/src/lua/lua_util.c b/src/lua/lua_util.c index 218fdf7fb2..8de4412d90 100644 --- a/src/lua/lua_util.c +++ b/src/lua/lua_util.c @@ -2831,6 +2831,7 @@ lua_util_get_text_quality(lua_State *L) int32_t i = 0; UChar32 uc, prev_uc = 0; UScriptCode prev_script = USCRIPT_INVALID_CODE; + UErrorCode uc_err = U_ZERO_ERROR; /* Basic counts */ int letters = 0; @@ -2944,22 +2945,6 @@ lua_util_get_text_quality(lua_State *L) non_ascii_chars++; } - /* Check for emoji */ - if (u_hasBinaryProperty(uc, UCHAR_EMOJI)) { - emojis++; - printable++; - /* Emojis break words */ - if (in_word && current_word_len >= 2) { - words++; - word_chars += current_word_len; - } - current_word_len = 0; - in_word = FALSE; - prev_was_space = FALSE; - prev_script = USCRIPT_INVALID_CODE; - continue; - } - /* Check if it's a letter (any Unicode script) */ if (u_isalpha(uc)) { letters++; @@ -2976,7 +2961,8 @@ lua_util_get_text_quality(lua_State *L) } /* Latin vowel/consonant detection */ - UScriptCode script = uscript_getScript(uc, NULL); + uc_err = U_ZERO_ERROR; + UScriptCode script = uscript_getScript(uc, &uc_err); if (script == USCRIPT_LATIN) { if (is_latin_vowel(uc)) { latin_vowels++; @@ -3043,6 +3029,20 @@ lua_util_get_text_quality(lua_State *L) in_word = FALSE; prev_was_space = FALSE; } + else if (u_hasBinaryProperty(uc, UCHAR_EMOJI)) { + /* Check for emoji (after digits/letters since 0-9 have UCHAR_EMOJI property) */ + emojis++; + printable++; + /* Emojis break words */ + if (in_word && current_word_len >= 2) { + words++; + word_chars += current_word_len; + } + current_word_len = 0; + in_word = FALSE; + prev_was_space = FALSE; + prev_script = USCRIPT_INVALID_CODE; + } else if (u_isgraph(uc)) { /* Other printable characters (symbols, etc.) */ printable++; @@ -3075,10 +3075,8 @@ lua_util_get_text_quality(lua_State *L) word_chars += current_word_len; } - /* Suppress unused variable warning */ (void) prev_uc; - /* Build result table with all metrics */ lua_createtable(L, 0, 18); lua_pushstring(L, "letters");