From: Vsevolod Stakhov Date: Thu, 11 Dec 2025 18:11:36 +0000 (+0000) Subject: [Feature] Add text quality analysis for PDF garbage filtering X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=b62818f14f5f6977b5bad6c79b52ab8315c4fea4;p=thirdparty%2Frspamd.git [Feature] Add text quality analysis for PDF garbage filtering - Add rspamd_util.get_text_quality() function with comprehensive UTF-8 text analysis using ICU for proper Unicode classification - Returns 18 metrics: letters, digits, punctuation, spaces, printable, words, word_chars, total, emojis, uppercase, lowercase, ascii_chars, non_ascii_chars, latin_vowels, latin_consonants, script_transitions, double_spaces, non_printable - Add confidence scoring to PDF text extraction to filter garbage tokens (single characters, encoded data, random sequences) - Configurable via text_quality_threshold, text_quality_min_length, text_quality_enabled options in pdf module config - Add unit tests for get_text_quality function --- diff --git a/lualib/lua_content/pdf.lua b/lualib/lua_content/pdf.lua index 550465c1a0..2685779333 100644 --- a/lualib/lua_content/pdf.lua +++ b/lualib/lua_content/pdf.lua @@ -108,6 +108,10 @@ local config = { -- as partial results would be incorrect. Can be overridden via -- pdf.pdf_process_timeout in configuration. 
-- Estimates how much an extracted chunk looks like real text, using the
-- UTF-8 aware counters returned by rspamd_util.get_text_quality().
-- @param text extracted text (anything supporting the `#` length operator)
-- @return number between 0.0 (garbage) and 1.0 (high quality text); input that
--         is missing or shorter than config.text_quality_min_length yields 1.0
local function calculate_text_confidence(text)
  -- Short (or absent) text is never filtered
  if not text or #text < config.text_quality_min_length then
    return 1.0
  end

  local stats = rspamd_util.get_text_quality(text)
  if not stats or stats.total == 0 then
    return 0.0
  end

  local total = stats.total
  local non_ws = total - stats.spaces
  local has_non_ws = non_ws > 0

  -- Printable ratio (weight: 0.25) - saturates at 0.95
  local printable_part = math.min((stats.printable / total) / 0.95, 1.0) * 0.25

  -- Letter ratio among non-whitespace characters (weight: 0.20) - saturates at 0.6
  local letter_ratio = has_non_ws and stats.letters / non_ws or 0
  local letter_part = math.min(letter_ratio / 0.6, 1.0) * 0.20

  -- Share of non-whitespace characters that belong to words (weight: 0.25) -
  -- saturates at 0.7
  local word_ratio = has_non_ws and stats.word_chars / non_ws or 0
  local word_part = math.min(word_ratio / 0.7, 1.0) * 0.25

  -- Average word length (weight: 0.15) - ideal: 3-10
  local avg_word_len = stats.words > 0 and stats.word_chars / stats.words or 0
  local word_len_score
  if avg_word_len < 2 then
    word_len_score = 0.2
  elseif avg_word_len < 3 then
    word_len_score = 0.7
  elseif avg_word_len <= 10 then
    word_len_score = 1.0
  elseif avg_word_len <= 15 then
    word_len_score = 0.5
  else
    word_len_score = 0.2
  end

  -- Space ratio (weight: 0.15) - ideal: 0.08-0.25
  local space_ratio = stats.spaces / total
  local space_score
  if space_ratio <= 0 then
    space_score = 0.2
  elseif space_ratio < 0.08 then
    space_score = 0.5
  elseif space_ratio <= 0.25 then
    space_score = 1.0
  elseif space_ratio <= 0.4 then
    space_score = 0.6
  else
    space_score = 0.2
  end

  -- Components are summed in the same order as they are computed above
  return printable_part + letter_part + word_part
      + word_len_score * 0.15 + space_score * 0.15
end
res end @@ -1398,10 +1479,24 @@ local function search_text(task, pdf, mpart) end end local res = table.concat(text, '') - obj.text = rspamd_text.fromstring(res) - lua_util.debugm(N, task, 'object %s:%s is parsed to: %s', - obj.major, obj.minor, obj.text) + -- Page-level confidence check before storing text + if config.text_quality_enabled and #res >= config.text_quality_min_length then + local page_confidence = calculate_text_confidence(res) + if page_confidence < config.text_quality_threshold then + lua_util.debugm(N, task, 'skipping low confidence page text for %s:%s (%.2f)', + obj.major, obj.minor, page_confidence) + -- Don't store this page's text + else + obj.text = rspamd_text.fromstring(res) + lua_util.debugm(N, task, 'object %s:%s is parsed (confidence: %.2f): %s', + obj.major, obj.minor, page_confidence, obj.text) + end + else + obj.text = rspamd_text.fromstring(res) + lua_util.debugm(N, task, 'object %s:%s is parsed to: %s', + obj.major, obj.minor, obj.text) + end end end end diff --git a/src/lua/lua_util.c b/src/lua/lua_util.c index 24fcd7233d..218fdf7fb2 100644 --- a/src/lua/lua_util.c +++ b/src/lua/lua_util.c @@ -45,6 +45,7 @@ #include "unicode/uspoof.h" #include "unicode/uscript.h" +#include "unicode/uchar.h" #include #include "rspamd_simdutf.h" @@ -519,6 +520,34 @@ LUA_FUNCTION_DEF(util, is_valid_utf8); */ LUA_FUNCTION_DEF(util, has_obscured_unicode); +/*** + * @function util.get_text_quality(str) + * Analyzes text quality for UTF-8 strings, useful for filtering garbage text extracted from PDFs + * and other text quality analysis tasks. Uses ICU for proper Unicode character classification + * (supports all scripts). 
+ * @param {string|rspamd_text} str input text to analyze + * @return {table} table with comprehensive text quality metrics: + * - letters: count of Unicode letters (any script) + * - digits: count of Unicode digits + * - punctuation: count of punctuation characters + * - spaces: count of whitespace characters + * - printable: count of all printable characters + * - words: count of word-like sequences (2+ consecutive letters) + * - word_chars: total characters in words + * - total: total character count + * - emojis: count of emoji characters + * - uppercase: count of uppercase letters + * - lowercase: count of lowercase letters + * - ascii_chars: count of ASCII characters (0-127) + * - non_ascii_chars: count of non-ASCII characters + * - latin_vowels: count of Latin vowels (a,e,i,o,u) + * - latin_consonants: count of Latin consonants + * - script_transitions: count of script changes (e.g., Latin to Cyrillic) + * - double_spaces: count of consecutive space sequences + * - non_printable: count of non-printable/invalid characters + */ +LUA_FUNCTION_DEF(util, get_text_quality); + /*** * @function util.readline([prompt]) * Returns string read from stdin with history and editing support @@ -779,6 +808,7 @@ static const struct luaL_reg utillib_f[] = { LUA_INTERFACE_DEF(util, get_string_stats), LUA_INTERFACE_DEF(util, is_valid_utf8), LUA_INTERFACE_DEF(util, has_obscured_unicode), + LUA_INTERFACE_DEF(util, get_text_quality), LUA_INTERFACE_DEF(util, readline), LUA_INTERFACE_DEF(util, readpassphrase), LUA_INTERFACE_DEF(util, file_exists), @@ -2785,6 +2815,347 @@ lua_util_has_obscured_unicode(lua_State *L) return 1; } +/* Helper to check if a character is a Latin vowel */ +static inline gboolean +is_latin_vowel(UChar32 uc) +{ + /* Lowercase and uppercase Latin vowels */ + return uc == 'a' || uc == 'e' || uc == 'i' || uc == 'o' || uc == 'u' || + uc == 'A' || uc == 'E' || uc == 'I' || uc == 'O' || uc == 'U'; +} + +static int +lua_util_get_text_quality(lua_State *L) +{ + 
LUA_TRACE_POINT; + int32_t i = 0; + UChar32 uc, prev_uc = 0; + UScriptCode prev_script = USCRIPT_INVALID_CODE; + + /* Basic counts */ + int letters = 0; + int spaces = 0; + int printable = 0; + int total = 0; + int words = 0; + int word_chars = 0; + int current_word_len = 0; + gboolean in_word = FALSE; + + /* Extended metrics */ + int digits = 0; + int punctuation = 0; + int emojis = 0; + int uppercase = 0; + int lowercase = 0; + int ascii_chars = 0; + int non_ascii_chars = 0; + int latin_vowels = 0; + int latin_consonants = 0; + int script_transitions = 0; + int double_spaces = 0; + int non_printable = 0; + gboolean prev_was_space = FALSE; + + struct rspamd_lua_text *t = lua_check_text_or_string(L, 1); + + if (t == NULL || t->len == 0) { + lua_createtable(L, 0, 18); + lua_pushstring(L, "letters"); + lua_pushinteger(L, 0); + lua_settable(L, -3); + lua_pushstring(L, "digits"); + lua_pushinteger(L, 0); + lua_settable(L, -3); + lua_pushstring(L, "punctuation"); + lua_pushinteger(L, 0); + lua_settable(L, -3); + lua_pushstring(L, "spaces"); + lua_pushinteger(L, 0); + lua_settable(L, -3); + lua_pushstring(L, "printable"); + lua_pushinteger(L, 0); + lua_settable(L, -3); + lua_pushstring(L, "words"); + lua_pushinteger(L, 0); + lua_settable(L, -3); + lua_pushstring(L, "word_chars"); + lua_pushinteger(L, 0); + lua_settable(L, -3); + lua_pushstring(L, "total"); + lua_pushinteger(L, 0); + lua_settable(L, -3); + lua_pushstring(L, "emojis"); + lua_pushinteger(L, 0); + lua_settable(L, -3); + lua_pushstring(L, "uppercase"); + lua_pushinteger(L, 0); + lua_settable(L, -3); + lua_pushstring(L, "lowercase"); + lua_pushinteger(L, 0); + lua_settable(L, -3); + lua_pushstring(L, "ascii_chars"); + lua_pushinteger(L, 0); + lua_settable(L, -3); + lua_pushstring(L, "non_ascii_chars"); + lua_pushinteger(L, 0); + lua_settable(L, -3); + lua_pushstring(L, "latin_vowels"); + lua_pushinteger(L, 0); + lua_settable(L, -3); + lua_pushstring(L, "latin_consonants"); + lua_pushinteger(L, 0); + 
lua_settable(L, -3); + lua_pushstring(L, "script_transitions"); + lua_pushinteger(L, 0); + lua_settable(L, -3); + lua_pushstring(L, "double_spaces"); + lua_pushinteger(L, 0); + lua_settable(L, -3); + lua_pushstring(L, "non_printable"); + lua_pushinteger(L, 0); + lua_settable(L, -3); + return 1; + } + + while (i < t->len) { + U8_NEXT(t->start, i, t->len, uc); + total++; + + if (uc < 0) { + /* Invalid UTF-8 sequence */ + non_printable++; + in_word = FALSE; + if (current_word_len >= 2) { + words++; + word_chars += current_word_len; + } + current_word_len = 0; + prev_was_space = FALSE; + prev_script = USCRIPT_INVALID_CODE; + continue; + } + + /* ASCII vs non-ASCII */ + if (uc <= 127) { + ascii_chars++; + } + else { + non_ascii_chars++; + } + + /* Check for emoji */ + if (u_hasBinaryProperty(uc, UCHAR_EMOJI)) { + emojis++; + printable++; + /* Emojis break words */ + if (in_word && current_word_len >= 2) { + words++; + word_chars += current_word_len; + } + current_word_len = 0; + in_word = FALSE; + prev_was_space = FALSE; + prev_script = USCRIPT_INVALID_CODE; + continue; + } + + /* Check if it's a letter (any Unicode script) */ + if (u_isalpha(uc)) { + letters++; + printable++; + current_word_len++; + in_word = TRUE; + + /* Case detection */ + if (u_isupper(uc)) { + uppercase++; + } + else if (u_islower(uc)) { + lowercase++; + } + + /* Latin vowel/consonant detection */ + UScriptCode script = uscript_getScript(uc, NULL); + if (script == USCRIPT_LATIN) { + if (is_latin_vowel(uc)) { + latin_vowels++; + } + else { + latin_consonants++; + } + } + + /* Script transition detection (only for letters) */ + if (prev_script != USCRIPT_INVALID_CODE && + prev_script != USCRIPT_COMMON && + prev_script != USCRIPT_INHERITED && + script != USCRIPT_COMMON && + script != USCRIPT_INHERITED && + script != prev_script) { + script_transitions++; + } + if (script != USCRIPT_COMMON && script != USCRIPT_INHERITED) { + prev_script = script; + } + + prev_was_space = FALSE; + } + else if 
(u_isdigit(uc)) { + digits++; + printable++; + /* Digits break words for our purposes */ + if (in_word && current_word_len >= 2) { + words++; + word_chars += current_word_len; + } + current_word_len = 0; + in_word = FALSE; + prev_was_space = FALSE; + } + else if (u_isUWhiteSpace(uc)) { + spaces++; + printable++; + + /* Double space detection */ + if (prev_was_space) { + double_spaces++; + } + prev_was_space = TRUE; + + /* End of word */ + if (in_word && current_word_len >= 2) { + words++; + word_chars += current_word_len; + } + current_word_len = 0; + in_word = FALSE; + } + else if (u_ispunct(uc)) { + punctuation++; + printable++; + /* Punctuation breaks words */ + if (in_word && current_word_len >= 2) { + words++; + word_chars += current_word_len; + } + current_word_len = 0; + in_word = FALSE; + prev_was_space = FALSE; + } + else if (u_isgraph(uc)) { + /* Other printable characters (symbols, etc.) */ + printable++; + if (in_word && current_word_len >= 2) { + words++; + word_chars += current_word_len; + } + current_word_len = 0; + in_word = FALSE; + prev_was_space = FALSE; + } + else { + /* Non-printable characters */ + non_printable++; + if (in_word && current_word_len >= 2) { + words++; + word_chars += current_word_len; + } + current_word_len = 0; + in_word = FALSE; + prev_was_space = FALSE; + } + + prev_uc = uc; + } + + /* Handle trailing word */ + if (in_word && current_word_len >= 2) { + words++; + word_chars += current_word_len; + } + + /* Suppress unused variable warning */ + (void) prev_uc; + + /* Build result table with all metrics */ + lua_createtable(L, 0, 18); + + lua_pushstring(L, "letters"); + lua_pushinteger(L, letters); + lua_settable(L, -3); + + lua_pushstring(L, "digits"); + lua_pushinteger(L, digits); + lua_settable(L, -3); + + lua_pushstring(L, "punctuation"); + lua_pushinteger(L, punctuation); + lua_settable(L, -3); + + lua_pushstring(L, "spaces"); + lua_pushinteger(L, spaces); + lua_settable(L, -3); + + lua_pushstring(L, "printable"); + 
lua_pushinteger(L, printable); + lua_settable(L, -3); + + lua_pushstring(L, "words"); + lua_pushinteger(L, words); + lua_settable(L, -3); + + lua_pushstring(L, "word_chars"); + lua_pushinteger(L, word_chars); + lua_settable(L, -3); + + lua_pushstring(L, "total"); + lua_pushinteger(L, total); + lua_settable(L, -3); + + lua_pushstring(L, "emojis"); + lua_pushinteger(L, emojis); + lua_settable(L, -3); + + lua_pushstring(L, "uppercase"); + lua_pushinteger(L, uppercase); + lua_settable(L, -3); + + lua_pushstring(L, "lowercase"); + lua_pushinteger(L, lowercase); + lua_settable(L, -3); + + lua_pushstring(L, "ascii_chars"); + lua_pushinteger(L, ascii_chars); + lua_settable(L, -3); + + lua_pushstring(L, "non_ascii_chars"); + lua_pushinteger(L, non_ascii_chars); + lua_settable(L, -3); + + lua_pushstring(L, "latin_vowels"); + lua_pushinteger(L, latin_vowels); + lua_settable(L, -3); + + lua_pushstring(L, "latin_consonants"); + lua_pushinteger(L, latin_consonants); + lua_settable(L, -3); + + lua_pushstring(L, "script_transitions"); + lua_pushinteger(L, script_transitions); + lua_settable(L, -3); + + lua_pushstring(L, "double_spaces"); + lua_pushinteger(L, double_spaces); + lua_settable(L, -3); + + lua_pushstring(L, "non_printable"); + lua_pushinteger(L, non_printable); + lua_settable(L, -3); + + return 1; +} + static int lua_util_readline(lua_State *L) { diff --git a/test/lua/unit/rspamd_util.lua b/test/lua/unit/rspamd_util.lua index db7a0af3f1..0acb942b26 100644 --- a/test/lua/unit/rspamd_util.lua +++ b/test/lua/unit/rspamd_util.lua @@ -72,6 +72,114 @@ context("Rspamd util for lua - check generic functions", function() assert_equal(res["digits"], 2) end) + -- Tests for get_text_quality + test("get_text_quality, empty string", function() + local res = util.get_text_quality("") + assert_equal(res.total, 0) + assert_equal(res.letters, 0) + assert_equal(res.words, 0) + end) + + test("get_text_quality, simple ASCII text", function() + local res = util.get_text_quality("Hello 
-- Tests for get_text_quality
-- Each case feeds a literal string to util.get_text_quality() and checks a
-- subset of the returned counters.
test("get_text_quality, empty string", function()
  local res = util.get_text_quality("")
  assert_equal(res.total, 0)
  assert_equal(res.letters, 0)
  assert_equal(res.words, 0)
end)

test("get_text_quality, simple ASCII text", function()
  local res = util.get_text_quality("Hello World")
  assert_equal(res.total, 11)
  assert_equal(res.letters, 10)
  assert_equal(res.spaces, 1)
  assert_equal(res.words, 2)
  assert_equal(res.word_chars, 10)
  assert_equal(res.uppercase, 2) -- H, W
  assert_equal(res.lowercase, 8)
  assert_equal(res.ascii_chars, 11)
  assert_equal(res.non_ascii_chars, 0)
  assert_equal(res.latin_vowels, 3) -- e, o, o
  assert_equal(res.latin_consonants, 7) -- H, l, l, W, r, l, d
end)

test("get_text_quality, Russian (Cyrillic) text", function()
  local res = util.get_text_quality("Привет мир")
  assert_equal(res.letters, 9)
  assert_equal(res.spaces, 1)
  assert_equal(res.words, 2)
  assert_equal(res.non_ascii_chars, 9) -- all Cyrillic letters
  assert_equal(res.ascii_chars, 1) -- space
  assert_equal(res.latin_vowels, 0)
  assert_equal(res.latin_consonants, 0)
  assert_equal(res.script_transitions, 0) -- all same script
end)

test("get_text_quality, mixed Latin and Cyrillic (script transitions)", function()
  local res = util.get_text_quality("Hello Привет")
  assert_equal(res.letters, 11)
  assert_equal(res.words, 2)
  assert_true(res.script_transitions > 0) -- at least one transition
  assert_true(res.latin_vowels > 0)
  assert_true(res.latin_consonants > 0)
end)

test("get_text_quality, digits only", function()
  local res = util.get_text_quality("12345")
  assert_equal(res.total, 5)
  assert_equal(res.digits, 5)
  assert_equal(res.letters, 0)
  assert_equal(res.words, 0)
  assert_equal(res.printable, 5)
end)

test("get_text_quality, punctuation", function()
  local res = util.get_text_quality("Hello, World!")
  assert_equal(res.punctuation, 2) -- comma and exclamation
  assert_equal(res.words, 2)
end)

test("get_text_quality, double spaces", function()
  -- Two spaces after "Hello" and three after "World": every whitespace
  -- character that follows another whitespace character is counted once
  local res = util.get_text_quality("Hello  World   Test")
  assert_equal(res.double_spaces, 3) -- 1 in the run of two + 2 in the run of three
  assert_equal(res.spaces, 5)
end)

test("get_text_quality, uppercase text", function()
  local res = util.get_text_quality("HELLO WORLD")
  assert_equal(res.uppercase, 10)
  assert_equal(res.lowercase, 0)
end)

test("get_text_quality, lowercase text", function()
  local res = util.get_text_quality("hello world")
  assert_equal(res.uppercase, 0)
  assert_equal(res.lowercase, 10)
end)

test("get_text_quality, single characters (no words)", function()
  local res = util.get_text_quality("A B C D E")
  assert_equal(res.letters, 5)
  assert_equal(res.words, 0) -- single letters don't count as words
  assert_equal(res.word_chars, 0)
end)

test("get_text_quality, vowels vs consonants", function()
  local res = util.get_text_quality("aeiou")
  assert_equal(res.latin_vowels, 5)
  assert_equal(res.latin_consonants, 0)

  res = util.get_text_quality("bcdfg")
  assert_equal(res.latin_vowels, 0)
  assert_equal(res.latin_consonants, 5)
end)

test("get_text_quality, emojis", function()
  local res = util.get_text_quality("Hello 👋 World")
  assert_equal(res.emojis, 1)
  assert_equal(res.words, 2) -- Hello and World
end)

test("get_text_quality, mixed content", function()
  local res = util.get_text_quality("Test123! Hello...")
  assert_equal(res.letters, 9) -- Test + Hello
  assert_equal(res.digits, 3)
  assert_equal(res.punctuation, 4) -- ! and ...
  assert_equal(res.words, 2) -- Test and Hello
end)