From: Vsevolod Stakhov <vsevolod@rspamd.com>
Date: Thu, 11 Dec 2025 18:11:36 +0000 (+0000)
Subject: [Feature] Add text quality analysis for PDF garbage filtering
X-Git-Tag: 3.14.3~40^2~1
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=b62818f14f5f6977b5bad6c79b52ab8315c4fea4;p=thirdparty%2Frspamd.git

[Feature] Add text quality analysis for PDF garbage filtering

- Add rspamd_util.get_text_quality() function with comprehensive UTF-8
  text analysis using ICU for proper Unicode classification
- Returns 18 metrics: letters, digits, punctuation, spaces, printable,
  words, word_chars, total, emojis, uppercase, lowercase, ascii_chars,
  non_ascii_chars, latin_vowels, latin_consonants, script_transitions,
  double_spaces, non_printable
- Add confidence scoring to PDF text extraction to filter garbage tokens
  (single characters, encoded data, random sequences)
- Configurable via text_quality_threshold, text_quality_min_length,
  text_quality_enabled options in pdf module config
- Add unit tests for get_text_quality function
---

diff --git a/lualib/lua_content/pdf.lua b/lualib/lua_content/pdf.lua
index 550465c1a0..2685779333 100644
--- a/lualib/lua_content/pdf.lua
+++ b/lualib/lua_content/pdf.lua
@@ -108,6 +108,10 @@ local config = {
   -- as partial results would be incorrect. Can be overridden via
   -- pdf.pdf_process_timeout in configuration.
   pdf_process_timeout = 2.0,
+  -- Text quality filtering options for garbage detection
+  text_quality_threshold = 0.4, -- Minimum confidence score (0.0-1.0) to accept extracted text
+  text_quality_min_length = 10, -- Minimum text length to apply quality filtering
+  text_quality_enabled = true, -- Enable/disable text quality filtering
 }
 
 -- Used to process patterns found in PDF
@@ -330,6 +334,73 @@ local function gen_graphics_nary()
       P("RG") + P("rg")
 end
 
+-- Scores extracted text from 0.0 (garbage) to 1.0 (high quality text) by
+-- summing five weighted heuristics over rspamd_util.get_text_quality() counters:
+-- printable ratio, letter ratio, word-character ratio, mean word length and
+-- whitespace ratio. Nil input, or input whose length (#text) is below
+-- config.text_quality_min_length, is not judged and gets 1.0.
+local function calculate_text_confidence(text)
+  -- Too little data to judge: let it through untouched
+  if not text or #text < config.text_quality_min_length then
+    return 1.0
+  end
+
+  local q = rspamd_util.get_text_quality(text)
+  if not q or q.total == 0 then
+    return 0.0
+  end
+
+  local visible = q.total - q.spaces -- everything except whitespace
+
+  -- Share of `part` among the non-whitespace characters; 0 when there are none
+  local function visible_share(part)
+    if visible > 0 then
+      return part / visible
+    end
+    return 0
+  end
+
+  -- Linear ramp: the full `weight` once `ratio` reaches `target`, less below it
+  local function ramp(ratio, target, weight)
+    return math.min(ratio / target, 1.0) * weight
+  end
+
+  -- Printable (0.25 at >= 0.95), letters (0.20 at >= 0.6), words (0.25 at >= 0.7)
+  local confidence = 0.0
+  confidence = confidence + ramp(q.printable / q.total, 0.95, 0.25)
+  confidence = confidence + ramp(visible_share(q.letters), 0.6, 0.20)
+  confidence = confidence + ramp(visible_share(q.word_chars), 0.7, 0.25)
+
+  -- Mean word length, weight 0.15: best at 3..10 characters per word
+  local mean_len = 0
+  if q.words > 0 then
+    mean_len = q.word_chars / q.words
+  end
+  local len_factor = 0.2 -- applies below 2 and above 15 (including "no words")
+  if mean_len >= 3 and mean_len <= 10 then
+    len_factor = 1.0
+  elseif mean_len >= 2 and mean_len < 3 then
+    len_factor = 0.7
+  elseif mean_len > 10 and mean_len <= 15 then
+    len_factor = 0.5
+  end
+  confidence = confidence + len_factor * 0.15
+
+  -- Whitespace share, weight 0.15: best at 0.08..0.25 of all characters
+  local ws_share = q.spaces / q.total
+  local ws_factor = 0.2 -- applies to no whitespace at all and to more than 0.4
+  if ws_share >= 0.08 and ws_share <= 0.25 then
+    ws_factor = 1.0
+  elseif ws_share > 0.25 and ws_share <= 0.4 then
+    ws_factor = 0.6
+  elseif ws_share > 0 and ws_share < 0.08 then
+    ws_factor = 0.5
+  end
+  confidence = confidence + ws_factor * 0.15
+
+  return confidence
+end
+
 -- Generates a grammar to parse text blocks (between BT and ET)
 local function gen_text_grammar()
   local V = lpeg.V
@@ -442,6 +513,16 @@ local function gen_text_grammar()
 
     res = sanitize_pdf_text(res)
 
+    -- Apply text quality filtering to reject garbage chunks
+    if config.text_quality_enabled and res and #res >= config.text_quality_min_length then
+      local confidence = calculate_text_confidence(res)
+      if confidence < config.text_quality_threshold then
+        lua_util.debugm(N, nil, 'rejected low confidence text chunk (%.2f): %s',
+            confidence, res:sub(1, 50))
+        return ''
+      end
+    end
+
     if op == "'" or op == '"' then
       return '\n' .. res
     end
@@ -1398,10 +1479,24 @@ local function search_text(task, pdf, mpart)
           end
         end
         local res = table.concat(text, '')
-        obj.text = rspamd_text.fromstring(res)
 
-        lua_util.debugm(N, task, 'object %s:%s is parsed to: %s',
-            obj.major, obj.minor, obj.text)
+        -- Page-level confidence check before storing text
+        if config.text_quality_enabled and #res >= config.text_quality_min_length then
+          local page_confidence = calculate_text_confidence(res)
+          if page_confidence < config.text_quality_threshold then
+            lua_util.debugm(N, task, 'skipping low confidence page text for %s:%s (%.2f)',
+                obj.major, obj.minor, page_confidence)
+            -- Don't store this page's text
+          else
+            obj.text = rspamd_text.fromstring(res)
+            lua_util.debugm(N, task, 'object %s:%s is parsed (confidence: %.2f): %s',
+                obj.major, obj.minor, page_confidence, obj.text)
+          end
+        else
+          obj.text = rspamd_text.fromstring(res)
+          lua_util.debugm(N, task, 'object %s:%s is parsed to: %s',
+              obj.major, obj.minor, obj.text)
+        end
       end
     end
   end
diff --git a/src/lua/lua_util.c b/src/lua/lua_util.c
index 24fcd7233d..218fdf7fb2 100644
--- a/src/lua/lua_util.c
+++ b/src/lua/lua_util.c
@@ -45,6 +45,7 @@
 
 #include "unicode/uspoof.h"
 #include "unicode/uscript.h"
+#include "unicode/uchar.h"
 #include <unicode/ucnv.h>
 #include "rspamd_simdutf.h"
 
@@ -519,6 +520,34 @@ LUA_FUNCTION_DEF(util, is_valid_utf8);
  */
 LUA_FUNCTION_DEF(util, has_obscured_unicode);
 
+/***
+ * @function util.get_text_quality(str)
+ * Analyzes text quality for UTF-8 strings, useful for filtering garbage text extracted from PDFs
+ * and other text quality analysis tasks. Uses ICU for proper Unicode character classification
+ * (supports all scripts). A missing or empty input yields a table with every metric set to 0.
+ * @param {string|rspamd_text} str input text to analyze
+ * @return {table} table with comprehensive text quality metrics:
+ *   - letters: count of Unicode letters (any script)
+ *   - digits: count of Unicode digits
+ *   - punctuation: count of punctuation characters
+ *   - spaces: count of whitespace characters
+ *   - printable: count of all printable characters (whitespace included)
+ *   - words: count of word-like sequences (2+ consecutive letters)
+ *   - word_chars: total characters in words
+ *   - total: total count of decoded code points, including invalid UTF-8 sequences
+ *   - emojis: count of emoji characters
+ *   - uppercase: count of uppercase letters
+ *   - lowercase: count of lowercase letters
+ *   - ascii_chars: count of ASCII characters (0-127)
+ *   - non_ascii_chars: count of non-ASCII characters
+ *   - latin_vowels: count of unaccented Latin vowels (a,e,i,o,u in either case)
+ *   - latin_consonants: count of all other Latin-script letters (accented vowels included)
+ *   - script_transitions: count of script changes between letters (e.g., Latin to Cyrillic)
+ *   - double_spaces: count of whitespace characters directly preceded by another whitespace
+ *   - non_printable: count of non-printable/invalid characters
+ */
+LUA_FUNCTION_DEF(util, get_text_quality);
+
 /***
  * @function util.readline([prompt])
  * Returns string read from stdin with history and editing support
@@ -779,6 +808,7 @@ static const struct luaL_reg utillib_f[] = {
 	LUA_INTERFACE_DEF(util, get_string_stats),
 	LUA_INTERFACE_DEF(util, is_valid_utf8),
 	LUA_INTERFACE_DEF(util, has_obscured_unicode),
+	LUA_INTERFACE_DEF(util, get_text_quality),
 	LUA_INTERFACE_DEF(util, readline),
 	LUA_INTERFACE_DEF(util, readpassphrase),
 	LUA_INTERFACE_DEF(util, file_exists),
@@ -2785,6 +2815,347 @@ lua_util_has_obscured_unicode(lua_State *L)
 	return 1;
 }
 
+/* True only for the ten ASCII vowels: a, e, i, o, u in either case */
+static inline gboolean
+is_latin_vowel(UChar32 uc)
+{
+	/* Setting bit 0x20 maps 'A'..'Z' onto 'a'..'z'; other values stay as is */
+	UChar32 lc = (uc >= 'A' && uc <= 'Z') ? (uc | 0x20) : uc;
+	return lc == 'a' || lc == 'e' || lc == 'i' || lc == 'o' || lc == 'u';
+}
+
+/*
+ * util.get_text_quality(str): scan the input as UTF-8 and return a table of
+ * 18 integer counters (all zero for missing or empty input). Each decoded
+ * item is handled by exactly one classification branch of the loop below.
+ */
+static int
+lua_util_get_text_quality(lua_State *L)
+{
+	LUA_TRACE_POINT;
+	int32_t i = 0;
+	UChar32 uc;
+	UScriptCode prev_script = USCRIPT_INVALID_CODE;
+
+	/* Basic counts */
+	int letters = 0;
+	int spaces = 0;
+	int printable = 0;
+	int total = 0;
+	int words = 0;
+	int word_chars = 0;
+	int current_word_len = 0;
+	gboolean in_word = FALSE;
+
+	/* Extended metrics */
+	int digits = 0;
+	int punctuation = 0;
+	int emojis = 0;
+	int uppercase = 0;
+	int lowercase = 0;
+	int ascii_chars = 0;
+	int non_ascii_chars = 0;
+	int latin_vowels = 0;
+	int latin_consonants = 0;
+	int script_transitions = 0;
+	int double_spaces = 0;
+	int non_printable = 0;
+	gboolean prev_was_space = FALSE;
+
+	struct rspamd_lua_text *t = lua_check_text_or_string(L, 1);
+
+	if (t == NULL || t->len == 0) {
+		lua_createtable(L, 0, 18);
+		lua_pushstring(L, "letters");
+		lua_pushinteger(L, 0);
+		lua_settable(L, -3);
+		lua_pushstring(L, "digits");
+		lua_pushinteger(L, 0);
+		lua_settable(L, -3);
+		lua_pushstring(L, "punctuation");
+		lua_pushinteger(L, 0);
+		lua_settable(L, -3);
+		lua_pushstring(L, "spaces");
+		lua_pushinteger(L, 0);
+		lua_settable(L, -3);
+		lua_pushstring(L, "printable");
+		lua_pushinteger(L, 0);
+		lua_settable(L, -3);
+		lua_pushstring(L, "words");
+		lua_pushinteger(L, 0);
+		lua_settable(L, -3);
+		lua_pushstring(L, "word_chars");
+		lua_pushinteger(L, 0);
+		lua_settable(L, -3);
+		lua_pushstring(L, "total");
+		lua_pushinteger(L, 0);
+		lua_settable(L, -3);
+		lua_pushstring(L, "emojis");
+		lua_pushinteger(L, 0);
+		lua_settable(L, -3);
+		lua_pushstring(L, "uppercase");
+		lua_pushinteger(L, 0);
+		lua_settable(L, -3);
+		lua_pushstring(L, "lowercase");
+		lua_pushinteger(L, 0);
+		lua_settable(L, -3);
+		lua_pushstring(L, "ascii_chars");
+		lua_pushinteger(L, 0);
+		lua_settable(L, -3);
+		lua_pushstring(L, "non_ascii_chars");
+		lua_pushinteger(L, 0);
+		lua_settable(L, -3);
+		lua_pushstring(L, "latin_vowels");
+		lua_pushinteger(L, 0);
+		lua_settable(L, -3);
+		lua_pushstring(L, "latin_consonants");
+		lua_pushinteger(L, 0);
+		lua_settable(L, -3);
+		lua_pushstring(L, "script_transitions");
+		lua_pushinteger(L, 0);
+		lua_settable(L, -3);
+		lua_pushstring(L, "double_spaces");
+		lua_pushinteger(L, 0);
+		lua_settable(L, -3);
+		lua_pushstring(L, "non_printable");
+		lua_pushinteger(L, 0);
+		lua_settable(L, -3);
+		return 1;
+	}
+
+	while (i < t->len) {
+		U8_NEXT(t->start, i, t->len, uc);
+		total++;
+
+		if (uc < 0) {
+			/* Invalid UTF-8 sequence: counted in total and non_printable only */
+			non_printable++;
+			in_word = FALSE;
+			if (current_word_len >= 2) {
+				words++;
+				word_chars += current_word_len;
+			}
+			current_word_len = 0;
+			prev_was_space = FALSE;
+			prev_script = USCRIPT_INVALID_CODE;
+			continue;
+		}
+
+		/* ASCII vs non-ASCII */
+		if (uc <= 127) {
+			ascii_chars++;
+		}
+		else {
+			non_ascii_chars++;
+		}
+
+		/* Emoji; ASCII is skipped since '#', '*' and '0'-'9' carry UCHAR_EMOJI */
+		if (uc > 0x7F && u_hasBinaryProperty(uc, UCHAR_EMOJI)) {
+			emojis++;
+			printable++;
+			/* Emojis break words */
+			if (in_word && current_word_len >= 2) {
+				words++;
+				word_chars += current_word_len;
+			}
+			current_word_len = 0;
+			in_word = FALSE;
+			prev_was_space = FALSE;
+			prev_script = USCRIPT_INVALID_CODE;
+			continue;
+		}
+
+		/* Check if it's a letter (any Unicode script) */
+		if (u_isalpha(uc)) {
+			letters++;
+			printable++;
+			current_word_len++;
+			in_word = TRUE;
+
+			/* Case detection */
+			if (u_isupper(uc)) {
+				uppercase++;
+			}
+			else if (u_islower(uc)) {
+				lowercase++;
+			}
+
+			/* Latin vowel/consonant detection */
+			UScriptCode script = uscript_getScript(uc, NULL);
+			if (script == USCRIPT_LATIN) {
+				if (is_latin_vowel(uc)) {
+					latin_vowels++;
+				}
+				else {
+					latin_consonants++;
+				}
+			}
+
+			/* Script transition detection (only for letters) */
+			if (prev_script != USCRIPT_INVALID_CODE &&
+				prev_script != USCRIPT_COMMON &&
+				prev_script != USCRIPT_INHERITED &&
+				script != USCRIPT_COMMON &&
+				script != USCRIPT_INHERITED &&
+				script != prev_script) {
+				script_transitions++;
+			}
+			if (script != USCRIPT_COMMON && script != USCRIPT_INHERITED) {
+				prev_script = script;
+			}
+
+			prev_was_space = FALSE;
+		}
+		else if (u_isdigit(uc)) {
+			digits++;
+			printable++;
+			/* Digits break words for our purposes */
+			if (in_word && current_word_len >= 2) {
+				words++;
+				word_chars += current_word_len;
+			}
+			current_word_len = 0;
+			in_word = FALSE;
+			prev_was_space = FALSE;
+		}
+		else if (u_isUWhiteSpace(uc)) {
+			spaces++;
+			printable++;
+
+			/* Double space detection */
+			if (prev_was_space) {
+				double_spaces++;
+			}
+			prev_was_space = TRUE;
+
+			/* End of word */
+			if (in_word && current_word_len >= 2) {
+				words++;
+				word_chars += current_word_len;
+			}
+			current_word_len = 0;
+			in_word = FALSE;
+		}
+		else if (u_ispunct(uc)) {
+			punctuation++;
+			printable++;
+			/* Punctuation breaks words */
+			if (in_word && current_word_len >= 2) {
+				words++;
+				word_chars += current_word_len;
+			}
+			current_word_len = 0;
+			in_word = FALSE;
+			prev_was_space = FALSE;
+		}
+		else if (u_isgraph(uc)) {
+			/* Other printable characters (symbols, etc.) */
+			printable++;
+			if (in_word && current_word_len >= 2) {
+				words++;
+				word_chars += current_word_len;
+			}
+			current_word_len = 0;
+			in_word = FALSE;
+			prev_was_space = FALSE;
+		}
+		else {
+			/* Non-printable characters */
+			non_printable++;
+			if (in_word && current_word_len >= 2) {
+				words++;
+				word_chars += current_word_len;
+			}
+			current_word_len = 0;
+			in_word = FALSE;
+			prev_was_space = FALSE;
+		}
+	}
+
+	/* Handle trailing word */
+	if (in_word && current_word_len >= 2) {
+		words++;
+		word_chars += current_word_len;
+	}
+
+	/* Build result table with all metrics */
+	lua_createtable(L, 0, 18);
+
+	lua_pushstring(L, "letters");
+	lua_pushinteger(L, letters);
+	lua_settable(L, -3);
+
+	lua_pushstring(L, "digits");
+	lua_pushinteger(L, digits);
+	lua_settable(L, -3);
+
+	lua_pushstring(L, "punctuation");
+	lua_pushinteger(L, punctuation);
+	lua_settable(L, -3);
+
+	lua_pushstring(L, "spaces");
+	lua_pushinteger(L, spaces);
+	lua_settable(L, -3);
+
+	lua_pushstring(L, "printable");
+	lua_pushinteger(L, printable);
+	lua_settable(L, -3);
+
+	lua_pushstring(L, "words");
+	lua_pushinteger(L, words);
+	lua_settable(L, -3);
+
+	lua_pushstring(L, "word_chars");
+	lua_pushinteger(L, word_chars);
+	lua_settable(L, -3);
+
+	lua_pushstring(L, "total");
+	lua_pushinteger(L, total);
+	lua_settable(L, -3);
+
+	lua_pushstring(L, "emojis");
+	lua_pushinteger(L, emojis);
+	lua_settable(L, -3);
+
+	lua_pushstring(L, "uppercase");
+	lua_pushinteger(L, uppercase);
+	lua_settable(L, -3);
+
+	lua_pushstring(L, "lowercase");
+	lua_pushinteger(L, lowercase);
+	lua_settable(L, -3);
+
+	lua_pushstring(L, "ascii_chars");
+	lua_pushinteger(L, ascii_chars);
+	lua_settable(L, -3);
+
+	lua_pushstring(L, "non_ascii_chars");
+	lua_pushinteger(L, non_ascii_chars);
+	lua_settable(L, -3);
+
+	lua_pushstring(L, "latin_vowels");
+	lua_pushinteger(L, latin_vowels);
+	lua_settable(L, -3);
+
+	lua_pushstring(L, "latin_consonants");
+	lua_pushinteger(L, latin_consonants);
+	lua_settable(L, -3);
+
+	lua_pushstring(L, "script_transitions");
+	lua_pushinteger(L, script_transitions);
+	lua_settable(L, -3);
+
+	lua_pushstring(L, "double_spaces");
+	lua_pushinteger(L, double_spaces);
+	lua_settable(L, -3);
+
+	lua_pushstring(L, "non_printable");
+	lua_pushinteger(L, non_printable);
+	lua_settable(L, -3);
+
+	return 1;
+}
+
 static int
 lua_util_readline(lua_State *L)
 {
diff --git a/test/lua/unit/rspamd_util.lua b/test/lua/unit/rspamd_util.lua
index db7a0af3f1..0acb942b26 100644
--- a/test/lua/unit/rspamd_util.lua
+++ b/test/lua/unit/rspamd_util.lua
@@ -72,6 +72,114 @@ context("Rspamd util for lua - check generic functions", function()
         assert_equal(res["digits"], 2)
     end)
 
+    -- Tests for get_text_quality
+    test("get_text_quality, empty string", function()
+        local res = util.get_text_quality("")
+        assert_equal(res.total, 0)
+        assert_equal(res.letters, 0)
+        assert_equal(res.words, 0)
+    end)
+
+    test("get_text_quality, simple ASCII text", function()
+        local res = util.get_text_quality("Hello World")
+        assert_equal(res.total, 11)
+        assert_equal(res.letters, 10)
+        assert_equal(res.spaces, 1)
+        assert_equal(res.words, 2)
+        assert_equal(res.word_chars, 10)
+        assert_equal(res.uppercase, 2)  -- H, W
+        assert_equal(res.lowercase, 8)
+        assert_equal(res.ascii_chars, 11)
+        assert_equal(res.non_ascii_chars, 0)
+        assert_equal(res.latin_vowels, 3)  -- e, o, o
+        assert_equal(res.latin_consonants, 7)  -- H, l, l, W, r, l, d
+    end)
+
+    test("get_text_quality, Russian (Cyrillic) text", function()
+        local res = util.get_text_quality("Привет мир")
+        assert_equal(res.letters, 9)
+        assert_equal(res.spaces, 1)
+        assert_equal(res.words, 2)
+        assert_equal(res.non_ascii_chars, 9)  -- all Cyrillic letters
+        assert_equal(res.ascii_chars, 1)  -- space
+        assert_equal(res.latin_vowels, 0)
+        assert_equal(res.latin_consonants, 0)
+        assert_equal(res.script_transitions, 0)  -- all same script
+    end)
+
+    test("get_text_quality, mixed Latin and Cyrillic (script transitions)", function()
+        local res = util.get_text_quality("Hello Привет")
+        assert_equal(res.letters, 11)
+        assert_equal(res.words, 2)
+        assert_true(res.script_transitions > 0)  -- at least one transition
+        assert_true(res.latin_vowels > 0)
+        assert_true(res.latin_consonants > 0)
+    end)
+
+    test("get_text_quality, digits only", function()
+        local res = util.get_text_quality("12345")
+        assert_equal(res.total, 5)
+        assert_equal(res.digits, 5)
+        assert_equal(res.letters, 0)
+        assert_equal(res.words, 0)
+        assert_equal(res.printable, 5)
+    end)
+
+    test("get_text_quality, punctuation", function()
+        local res = util.get_text_quality("Hello, World!")
+        assert_equal(res.punctuation, 2)  -- comma and exclamation
+        assert_equal(res.words, 2)
+    end)
+
+    test("get_text_quality, double spaces", function()
+        local res = util.get_text_quality("Hello  World   Test")
+        assert_equal(res.double_spaces, 3)  -- 1 in "  " + 2 in "   " (each space that follows a space)
+        assert_equal(res.spaces, 5)
+    end)
+
+    test("get_text_quality, uppercase text", function()
+        local res = util.get_text_quality("HELLO WORLD")
+        assert_equal(res.uppercase, 10)
+        assert_equal(res.lowercase, 0)
+    end)
+
+    test("get_text_quality, lowercase text", function()
+        local res = util.get_text_quality("hello world")
+        assert_equal(res.uppercase, 0)
+        assert_equal(res.lowercase, 10)
+    end)
+
+    test("get_text_quality, single characters (no words)", function()
+        local res = util.get_text_quality("A B C D E")
+        assert_equal(res.letters, 5)
+        assert_equal(res.words, 0)  -- single letters don't count as words
+        assert_equal(res.word_chars, 0)
+    end)
+
+    test("get_text_quality, vowels vs consonants", function()
+        local res = util.get_text_quality("aeiou")
+        assert_equal(res.latin_vowels, 5)
+        assert_equal(res.latin_consonants, 0)
+
+        res = util.get_text_quality("bcdfg")
+        assert_equal(res.latin_vowels, 0)
+        assert_equal(res.latin_consonants, 5)
+    end)
+
+    test("get_text_quality, emojis", function()
+        local res = util.get_text_quality("Hello 😀 World")
+        assert_equal(res.emojis, 1)
+        assert_equal(res.words, 2)  -- Hello and World
+    end)
+
+    test("get_text_quality, mixed content", function()
+        local res = util.get_text_quality("Test123! Hello...")
+        assert_equal(res.letters, 9)  -- Test + Hello
+        assert_equal(res.digits, 3)
+        assert_equal(res.punctuation, 4)  -- ! and ...
+        assert_equal(res.words, 2)  -- Test and Hello
+    end)
+
     for i,c in ipairs(cases) do
         test("is_utf_mixed_script, test case #" .. i, function()
           local actual = util.is_utf_mixed_script(c.input)