]> git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
[Feature] Add text quality analysis for PDF garbage filtering
authorVsevolod Stakhov <vsevolod@rspamd.com>
Thu, 11 Dec 2025 18:11:36 +0000 (18:11 +0000)
committerVsevolod Stakhov <vsevolod@rspamd.com>
Thu, 11 Dec 2025 18:11:36 +0000 (18:11 +0000)
- Add rspamd_util.get_text_quality() function with comprehensive UTF-8
  text analysis using ICU for proper Unicode classification
- Returns 18 metrics: letters, digits, punctuation, spaces, printable,
  words, word_chars, total, emojis, uppercase, lowercase, ascii_chars,
  non_ascii_chars, latin_vowels, latin_consonants, script_transitions,
  double_spaces, non_printable
- Add confidence scoring to PDF text extraction to filter garbage tokens
  (single characters, encoded data, random sequences)
- Configurable via text_quality_threshold, text_quality_min_length,
  text_quality_enabled options in pdf module config
- Add unit tests for get_text_quality function

lualib/lua_content/pdf.lua
src/lua/lua_util.c
test/lua/unit/rspamd_util.lua

index 550465c1a0009c3d6135554d919b028b51778b8f..2685779333d6b0f242121c06aed1c8f684077f88 100644 (file)
@@ -108,6 +108,10 @@ local config = {
   -- as partial results would be incorrect. Can be overridden via
   -- pdf.pdf_process_timeout in configuration.
   pdf_process_timeout = 2.0,
+  -- Text quality filtering options for garbage detection
+  text_quality_threshold = 0.4, -- Minimum confidence score (0.0-1.0) to accept extracted text
+  text_quality_min_length = 10, -- Minimum text length to apply quality filtering
+  text_quality_enabled = true, -- Enable/disable text quality filtering
 }
 
 -- Used to process patterns found in PDF
@@ -330,6 +334,73 @@ local function gen_graphics_nary()
       P("RG") + P("rg")
 end
 
+-- Rates how much a piece of extracted text looks like natural language, based on
+-- the character statistics returned by rspamd_util.get_text_quality().
+-- The result is a sum of five weighted components, from 0.0 (garbage) up to 1.0:
+--   printable share (0.25), letter share (0.20), in-word share (0.25),
+--   average word length (0.15) and whitespace share (0.15).
+-- Text that is missing or shorter than config.text_quality_min_length bytes is
+-- never judged and always gets 1.0.
+local function calculate_text_confidence(text)
+  if not text or #text < config.text_quality_min_length then
+    return 1.0
+  end
+
+  local q = rspamd_util.get_text_quality(text)
+  if not q or q.total == 0 then
+    return 0.0
+  end
+
+  -- Scales a ratio against its target value, saturating at 1.0
+  local function saturate(ratio, target)
+    return math.min(ratio / target, 1.0)
+  end
+
+  local total, spaces = q.total, q.spaces
+  local visible = total - spaces
+
+  -- Letter and in-word shares are taken over non-whitespace characters only;
+  -- both stay at 0 when the text consists of whitespace alone
+  local letter_share, word_share = 0, 0
+  if visible > 0 then
+    letter_share = q.letters / visible
+    word_share = q.word_chars / visible
+  end
+
+  -- Average word length: 3-10 is ideal, 2-3 and 10-15 are tolerated
+  local mean_word_len = 0
+  if q.words > 0 then
+    mean_word_len = q.word_chars / q.words
+  end
+  local len_points
+  if mean_word_len < 2 or mean_word_len > 15 then
+    len_points = 0.2
+  elseif mean_word_len < 3 then
+    len_points = 0.7
+  elseif mean_word_len <= 10 then
+    len_points = 1.0
+  else
+    len_points = 0.5
+  end
+
+  -- Whitespace share: 0.08-0.25 is ideal; no whitespace at all or more than
+  -- 40% of it is treated as the worst case
+  local space_share = spaces / total
+  local space_points
+  if space_share <= 0 or space_share > 0.4 then
+    space_points = 0.2
+  elseif space_share < 0.08 then
+    space_points = 0.5
+  elseif space_share <= 0.25 then
+    space_points = 1.0
+  else
+    space_points = 0.6
+  end
+
+  -- Targets: printable > 0.95, letters > 0.6, in-word characters > 0.7
+  return saturate(q.printable / total, 0.95) * 0.25
+      + saturate(letter_share, 0.6) * 0.20
+      + saturate(word_share, 0.7) * 0.25
+      + len_points * 0.15
+      + space_points * 0.15
+end
+
 -- Generates a grammar to parse text blocks (between BT and ET)
 local function gen_text_grammar()
   local V = lpeg.V
@@ -442,6 +513,16 @@ local function gen_text_grammar()
 
     res = sanitize_pdf_text(res)
 
+    -- Apply text quality filtering to reject garbage chunks
+    if config.text_quality_enabled and res and #res >= config.text_quality_min_length then
+      local confidence = calculate_text_confidence(res)
+      if confidence < config.text_quality_threshold then
+        lua_util.debugm(N, nil, 'rejected low confidence text chunk (%.2f): %s',
+            confidence, res:sub(1, 50))
+        return ''
+      end
+    end
+
     if op == "'" or op == '"' then
       return '\n' .. res
     end
@@ -1398,10 +1479,24 @@ local function search_text(task, pdf, mpart)
           end
         end
         local res = table.concat(text, '')
-        obj.text = rspamd_text.fromstring(res)
 
-        lua_util.debugm(N, task, 'object %s:%s is parsed to: %s',
-            obj.major, obj.minor, obj.text)
+        -- Page-level confidence check before storing text
+        if config.text_quality_enabled and #res >= config.text_quality_min_length then
+          local page_confidence = calculate_text_confidence(res)
+          if page_confidence < config.text_quality_threshold then
+            lua_util.debugm(N, task, 'skipping low confidence page text for %s:%s (%.2f)',
+                obj.major, obj.minor, page_confidence)
+            -- Don't store this page's text
+          else
+            obj.text = rspamd_text.fromstring(res)
+            lua_util.debugm(N, task, 'object %s:%s is parsed (confidence: %.2f): %s',
+                obj.major, obj.minor, page_confidence, obj.text)
+          end
+        else
+          obj.text = rspamd_text.fromstring(res)
+          lua_util.debugm(N, task, 'object %s:%s is parsed to: %s',
+              obj.major, obj.minor, obj.text)
+        end
       end
     end
   end
index 24fcd7233d83618b36cda84d86ba13b0b5fa7b37..218fdf7fb20dea143a36c319934d550115d67a6b 100644 (file)
@@ -45,6 +45,7 @@
 
 #include "unicode/uspoof.h"
 #include "unicode/uscript.h"
+#include "unicode/uchar.h"
 #include <unicode/ucnv.h>
 #include "rspamd_simdutf.h"
 
@@ -519,6 +520,34 @@ LUA_FUNCTION_DEF(util, is_valid_utf8);
  */
 LUA_FUNCTION_DEF(util, has_obscured_unicode);
 
+/***
+ * @function util.get_text_quality(str)
+ * Analyzes text quality for UTF-8 strings, useful for filtering garbage text extracted from PDFs
+ * and other text quality analysis tasks. Uses ICU for proper Unicode character classification
+ * (supports all scripts). If the argument cannot be read as text or is empty, every metric is 0.
+ * @param {string|rspamd_text} str input text to analyze
+ * @return {table} table with comprehensive text quality metrics:
+ *   - letters: count of Unicode letters (any script)
+ *   - digits: count of Unicode digits
+ *   - punctuation: count of punctuation characters
+ *   - spaces: count of whitespace characters
+ *   - printable: count of all printable characters (whitespace included)
+ *   - words: count of word-like sequences (runs of 2 or more consecutive letters)
+ *   - word_chars: total number of letters inside those word-like sequences
+ *   - total: total count of decoded characters (invalid UTF-8 sequences are counted too)
+ *   - emojis: count of emoji characters
+ *   - uppercase: count of uppercase letters
+ *   - lowercase: count of lowercase letters
+ *   - ascii_chars: count of ASCII characters (0-127)
+ *   - non_ascii_chars: count of non-ASCII characters
+ *   - latin_vowels: count of unaccented Latin vowels (a,e,i,o,u in either case)
+ *   - latin_consonants: count of all other Latin-script letters (accented vowels included)
+ *   - script_transitions: count of script changes between letters (e.g., Latin to Cyrillic);
+ *     letters of the Common and Inherited scripts are ignored
+ *   - double_spaces: count of whitespace characters directly preceded by another whitespace
+ *     character (a run of N whitespace characters contributes N-1)
+ *   - non_printable: count of non-printable/invalid characters
+ */
+LUA_FUNCTION_DEF(util, get_text_quality);
+
 /***
  * @function util.readline([prompt])
  * Returns string read from stdin with history and editing support
@@ -779,6 +808,7 @@ static const struct luaL_reg utillib_f[] = {
        LUA_INTERFACE_DEF(util, get_string_stats),
        LUA_INTERFACE_DEF(util, is_valid_utf8),
        LUA_INTERFACE_DEF(util, has_obscured_unicode),
+       LUA_INTERFACE_DEF(util, get_text_quality),
        LUA_INTERFACE_DEF(util, readline),
        LUA_INTERFACE_DEF(util, readpassphrase),
        LUA_INTERFACE_DEF(util, file_exists),
@@ -2785,6 +2815,347 @@ lua_util_has_obscured_unicode(lua_State *L)
        return 1;
 }
 
+/*
+ * Tells whether a code point is one of the ten unaccented Latin vowels
+ * (a, e, i, o, u in lower or upper case). Any other code point, accented
+ * vowels included, yields FALSE.
+ */
+static inline gboolean
+is_latin_vowel(UChar32 uc)
+{
+       switch (uc) {
+       case 'a':
+       case 'e':
+       case 'i':
+       case 'o':
+       case 'u':
+       case 'A':
+       case 'E':
+       case 'I':
+       case 'O':
+       case 'U':
+               return TRUE;
+       default:
+               return FALSE;
+       }
+}
+
+/* Stores an integer metric under `name` in the table at the top of the Lua stack */
+static inline void
+lua_util_text_quality_set_metric(lua_State *L, const char *name, int value)
+{
+       lua_pushinteger(L, value);
+       lua_setfield(L, -2, name);
+}
+
+/*
+ * Lua binding for util.get_text_quality(str).
+ *
+ * Decodes the argument as UTF-8 one code point at a time and classifies each
+ * code point with ICU. Exactly one Lua table with 18 integer metrics is pushed;
+ * when the argument cannot be read as text or is empty, all metrics are 0.
+ *
+ * Classification order for a valid code point: emoji (non-ASCII only), letter,
+ * digit, whitespace, punctuation, other graphic character, non-printable.
+ * A "word" is a run of at least two consecutive letters; any other code point
+ * (including an invalid UTF-8 sequence) ends the current run.
+ */
+static int
+lua_util_get_text_quality(lua_State *L)
+{
+       LUA_TRACE_POINT;
+       int32_t i = 0;
+       UChar32 uc;
+       UScriptCode prev_script = USCRIPT_INVALID_CODE;
+       gboolean prev_was_space = FALSE;
+
+       /* Basic counts */
+       int letters = 0;
+       int spaces = 0;
+       int printable = 0;
+       int total = 0;
+       int words = 0;
+       int word_chars = 0;
+       /* Length of the letter run being scanned; 0 when not inside a run */
+       int current_word_len = 0;
+
+       /* Extended metrics */
+       int digits = 0;
+       int punctuation = 0;
+       int emojis = 0;
+       int uppercase = 0;
+       int lowercase = 0;
+       int ascii_chars = 0;
+       int non_ascii_chars = 0;
+       int latin_vowels = 0;
+       int latin_consonants = 0;
+       int script_transitions = 0;
+       int double_spaces = 0;
+       int non_printable = 0;
+
+       struct rspamd_lua_text *t = lua_check_text_or_string(L, 1);
+
+       /* Unreadable or empty input skips the scan, leaving every counter at 0 */
+       while (t != NULL && i < t->len) {
+               gboolean is_space = FALSE;
+               gboolean is_emoji;
+
+               /* On an invalid UTF-8 sequence U8_NEXT sets uc to a negative value */
+               U8_NEXT(t->start, i, t->len, uc);
+               total++;
+
+               /* ASCII vs non-ASCII (invalid sequences belong to neither group) */
+               if (uc >= 0) {
+                       if (uc <= 127) {
+                               ascii_chars++;
+                       }
+                       else {
+                               non_ascii_chars++;
+                       }
+               }
+
+               /*
+                * ASCII '0'-'9', '#' and '*' carry the Unicode Emoji property since
+                * they are keycap bases. Only non-ASCII code points are treated as
+                * emojis, so that those characters are still classified as digits
+                * and punctuation below.
+                */
+               is_emoji = uc > 127 && u_hasBinaryProperty(uc, UCHAR_EMOJI);
+
+               if (uc >= 0 && !is_emoji && u_isalpha(uc)) {
+                       /* Letter of any Unicode script: extends the current word */
+                       UScriptCode script = uscript_getScript(uc, NULL);
+
+                       letters++;
+                       printable++;
+                       current_word_len++;
+
+                       /* Case detection */
+                       if (u_isupper(uc)) {
+                               uppercase++;
+                       }
+                       else if (u_islower(uc)) {
+                               lowercase++;
+                       }
+
+                       /* Latin vowel/consonant detection */
+                       if (script == USCRIPT_LATIN) {
+                               if (is_latin_vowel(uc)) {
+                                       latin_vowels++;
+                               }
+                               else {
+                                       latin_consonants++;
+                               }
+                       }
+
+                       /*
+                        * Script transition detection (letters only). Common and
+                        * Inherited letters neither count as a transition nor
+                        * replace the remembered script.
+                        */
+                       if (script != USCRIPT_COMMON && script != USCRIPT_INHERITED) {
+                               if (prev_script != USCRIPT_INVALID_CODE &&
+                                       prev_script != USCRIPT_COMMON &&
+                                       prev_script != USCRIPT_INHERITED &&
+                                       script != prev_script) {
+                                       script_transitions++;
+                               }
+                               prev_script = script;
+                       }
+               }
+               else {
+                       /* Anything that is not a letter ends the current word */
+                       if (current_word_len >= 2) {
+                               words++;
+                               word_chars += current_word_len;
+                       }
+                       current_word_len = 0;
+
+                       if (uc < 0) {
+                               /* Invalid UTF-8 sequence; also forgets the last script */
+                               non_printable++;
+                               prev_script = USCRIPT_INVALID_CODE;
+                       }
+                       else if (is_emoji) {
+                               /* Emojis also forget the last script */
+                               emojis++;
+                               printable++;
+                               prev_script = USCRIPT_INVALID_CODE;
+                       }
+                       else if (u_isdigit(uc)) {
+                               digits++;
+                               printable++;
+                       }
+                       else if (u_isUWhiteSpace(uc)) {
+                               spaces++;
+                               printable++;
+                               is_space = TRUE;
+
+                               /* Every whitespace that directly follows another one */
+                               if (prev_was_space) {
+                                       double_spaces++;
+                               }
+                       }
+                       else if (u_ispunct(uc)) {
+                               punctuation++;
+                               printable++;
+                       }
+                       else if (u_isgraph(uc)) {
+                               /* Other printable characters (symbols, etc.) */
+                               printable++;
+                       }
+                       else {
+                               /* Non-printable characters */
+                               non_printable++;
+                       }
+               }
+
+               prev_was_space = is_space;
+       }
+
+       /* Handle trailing word */
+       if (current_word_len >= 2) {
+               words++;
+               word_chars += current_word_len;
+       }
+
+       /* Build result table with all metrics */
+       lua_createtable(L, 0, 18);
+       lua_util_text_quality_set_metric(L, "letters", letters);
+       lua_util_text_quality_set_metric(L, "digits", digits);
+       lua_util_text_quality_set_metric(L, "punctuation", punctuation);
+       lua_util_text_quality_set_metric(L, "spaces", spaces);
+       lua_util_text_quality_set_metric(L, "printable", printable);
+       lua_util_text_quality_set_metric(L, "words", words);
+       lua_util_text_quality_set_metric(L, "word_chars", word_chars);
+       lua_util_text_quality_set_metric(L, "total", total);
+       lua_util_text_quality_set_metric(L, "emojis", emojis);
+       lua_util_text_quality_set_metric(L, "uppercase", uppercase);
+       lua_util_text_quality_set_metric(L, "lowercase", lowercase);
+       lua_util_text_quality_set_metric(L, "ascii_chars", ascii_chars);
+       lua_util_text_quality_set_metric(L, "non_ascii_chars", non_ascii_chars);
+       lua_util_text_quality_set_metric(L, "latin_vowels", latin_vowels);
+       lua_util_text_quality_set_metric(L, "latin_consonants", latin_consonants);
+       lua_util_text_quality_set_metric(L, "script_transitions", script_transitions);
+       lua_util_text_quality_set_metric(L, "double_spaces", double_spaces);
+       lua_util_text_quality_set_metric(L, "non_printable", non_printable);
+
+       return 1;
+}
+
 static int
 lua_util_readline(lua_State *L)
 {
index db7a0af3f197166306175848f66937e7a2eef03c..0acb942b268d3cb7b00c00a697dd1dbf45269428 100644 (file)
@@ -72,6 +72,114 @@ context("Rspamd util for lua - check generic functions", function()
         assert_equal(res["digits"], 2)
     end)
 
+    -- Tests for util.get_text_quality: each case feeds a literal string and
+    -- checks a subset of the returned metric counters
+    test("get_text_quality, empty string", function()
+        local res = util.get_text_quality("")
+        assert_equal(res.total, 0)
+        assert_equal(res.letters, 0)
+        assert_equal(res.words, 0)
+    end)
+
+    test("get_text_quality, simple ASCII text", function()
+        local res = util.get_text_quality("Hello World")
+        assert_equal(res.total, 11)
+        assert_equal(res.letters, 10)
+        assert_equal(res.spaces, 1)
+        assert_equal(res.words, 2)
+        assert_equal(res.word_chars, 10)
+        assert_equal(res.uppercase, 2)  -- H, W
+        assert_equal(res.lowercase, 8)
+        assert_equal(res.ascii_chars, 11)
+        assert_equal(res.non_ascii_chars, 0)
+        assert_equal(res.latin_vowels, 3)  -- e, o, o
+        assert_equal(res.latin_consonants, 7)  -- H, l, l, W, r, l, d
+    end)
+
+    test("get_text_quality, Russian (Cyrillic) text", function()
+        local res = util.get_text_quality("Привет мир")
+        assert_equal(res.letters, 9)  -- 6 + 3 Cyrillic letters
+        assert_equal(res.spaces, 1)
+        assert_equal(res.words, 2)
+        assert_equal(res.non_ascii_chars, 9)  -- all Cyrillic letters
+        assert_equal(res.ascii_chars, 1)  -- space
+        assert_equal(res.latin_vowels, 0)
+        assert_equal(res.latin_consonants, 0)
+        assert_equal(res.script_transitions, 0)  -- all same script
+    end)
+
+    test("get_text_quality, mixed Latin and Cyrillic (script transitions)", function()
+        local res = util.get_text_quality("Hello Привет")
+        assert_equal(res.letters, 11)  -- 5 Latin + 6 Cyrillic
+        assert_equal(res.words, 2)
+        assert_true(res.script_transitions > 0)  -- at least one transition
+        assert_true(res.latin_vowels > 0)
+        assert_true(res.latin_consonants > 0)
+    end)
+
+    -- NOTE(review): ASCII digits carry the Unicode Emoji property; confirm that
+    -- the implementation reports them as digits and not as emojis
+    test("get_text_quality, digits only", function()
+        local res = util.get_text_quality("12345")
+        assert_equal(res.total, 5)
+        assert_equal(res.digits, 5)
+        assert_equal(res.letters, 0)
+        assert_equal(res.words, 0)
+        assert_equal(res.printable, 5)
+    end)
+
+    test("get_text_quality, punctuation", function()
+        local res = util.get_text_quality("Hello, World!")
+        assert_equal(res.punctuation, 2)  -- comma and exclamation
+        assert_equal(res.words, 2)
+    end)
+
+    test("get_text_quality, double spaces", function()
+        local res = util.get_text_quality("Hello  World   Test")
+        -- Every space that directly follows another space counts once
+        assert_equal(res.double_spaces, 3)  -- 1 in "  " + 2 in "   "
+        assert_equal(res.spaces, 5)
+    end)
+
+    test("get_text_quality, uppercase text", function()
+        local res = util.get_text_quality("HELLO WORLD")
+        assert_equal(res.uppercase, 10)
+        assert_equal(res.lowercase, 0)
+    end)
+
+    test("get_text_quality, lowercase text", function()
+        local res = util.get_text_quality("hello world")
+        assert_equal(res.uppercase, 0)
+        assert_equal(res.lowercase, 10)
+    end)
+
+    test("get_text_quality, single characters (no words)", function()
+        local res = util.get_text_quality("A B C D E")
+        assert_equal(res.letters, 5)
+        assert_equal(res.words, 0)  -- single letters don't count as words
+        assert_equal(res.word_chars, 0)
+    end)
+
+    test("get_text_quality, vowels vs consonants", function()
+        local res = util.get_text_quality("aeiou")
+        assert_equal(res.latin_vowels, 5)
+        assert_equal(res.latin_consonants, 0)
+
+        res = util.get_text_quality("bcdfg")
+        assert_equal(res.latin_vowels, 0)
+        assert_equal(res.latin_consonants, 5)
+    end)
+
+    test("get_text_quality, emojis", function()
+        local res = util.get_text_quality("Hello 👋 World")
+        assert_equal(res.emojis, 1)
+        assert_equal(res.words, 2)  -- Hello and World
+    end)
+
+    test("get_text_quality, mixed content", function()
+        local res = util.get_text_quality("Test123! Hello...")
+        assert_equal(res.letters, 9)  -- Test + Hello
+        assert_equal(res.digits, 3)
+        assert_equal(res.punctuation, 4)  -- ! and ...
+        assert_equal(res.words, 2)  -- Test and Hello
+    end)
+
+
     for i,c in ipairs(cases) do
         test("is_utf_mixed_script, test case #" .. i, function()
           local actual = util.is_utf_mixed_script(c.input)