-- as partial results would be incorrect. Can be overridden via
-- pdf.pdf_process_timeout in configuration.
pdf_process_timeout = 2.0,
+ -- Text quality filtering options for garbage detection
+ text_quality_threshold = 0.4, -- Minimum confidence score (0.0-1.0) to accept extracted text
+ text_quality_min_length = 10, -- Minimum text length to apply quality filtering
+ text_quality_enabled = true, -- Enable/disable text quality filtering
}
-- Used to process patterns found in PDF
P("RG") + P("rg")
end
+-- Estimate how much a piece of extracted text looks like real, readable text.
+-- The character statistics come from rspamd_util.get_text_quality (UTF-8 aware)
+-- and are folded into a single weighted score:
+--   printable ratio      0.25  (full marks at >= 0.95)
+--   letter ratio         0.20  (full marks at >= 0.6 of non-space chars)
+--   word-char ratio      0.25  (full marks at >= 0.7 of non-space chars)
+--   average word length  0.15  (best between 3 and 10)
+--   space ratio          0.15  (best between 0.08 and 0.25)
+-- @param text string to evaluate (may be nil)
+-- @return number between 0.0 (garbage) and 1.0 (high quality); nil input or
+--         input shorter than config.text_quality_min_length bytes yields 1.0
+--         so that short text is never filtered
+local function calculate_text_confidence(text)
+  if not text or #text < config.text_quality_min_length then
+    return 1.0
+  end
+
+  local q = rspamd_util.get_text_quality(text)
+  if not q or q.total == 0 then
+    -- Nothing measurable: treat as garbage
+    return 0.0
+  end
+
+  -- Scale `value` against its target and cap the result at 1.0
+  local function capped(value, target)
+    return math.min(value / target, 1.0)
+  end
+
+  -- Letter and word ratios are measured against non-whitespace characters only
+  local visible = q.total - q.spaces
+  local letter_share, word_share = 0, 0
+  if visible > 0 then
+    letter_share = q.letters / visible
+    word_share = q.word_chars / visible
+  end
+
+  local mean_word_len = 0
+  if q.words > 0 then
+    mean_word_len = q.word_chars / q.words
+  end
+
+  -- Average word length bands: [3,10] -> 1.0, [2,3) -> 0.7, (10,15] -> 0.5, rest -> 0.2
+  local len_points
+  if mean_word_len < 2 or mean_word_len > 15 then
+    len_points = 0.2
+  elseif mean_word_len < 3 then
+    len_points = 0.7
+  elseif mean_word_len <= 10 then
+    len_points = 1.0
+  else
+    len_points = 0.5
+  end
+
+  -- Space ratio bands: [0.08,0.25] -> 1.0, (0.25,0.4] -> 0.6, (0,0.08) -> 0.5, rest -> 0.2
+  local blank_share = q.spaces / q.total
+  local blank_points
+  if blank_share <= 0 or blank_share > 0.4 then
+    blank_points = 0.2
+  elseif blank_share < 0.08 then
+    blank_points = 0.5
+  elseif blank_share <= 0.25 then
+    blank_points = 1.0
+  else
+    blank_points = 0.6
+  end
+
+  return capped(q.printable / q.total, 0.95) * 0.25
+      + capped(letter_share, 0.6) * 0.20
+      + capped(word_share, 0.7) * 0.25
+      + len_points * 0.15
+      + blank_points * 0.15
+end
+
-- Generates a grammar to parse text blocks (between BT and ET)
local function gen_text_grammar()
local V = lpeg.V
res = sanitize_pdf_text(res)
+ -- Apply text quality filtering to reject garbage chunks
+ if config.text_quality_enabled and res and #res >= config.text_quality_min_length then
+ local confidence = calculate_text_confidence(res)
+ if confidence < config.text_quality_threshold then
+ lua_util.debugm(N, nil, 'rejected low confidence text chunk (%.2f): %s',
+ confidence, res:sub(1, 50))
+ return ''
+ end
+ end
+
if op == "'" or op == '"' then
return '\n' .. res
end
end
end
local res = table.concat(text, '')
- obj.text = rspamd_text.fromstring(res)
- lua_util.debugm(N, task, 'object %s:%s is parsed to: %s',
- obj.major, obj.minor, obj.text)
+ -- Page-level confidence check before storing text
+ if config.text_quality_enabled and #res >= config.text_quality_min_length then
+ local page_confidence = calculate_text_confidence(res)
+ if page_confidence < config.text_quality_threshold then
+ lua_util.debugm(N, task, 'skipping low confidence page text for %s:%s (%.2f)',
+ obj.major, obj.minor, page_confidence)
+ -- Don't store this page's text
+ else
+ obj.text = rspamd_text.fromstring(res)
+ lua_util.debugm(N, task, 'object %s:%s is parsed (confidence: %.2f): %s',
+ obj.major, obj.minor, page_confidence, obj.text)
+ end
+ else
+ obj.text = rspamd_text.fromstring(res)
+ lua_util.debugm(N, task, 'object %s:%s is parsed to: %s',
+ obj.major, obj.minor, obj.text)
+ end
end
end
end
#include "unicode/uspoof.h"
#include "unicode/uscript.h"
+#include "unicode/uchar.h"
#include <unicode/ucnv.h>
#include "rspamd_simdutf.h"
*/
LUA_FUNCTION_DEF(util, has_obscured_unicode);
+/***
+ * @function util.get_text_quality(str)
+ * Analyzes text quality for UTF-8 strings, useful for filtering garbage text extracted from PDFs
+ * and other text quality analysis tasks. Uses ICU for proper Unicode character classification
+ * (supports all scripts).
+ * @param {string|rspamd_text} str input text to analyze
+ * @return {table} table with comprehensive text quality metrics:
+ * - letters: count of Unicode letters (any script)
+ * - digits: count of Unicode digits
+ * - punctuation: count of punctuation characters
+ * - spaces: count of whitespace characters
+ * - printable: count of all printable characters
+ * - words: count of word-like sequences (2+ consecutive letters)
+ * - word_chars: total characters in words
+ * - total: total character count
+ * - emojis: count of emoji characters
+ * - uppercase: count of uppercase letters
+ * - lowercase: count of lowercase letters
+ * - ascii_chars: count of ASCII characters (0-127)
+ * - non_ascii_chars: count of non-ASCII characters
+ * - latin_vowels: count of ASCII Latin vowels (a, e, i, o, u in either case)
+ * - latin_consonants: count of all other Latin-script letters (accented vowels such as "é" are counted here as well)
+ * - script_transitions: count of script changes (e.g., Latin to Cyrillic)
+ * - double_spaces: count of whitespace characters that immediately follow another whitespace character (a run of N spaces adds N-1)
+ * - non_printable: count of non-printable/invalid characters
+ */
+LUA_FUNCTION_DEF(util, get_text_quality);
+
/***
* @function util.readline([prompt])
* Returns string read from stdin with history and editing support
LUA_INTERFACE_DEF(util, get_string_stats),
LUA_INTERFACE_DEF(util, is_valid_utf8),
LUA_INTERFACE_DEF(util, has_obscured_unicode),
+ LUA_INTERFACE_DEF(util, get_text_quality),
LUA_INTERFACE_DEF(util, readline),
LUA_INTERFACE_DEF(util, readpassphrase),
LUA_INTERFACE_DEF(util, file_exists),
return 1;
}
+/*
+ * Tell whether a code point is one of the ten ASCII vowel letters
+ * (a, e, i, o, u in lower or upper case). Any other code point, including
+ * accented Latin vowels, yields FALSE.
+ */
+static inline gboolean
+is_latin_vowel(UChar32 uc)
+{
+	switch (uc) {
+	case 'a':
+	case 'e':
+	case 'i':
+	case 'o':
+	case 'u':
+	case 'A':
+	case 'E':
+	case 'I':
+	case 'O':
+	case 'U':
+		return TRUE;
+	default:
+		return FALSE;
+	}
+}
+
+/* Store integer `value` under the string key `name` in the table on top of the Lua stack */
+static inline void
+text_quality_push_count(lua_State *L, const char *name, int value)
+{
+	lua_pushinteger(L, value);
+	lua_setfield(L, -2, name);
+}
+
+/*
+ * Lua binding for util.get_text_quality(str).
+ *
+ * Decodes the argument (string or rspamd_text) as UTF-8 and classifies every
+ * code point into exactly one category: invalid sequence, emoji, letter,
+ * digit, whitespace, punctuation, other graphic character or non-printable.
+ * A "word" is a run of two or more consecutive letters; any non-letter ends
+ * the current run.
+ *
+ * Always returns one table with 18 integer fields. A missing, invalid or
+ * empty argument yields a table with every field set to 0.
+ */
+static int
+lua_util_get_text_quality(lua_State *L)
+{
+	LUA_TRACE_POINT;
+	int32_t i = 0, len = 0;
+	UChar32 uc;
+	UScriptCode prev_script = USCRIPT_INVALID_CODE;
+	gboolean prev_was_space = FALSE;
+	int current_word_len = 0;
+
+	/* Basic counts */
+	int letters = 0, spaces = 0, printable = 0, total = 0;
+	int words = 0, word_chars = 0;
+
+	/* Extended metrics */
+	int digits = 0, punctuation = 0, emojis = 0;
+	int uppercase = 0, lowercase = 0;
+	int ascii_chars = 0, non_ascii_chars = 0;
+	int latin_vowels = 0, latin_consonants = 0;
+	int script_transitions = 0, double_spaces = 0, non_printable = 0;
+
+	struct rspamd_lua_text *t = lua_check_text_or_string(L, 1);
+
+	if (t != NULL) {
+		/*
+		 * U8_NEXT is driven by an int32_t offset, so never let the scan go
+		 * past INT32_MAX bytes (the index would overflow otherwise).
+		 */
+		len = t->len > (unsigned int) INT32_MAX ? INT32_MAX : (int32_t) t->len;
+	}
+
+	/* With no usable input len is 0: the loop is skipped and all counters stay 0 */
+	while (i < len) {
+		gboolean is_letter = FALSE, is_space = FALSE;
+
+		U8_NEXT(t->start, i, len, uc);
+		total++;
+
+		if (uc < 0) {
+			/* Invalid UTF-8 sequence */
+			non_printable++;
+			prev_script = USCRIPT_INVALID_CODE;
+		}
+		else {
+			/* ASCII vs non-ASCII */
+			if (uc <= 127) {
+				ascii_chars++;
+			}
+			else {
+				non_ascii_chars++;
+			}
+
+			/*
+			 * Emoji_Presentation rather than Emoji: the plain Emoji property
+			 * is also set for ASCII digits, '#' and '*', which would be
+			 * swallowed here and never reach the digit/punctuation branches.
+			 */
+			if (u_hasBinaryProperty(uc, UCHAR_EMOJI_PRESENTATION)) {
+				emojis++;
+				printable++;
+				/* An emoji also resets script tracking */
+				prev_script = USCRIPT_INVALID_CODE;
+			}
+			else if (u_isalpha(uc)) {
+				/* Letter of any Unicode script */
+				/* uscript_getScript needs a real error code: with NULL it
+				 * always answers USCRIPT_INVALID_CODE */
+				UErrorCode script_err = U_ZERO_ERROR;
+				UScriptCode script = uscript_getScript(uc, &script_err);
+
+				if (U_FAILURE(script_err)) {
+					script = USCRIPT_INVALID_CODE;
+				}
+
+				is_letter = TRUE;
+				letters++;
+				printable++;
+				current_word_len++;
+
+				/* Case detection */
+				if (u_isupper(uc)) {
+					uppercase++;
+				}
+				else if (u_islower(uc)) {
+					lowercase++;
+				}
+
+				/* Latin vowel/consonant detection */
+				if (script == USCRIPT_LATIN) {
+					if (is_latin_vowel(uc)) {
+						latin_vowels++;
+					}
+					else {
+						latin_consonants++;
+					}
+				}
+
+				/*
+				 * Script transition detection (letters only). Common,
+				 * Inherited and unknown scripts neither count as a
+				 * transition nor replace the remembered script.
+				 */
+				if (script != USCRIPT_INVALID_CODE &&
+					script != USCRIPT_COMMON &&
+					script != USCRIPT_INHERITED) {
+					if (prev_script != USCRIPT_INVALID_CODE && script != prev_script) {
+						script_transitions++;
+					}
+					prev_script = script;
+				}
+			}
+			else if (u_isdigit(uc)) {
+				digits++;
+				printable++;
+			}
+			else if (u_isUWhiteSpace(uc)) {
+				is_space = TRUE;
+				spaces++;
+				printable++;
+
+				/* Every whitespace directly following another one counts */
+				if (prev_was_space) {
+					double_spaces++;
+				}
+			}
+			else if (u_ispunct(uc)) {
+				punctuation++;
+				printable++;
+			}
+			else if (u_isgraph(uc)) {
+				/* Other printable characters (symbols, etc.) */
+				printable++;
+			}
+			else {
+				/* Non-printable characters */
+				non_printable++;
+			}
+		}
+
+		if (!is_letter) {
+			/* Anything that is not a letter terminates the current word */
+			if (current_word_len >= 2) {
+				words++;
+				word_chars += current_word_len;
+			}
+			current_word_len = 0;
+		}
+
+		prev_was_space = is_space;
+	}
+
+	/* Handle trailing word */
+	if (current_word_len >= 2) {
+		words++;
+		word_chars += current_word_len;
+	}
+
+	/* Build result table with all metrics */
+	lua_createtable(L, 0, 18);
+	text_quality_push_count(L, "letters", letters);
+	text_quality_push_count(L, "digits", digits);
+	text_quality_push_count(L, "punctuation", punctuation);
+	text_quality_push_count(L, "spaces", spaces);
+	text_quality_push_count(L, "printable", printable);
+	text_quality_push_count(L, "words", words);
+	text_quality_push_count(L, "word_chars", word_chars);
+	text_quality_push_count(L, "total", total);
+	text_quality_push_count(L, "emojis", emojis);
+	text_quality_push_count(L, "uppercase", uppercase);
+	text_quality_push_count(L, "lowercase", lowercase);
+	text_quality_push_count(L, "ascii_chars", ascii_chars);
+	text_quality_push_count(L, "non_ascii_chars", non_ascii_chars);
+	text_quality_push_count(L, "latin_vowels", latin_vowels);
+	text_quality_push_count(L, "latin_consonants", latin_consonants);
+	text_quality_push_count(L, "script_transitions", script_transitions);
+	text_quality_push_count(L, "double_spaces", double_spaces);
+	text_quality_push_count(L, "non_printable", non_printable);
+
+	return 1;
+}
+
static int
lua_util_readline(lua_State *L)
{
assert_equal(res["digits"], 2)
end)
+ -- util.get_text_quality: basic counters on empty, ASCII, Cyrillic and mixed-script input
+ test("get_text_quality, empty string", function()
+   local stats = util.get_text_quality("")
+   -- Empty input must still produce a table, with zeroed counters
+   assert_equal(stats.total, 0)
+   assert_equal(stats.letters, 0)
+   assert_equal(stats.words, 0)
+ end)
+
+ test("get_text_quality, simple ASCII text", function()
+   local stats = util.get_text_quality("Hello World")
+   -- 10 letters + 1 space, two words of five letters each
+   assert_equal(stats.total, 11)
+   assert_equal(stats.letters, 10)
+   assert_equal(stats.spaces, 1)
+   assert_equal(stats.words, 2)
+   assert_equal(stats.word_chars, 10)
+   -- Case split: "H" and "W" are the only capitals
+   assert_equal(stats.uppercase, 2)
+   assert_equal(stats.lowercase, 8)
+   assert_equal(stats.ascii_chars, 11)
+   assert_equal(stats.non_ascii_chars, 0)
+   -- Vowels: e, o, o; consonants: H, l, l, W, r, l, d
+   assert_equal(stats.latin_vowels, 3)
+   assert_equal(stats.latin_consonants, 7)
+ end)
+
+ test("get_text_quality, Russian (Cyrillic) text", function()
+   local stats = util.get_text_quality("Привет мир")
+   assert_equal(stats.letters, 9)
+   assert_equal(stats.spaces, 1)
+   assert_equal(stats.words, 2)
+   -- Every letter is Cyrillic; the single space is the only ASCII character
+   assert_equal(stats.non_ascii_chars, 9)
+   assert_equal(stats.ascii_chars, 1)
+   -- Latin-specific counters must not react to Cyrillic letters
+   assert_equal(stats.latin_vowels, 0)
+   assert_equal(stats.latin_consonants, 0)
+   -- One script throughout, hence no transitions
+   assert_equal(stats.script_transitions, 0)
+ end)
+
+ test("get_text_quality, mixed Latin and Cyrillic (script transitions)", function()
+   local stats = util.get_text_quality("Hello Привет")
+   assert_equal(stats.letters, 11)
+   assert_equal(stats.words, 2)
+   -- Latin followed by Cyrillic: expect at least one script change
+   assert_true(stats.script_transitions > 0)
+   assert_true(stats.latin_vowels > 0)
+   assert_true(stats.latin_consonants > 0)
+ end)
+
+ test("get_text_quality, digits only", function()
+   local res = util.get_text_quality("12345")
+   assert_equal(res.total, 5)
+   assert_equal(res.digits, 5)
+   assert_equal(res.letters, 0)
+   assert_equal(res.words, 0)
+   assert_equal(res.printable, 5)
+   -- ASCII digits carry the Unicode Emoji property; make sure they are
+   -- reported as digits only and never leak into other categories
+   assert_equal(res.emojis, 0)
+   assert_equal(res.punctuation, 0)
+ end)
+
+ test("get_text_quality, punctuation", function()
+   local stats = util.get_text_quality("Hello, World!")
+   -- The comma and the exclamation mark are the two punctuation characters
+   assert_equal(stats.punctuation, 2)
+   -- Punctuation terminates words but does not hide them
+   assert_equal(stats.words, 2)
+ end)
+
+ test("get_text_quality, double spaces", function()
+   -- Build the whitespace runs explicitly (three spaces, then two) so the
+   -- input cannot be silently altered by whitespace-normalising tools
+   local input = "Hello" .. string.rep(" ", 3) .. "World" .. string.rep(" ", 2) .. "Test"
+   local res = util.get_text_quality(input)
+   -- Each space that follows another space counts: 2 in the first run + 1 in the second
+   assert_equal(res.double_spaces, 3)
+   assert_equal(res.spaces, 5)
+ end)
+
+ test("get_text_quality, uppercase text", function()
+   local stats = util.get_text_quality("HELLO WORLD")
+   -- All ten letters are capitals
+   assert_equal(stats.uppercase, 10)
+   assert_equal(stats.lowercase, 0)
+ end)
+
+ test("get_text_quality, lowercase text", function()
+   local stats = util.get_text_quality("hello world")
+   -- Mirror of the uppercase case
+   assert_equal(stats.uppercase, 0)
+   assert_equal(stats.lowercase, 10)
+ end)
+
+ test("get_text_quality, single characters (no words)", function()
+   local stats = util.get_text_quality("A B C D E")
+   assert_equal(stats.letters, 5)
+   -- A word needs at least two consecutive letters, so isolated letters yield none
+   assert_equal(stats.words, 0)
+   assert_equal(stats.word_chars, 0)
+ end)
+
+ test("get_text_quality, vowels vs consonants", function()
+   local vowels_only = util.get_text_quality("aeiou")
+   assert_equal(vowels_only.latin_vowels, 5)
+   assert_equal(vowels_only.latin_consonants, 0)
+
+   local consonants_only = util.get_text_quality("bcdfg")
+   assert_equal(consonants_only.latin_vowels, 0)
+   assert_equal(consonants_only.latin_consonants, 5)
+ end)
+
+ test("get_text_quality, emojis", function()
+   local stats = util.get_text_quality("Hello 👋 World")
+   assert_equal(stats.emojis, 1)
+   -- The emoji sits between the words "Hello" and "World"
+   assert_equal(stats.words, 2)
+ end)
+
+ test("get_text_quality, mixed content", function()
+   local stats = util.get_text_quality("Test123! Hello...")
+   -- Letters come from "Test" and "Hello"
+   assert_equal(stats.letters, 9)
+   assert_equal(stats.digits, 3)
+   -- "!" plus the three dots
+   assert_equal(stats.punctuation, 4)
+   -- "Test" and "Hello" are the two words
+   assert_equal(stats.words, 2)
+ end)
+
for i,c in ipairs(cases) do
test("is_utf_mixed_script, test case #" .. i, function()
local actual = util.is_utf_mixed_script(c.input)