max_pdf_objects = 10000, -- Maximum number of objects to be considered
max_pdf_trailer = 10 * 1024 * 1024, -- Maximum trailer size (to avoid abuse)
max_pdf_trailer_lines = 100, -- Maximum number of lines in pdf trailer
- -- Timeout for PDF processing in seconds. If exceeded, text extraction is skipped
- -- as partial results would be incorrect. Can be overridden via
- -- pdf.pdf_process_timeout in configuration.
- pdf_process_timeout = 2.0,
- -- Text quality filtering options for garbage detection
- text_quality_threshold = 0.4, -- Minimum confidence score (0.0-1.0) to accept extracted text
+ pdf_process_timeout = 2.0, -- Timeout in seconds for PDF processing; if exceeded, text extraction is skipped
+ text_quality_threshold = 0.4, -- Minimum confidence score (0.0-1.0) to accept extracted text
text_quality_min_length = 10, -- Minimum text length to apply quality filtering
- text_quality_enabled = true, -- Enable/disable text quality filtering
+ text_quality_enabled = true, -- Enable text quality filtering
}
-- Used to process patterns found in PDF
end
if res then
- -- Ligature fix for StandardEncoding (common in simple PDFs)
- -- 0xAB (171) -> ff
- -- 0xAC (172) -> ffi
- -- 0xAD (173) -> ffl
- -- 0xAE (174) -> fi
- -- 0xAF (175) -> fl
- -- MacRomanEncoding
- -- 0xDE (222) -> fi
- -- 0xDF (223) -> fl
+ -- Ligature substitutions: StandardEncoding \171 ff, \172 ffi, \173 ffl, \174 fi, \175 fl; MacRomanEncoding \222 fi, \223 fl
res = res:gsub('\171', 'ff')
res = res:gsub('\172', 'ffi')
res = res:gsub('\173', 'ffl')
end
s = s:gsub('\\%d%d?%d?', ue_octal)
- -- Ligature fix for StandardEncoding (common in simple PDFs)
- -- 0xAB (171) -> ff
- -- 0xAC (172) -> ffi
- -- 0xAD (173) -> ffl
- -- 0xAE (174) -> fi
- -- 0xAF (175) -> fl
- -- MacRomanEncoding
- -- 0xDE (222) -> fi
- -- 0xDF (223) -> fl
+ -- Ligature substitutions: StandardEncoding \171 ff, \172 ffi, \173 ffl, \174 fi, \175 fl; MacRomanEncoding \222 fi, \223 fl
s = s:gsub('\171', 'ff')
s = s:gsub('\172', 'ffi')
s = s:gsub('\173', 'ffl')
int32_t i = 0;
UChar32 uc, prev_uc = 0;
UScriptCode prev_script = USCRIPT_INVALID_CODE;
+ UErrorCode uc_err = U_ZERO_ERROR;
/* Basic counts */
int letters = 0;
non_ascii_chars++;
}
- /* Check for emoji */
- if (u_hasBinaryProperty(uc, UCHAR_EMOJI)) {
- emojis++;
- printable++;
- /* Emojis break words */
- if (in_word && current_word_len >= 2) {
- words++;
- word_chars += current_word_len;
- }
- current_word_len = 0;
- in_word = FALSE;
- prev_was_space = FALSE;
- prev_script = USCRIPT_INVALID_CODE;
- continue;
- }
-
/* Check if it's a letter (any Unicode script) */
if (u_isalpha(uc)) {
letters++;
}
/* Latin vowel/consonant detection */
- UScriptCode script = uscript_getScript(uc, NULL);
+ uc_err = U_ZERO_ERROR;
+ UScriptCode script = uscript_getScript(uc, &uc_err);
if (script == USCRIPT_LATIN) {
if (is_latin_vowel(uc)) {
latin_vowels++;
in_word = FALSE;
prev_was_space = FALSE;
}
+ else if (u_hasBinaryProperty(uc, UCHAR_EMOJI)) {
+ /* Check for emoji (after digits/letters since 0-9 have UCHAR_EMOJI property) */
+ emojis++;
+ printable++;
+ /* Emojis break words */
+ if (in_word && current_word_len >= 2) {
+ words++;
+ word_chars += current_word_len;
+ }
+ current_word_len = 0;
+ in_word = FALSE;
+ prev_was_space = FALSE;
+ prev_script = USCRIPT_INVALID_CODE;
+ }
else if (u_isgraph(uc)) {
/* Other printable characters (symbols, etc.) */
printable++;
word_chars += current_word_len;
}
- /* Suppress unused variable warning */
(void) prev_uc;
- /* Build result table with all metrics */
lua_createtable(L, 0, 18);
lua_pushstring(L, "letters");