#endif
/* Ugly workaround */
- tmp = rspamd_tokenize_text (part->content->data,
+ part->normalized_words = rspamd_tokenize_text (part->content->data,
part->content->len, IS_PART_UTF (part), task->cfg,
part->urls_offset, FALSE,
NULL);
}
}
}
- part->normalized_words = tmp;
}
#ifdef WITH_SNOWBALL
if (stem != NULL) {
/* Post process part */
detect_text_language (text_part);
- text_part->words = rspamd_tokenize_text (text_part->content->data,
- text_part->content->len, IS_PART_UTF (text_part), task->cfg,
- text_part->urls_offset, FALSE,
- &text_part->hash);
rspamd_normalize_text_part (task, text_part);
/* Calculate number of lines */
GList *urls_offset; /**< list of offsets of urls */
GMimeObject *parent;
struct mime_part *mime_part;
- GArray *words;
GArray *normalized_words;
guint nlines;
guint64 hash;
for (i = 0; i < task->text_parts->len; i ++) {
tp = g_ptr_array_index (task->text_parts, i);
- if (tp->words) {
- g_array_free (tp->words, TRUE);
- }
if (tp->normalized_words) {
g_array_free (tp->normalized_words, TRUE);
}
for (i = 0; i < task->text_parts->len; i ++) {
part = g_ptr_array_index (task->text_parts, i);
- if (part->words != NULL) {
- for (j = 0; j < part->words->len; j ++) {
- word = &g_array_index (part->words, rspamd_ftok_t, j);
+ if (part->normalized_words != NULL) {
+ for (j = 0; j < part->normalized_words->len; j ++) {
+ word = &g_array_index (part->normalized_words, rspamd_ftok_t, j);
rspamd_cryptobox_hash_update (&st, word->begin, word->len);
}
}
for (i = 0; i < task->text_parts->len; i ++) {
part = g_ptr_array_index (task->text_parts, i);
- if (!IS_PART_EMPTY (part) && part->words != NULL) {
- if (compat) {
- tok->tokenizer->tokenize_func (tok, task->task_pool,
- part->words, IS_PART_UTF (part), NULL);
- }
- else {
- tok->tokenizer->tokenize_func (tok, task->task_pool,
+ if (!IS_PART_EMPTY (part) && part->normalized_words != NULL) {
+ tok->tokenizer->tokenize_func (tok, task->task_pool,
part->normalized_words, IS_PART_UTF (part), NULL);
- }
}
+
if (pdiff != NULL && *pdiff > similarity_treshold) {
msg_debug_task ("message has two common parts (%d%%), so skip the last one",
*pdiff);
/*
 * Return the array of words for the text part `part`.
 *
 * With this change the function always returns part->normalized_words.
 * The removed lines returned part->words instead when the part was not
 * UTF, had no (or an empty) language, or had normalized_words == NULL.
 *
 * The removed condition treated normalized_words == NULL as possible, so
 * the result is presumably still nullable - callers should check it.
 * `pool` is not referenced anywhere in the body shown here.
 */
static GArray *
fuzzy_preprocess_words (struct mime_text_part *part, rspamd_mempool_t *pool)
{
- GArray *res;
-
- if (!IS_PART_UTF (part) || !part->language || part->language[0] == '\0' ||
- part->normalized_words == NULL) {
- res = part->words;
- }
- else {
- res = part->normalized_words;
- }
-
- return res;
+ return part->normalized_words;
}
/*
continue;
}
- if (part->words == NULL || part->words->len == 0) {
+ if (part->normalized_words == NULL || part->normalized_words->len == 0) {
msg_info_task ("<%s>, part hash empty, skip fuzzy check",
task->message_id);
continue;
}
if (fuzzy_module_ctx->min_hash_len != 0 &&
- part->words->len < fuzzy_module_ctx->min_hash_len) {
+ part->normalized_words->len < fuzzy_module_ctx->min_hash_len) {
msg_info_task (
"<%s>, part hash is shorter than %d symbols, skip fuzzy check",
task->message_id,