[Feature] Add more text attributes

author Vsevolod Stakhov <vsevolod@highsecure.ru>

Sat, 23 Sep 2017 12:44:57 +0000 (13:44 +0100)

committer Vsevolod Stakhov <vsevolod@highsecure.ru>

Sat, 23 Sep 2017 12:44:57 +0000 (13:44 +0100)
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Sat, 23 Sep 2017 12:44:57 +0000 (13:44 +0100)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Sat, 23 Sep 2017 12:44:57 +0000 (13:44 +0100)
diff --git a/src/libmime/message.c b/src/libmime/message.c

index a22f51912963d6b8ad379cac315bed8d62f5f9f8..ce53c15f9b6c0ff019f09a977c2986a9641498fd 100644 (file)
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -202,6 +202,7 @@ rspamd_extract_words (struct rspamd_task *task,
         gchar *temp_word;
         const guchar *r;
         guint i, nlen, total_len = 0, short_len = 0;
+       gdouble avg_len = 0;
  
  #ifdef WITH_SNOWBALL
         static GHashTable *stemmers = NULL;
@@ -252,6 +253,8 @@ rspamd_extract_words (struct rspamd_task *task,
  #endif
  
                         if (w->len > 0 && (w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT)) {
+                               avg_len = avg_len + (w->len - avg_len) / (double)i;
+
                                 if (r != NULL) {
                                         nlen = strlen (r);
                                         nlen = MIN (nlen, w->len);
@@ -462,6 +465,13 @@ rspamd_strip_newlines_parse (const gchar *begin, const gchar *pe,
                                                 part->non_ascii_chars ++;
                                         }
                                         else {
+                                               if (g_ascii_isupper (*p)) {
+                                                       part->capital_letters ++;
+                                               }
+                                               else if (g_ascii_isdigit (*p)) {
+                                                       part->numeric_characters ++;
+                                               }
+
                                                 part->ascii_chars ++;
                                         }
                                 }
diff --git a/src/libmime/message.h b/src/libmime/message.h

index 8dc06eb3a1271d364b8922eb1136dcbc96f13c42..3092f3da580f2be1e656f972356c819fe803415d 100644 (file)
--- a/src/libmime/message.h
+++ b/src/libmime/message.h
@@ -101,6 +101,8 @@ struct rspamd_mime_text_part {
         guint double_spaces;
         guint non_spaces;
         guint empty_lines;
+       guint capital_letters;
+       guint numeric_characters;
  };
  
  enum rspamd_received_type {
diff --git a/src/plugins/chartable.c b/src/plugins/chartable.c

index 3391fa996e437c76f732d6ed1adb906daa487dec..95145ac9c3542f02f63becb491faeb566be95b87 100644 (file)
--- a/src/plugins/chartable.c
+++ b/src/plugins/chartable.c
@@ -170,7 +170,8 @@ chartable_module_reconfig (struct rspamd_config *cfg)
  static gdouble
  rspamd_chartable_process_word_utf (struct rspamd_task *task,
                 rspamd_stat_token_t *w,
-               gboolean is_url)
+               gboolean is_url,
+               guint *ncap)
  {
         const gchar *p, *end;
         gdouble badness = 0.0;
@@ -208,6 +209,12 @@ rspamd_chartable_process_word_utf (struct rspamd_task *task,
                                 sc = UBLOCK_BASIC_LATIN;
                         }
  
+                       if (sc != UBLOCK_BASIC_LATIN && u_isupper (uc)) {
+                               if (ncap) {
+                                       (*ncap) ++;
+                               }
+                       }
+
                         if (state == got_digit) {
                                 /* Penalize digit -> alpha translations */
                                 if (!is_url && sc != UBLOCK_BASIC_LATIN &&
@@ -363,7 +370,7 @@ rspamd_chartable_process_part (struct rspamd_task *task,
                 struct rspamd_mime_text_part *part)
  {
         rspamd_stat_token_t *w;
-       guint i;
+       guint i, ncap = 0;
         gdouble cur_score = 0.0;
  
         if (part == NULL || part->normalized_words == NULL ||
@@ -377,7 +384,8 @@ rspamd_chartable_process_part (struct rspamd_task *task,
                 if (w->len > 0 && (w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT)) {
  
                         if (IS_PART_UTF (part)) {
-                               cur_score += rspamd_chartable_process_word_utf (task, w, FALSE);
+                               cur_score += rspamd_chartable_process_word_utf (task, w, FALSE,
+                                               &ncap);
                         }
                         else {
                                 cur_score += rspamd_chartable_process_word_ascii (task, w, FALSE);
@@ -385,6 +393,13 @@ rspamd_chartable_process_part (struct rspamd_task *task,
                 }
         }
  
+       /*
+        * TODO: perhaps, we should do this analysis somewhere else and get
+        * something like: <SYM_SC><SYM_SC><SYM_SC> representing classes for all
+        * symbols in the text
+        */
+       part->capital_letters += ncap;
+
         cur_score /= (gdouble)part->normalized_words->len;
  
         if (cur_score > 2.0) {
@@ -425,7 +440,8 @@ chartable_symbol_callback (struct rspamd_task *task, void *unused)
                 if (words && words->len > 0) {
                         for (i = 0; i < words->len; i++) {
                                 w = &g_array_index (words, rspamd_stat_token_t, i);
-                               cur_score += rspamd_chartable_process_word_utf (task, w, FALSE);
+                               cur_score += rspamd_chartable_process_word_utf (task, w, FALSE,
+                                               NULL);
                         }
  
                         cur_score /= (gdouble)words->len;
@@ -471,7 +487,7 @@ chartable_url_symbol_callback (struct rspamd_task *task, void *unused)
                         w.len = u->hostlen;
  
                         if (g_utf8_validate (w.begin, w.len, NULL)) {
-                               cur_score += rspamd_chartable_process_word_utf (task, &w, TRUE);
+                               cur_score += rspamd_chartable_process_word_utf (task, &w, TRUE, NULL);
                         }
                         else {
                                 cur_score += rspamd_chartable_process_word_ascii (task, &w, TRUE);
@@ -494,7 +510,7 @@ chartable_url_symbol_callback (struct rspamd_task *task, void *unused)
                         w.len = u->hostlen;
  
                         if (g_utf8_validate (w.begin, w.len, NULL)) {
-                               cur_score += rspamd_chartable_process_word_utf (task, &w, TRUE);
+                               cur_score += rspamd_chartable_process_word_utf (task, &w, TRUE, NULL);
                         }
                         else {
                                 cur_score += rspamd_chartable_process_word_ascii (task, &w, TRUE);
author	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Sat, 23 Sep 2017 12:44:57 +0000 (13:44 +0100)
committer	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Sat, 23 Sep 2017 12:44:57 +0000 (13:44 +0100)
src/libmime/message.c		patch | blob | blame | history
src/libmime/message.h		patch | blob | blame | history
src/plugins/chartable.c		patch | blob | blame | history