]> git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
[Feature] Add method task:lookup_words
authorVsevolod Stakhov <vsevolod@highsecure.ru>
Wed, 5 Dec 2018 18:06:12 +0000 (18:06 +0000)
committerVsevolod Stakhov <vsevolod@highsecure.ru>
Wed, 5 Dec 2018 18:06:12 +0000 (18:06 +0000)
src/lua/lua_common.c
src/lua/lua_common.h
src/lua/lua_task.c

index 7bb45f3477b0dea9fce13f4f7e7c0964597452b5..01d5dc869dfb6c62bbd274bcfc5129247d146744 100644 (file)
@@ -2408,12 +2408,90 @@ rspamd_lua_try_load_redis (lua_State *L, const ucl_object_t *obj,
        return FALSE;
 }
 
+/**
+ * Builds a Lua table describing a single token and leaves it on top of the stack.
+ * The table layout is:
+ *   [1] - stemmed form
+ *   [2] - normalized form
+ *   [3] - original form
+ *   [4] - array of flag names set for the token
+ * An empty string is stored for any of the first three fields whose length is zero.
+ * @param L Lua state
+ * @param w token to convert
+ */
+void
+rspamd_lua_push_full_word (lua_State *L, rspamd_stat_token_t *w)
+{
+       gint fl_cnt;
+
+       /* Resulting table: three strings followed by the flags table */
+       lua_createtable (L, 4, 0);
+
+       if (w->stemmed.len > 0) {
+               lua_pushlstring (L, w->stemmed.begin, w->stemmed.len);
+               lua_rawseti (L, -2, 1);
+       }
+       else {
+               lua_pushstring (L, "");
+               lua_rawseti (L, -2, 1);
+       }
+
+       if (w->normalized.len > 0) {
+               lua_pushlstring (L, w->normalized.begin, w->normalized.len);
+               lua_rawseti (L, -2, 2);
+       }
+       else {
+               lua_pushstring (L, "");
+               lua_rawseti (L, -2, 2);
+       }
+
+       if (w->original.len > 0) {
+               lua_pushlstring (L, w->original.begin, w->original.len);
+               lua_rawseti (L, -2, 3);
+       }
+       else {
+               lua_pushstring (L, "");
+               lua_rawseti (L, -2, 3);
+       }
+
+       /* Flags part */
+       /* fl_cnt is the next free 1-based index in the flags table */
+       fl_cnt = 1;
+       /* 4 is only a preallocation hint: up to nine names may be appended below */
+       lua_createtable (L, 4, 0);
+
+       if (w->flags & RSPAMD_STAT_TOKEN_FLAG_NORMALISED) {
+               lua_pushstring (L, "normalised");
+               lua_rawseti (L, -2, fl_cnt ++);
+       }
+       if (w->flags & RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE) {
+               lua_pushstring (L, "broken_unicode");
+               lua_rawseti (L, -2, fl_cnt ++);
+       }
+       if (w->flags & RSPAMD_STAT_TOKEN_FLAG_UTF) {
+               lua_pushstring (L, "utf");
+               lua_rawseti (L, -2, fl_cnt ++);
+       }
+       if (w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) {
+               lua_pushstring (L, "text");
+               lua_rawseti (L, -2, fl_cnt ++);
+       }
+       if (w->flags & RSPAMD_STAT_TOKEN_FLAG_HEADER) {
+               lua_pushstring (L, "header");
+               lua_rawseti (L, -2, fl_cnt ++);
+       }
+       /* Both meta flags are reported under the same "meta" name */
+       if (w->flags & (RSPAMD_STAT_TOKEN_FLAG_META|RSPAMD_STAT_TOKEN_FLAG_LUA_META)) {
+               lua_pushstring (L, "meta");
+               lua_rawseti (L, -2, fl_cnt ++);
+       }
+       if (w->flags & RSPAMD_STAT_TOKEN_FLAG_STOP_WORD) {
+               lua_pushstring (L, "stop_word");
+               lua_rawseti (L, -2, fl_cnt ++);
+       }
+       if (w->flags & RSPAMD_STAT_TOKEN_FLAG_INVISIBLE_SPACES) {
+               lua_pushstring (L, "invisible_spaces");
+               lua_rawseti (L, -2, fl_cnt ++);
+       }
+       if (w->flags & RSPAMD_STAT_TOKEN_FLAG_STEMMED) {
+               lua_pushstring (L, "stemmed");
+               lua_rawseti (L, -2, fl_cnt ++);
+       }
+
+       /* Store the flags table as the fourth element of the result */
+       lua_rawseti (L, -2, 4);
+}
+
 gint
 rspamd_lua_push_words (lua_State *L, GArray *words,
                                                        enum rspamd_lua_words_type how)
 {
        rspamd_stat_token_t *w;
-       guint i, cnt, fl_cnt;
+       guint i, cnt;
 
        lua_createtable (L, words->len, 0);
 
@@ -2440,78 +2518,7 @@ rspamd_lua_push_words (lua_State *L, GArray *words,
                        }
                        break;
                case RSPAMD_LUA_WORDS_FULL:
-                       lua_createtable (L, 4, 0);
-
-                       if (w->stemmed.len > 0) {
-                               lua_pushlstring (L, w->stemmed.begin, w->stemmed.len);
-                               lua_rawseti (L, -2, 1);
-                       }
-                       else {
-                               lua_pushstring (L, "");
-                               lua_rawseti (L, -2, 1);
-                       }
-
-                       if (w->normalized.len > 0) {
-                               lua_pushlstring (L, w->normalized.begin, w->normalized.len);
-                               lua_rawseti (L, -2, 2);
-                       }
-                       else {
-                               lua_pushstring (L, "");
-                               lua_rawseti (L, -2, 2);
-                       }
-
-                       if (w->original.len > 0) {
-                               lua_pushlstring (L, w->original.begin, w->original.len);
-                               lua_rawseti (L, -2, 3);
-                       }
-                       else {
-                               lua_pushstring (L, "");
-                               lua_rawseti (L, -2, 3);
-                       }
-
-                       /* Flags part */
-                       fl_cnt = 1;
-                       lua_createtable (L, 4, 0);
-
-                       if (w->flags & RSPAMD_STAT_TOKEN_FLAG_NORMALISED) {
-                               lua_pushstring (L, "normalised");
-                               lua_rawseti (L, -2, fl_cnt ++);
-                       }
-                       if (w->flags & RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE) {
-                               lua_pushstring (L, "broken_unicode");
-                               lua_rawseti (L, -2, fl_cnt ++);
-                       }
-                       if (w->flags & RSPAMD_STAT_TOKEN_FLAG_UTF) {
-                               lua_pushstring (L, "utf");
-                               lua_rawseti (L, -2, fl_cnt ++);
-                       }
-                       if (w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) {
-                               lua_pushstring (L, "text");
-                               lua_rawseti (L, -2, fl_cnt ++);
-                       }
-                       if (w->flags & RSPAMD_STAT_TOKEN_FLAG_HEADER) {
-                               lua_pushstring (L, "header");
-                               lua_rawseti (L, -2, fl_cnt ++);
-                       }
-                       if (w->flags & (RSPAMD_STAT_TOKEN_FLAG_META|RSPAMD_STAT_TOKEN_FLAG_LUA_META)) {
-                               lua_pushstring (L, "meta");
-                               lua_rawseti (L, -2, fl_cnt ++);
-                       }
-                       if (w->flags & RSPAMD_STAT_TOKEN_FLAG_STOP_WORD) {
-                               lua_pushstring (L, "stop_word");
-                               lua_rawseti (L, -2, fl_cnt ++);
-                       }
-                       if (w->flags & RSPAMD_STAT_TOKEN_FLAG_INVISIBLE_SPACES) {
-                               lua_pushstring (L, "invisible_spaces");
-                               lua_rawseti (L, -2, fl_cnt ++);
-                       }
-                       if (w->flags & RSPAMD_STAT_TOKEN_FLAG_STEMMED) {
-                               lua_pushstring (L, "stemmed");
-                               lua_rawseti (L, -2, fl_cnt ++);
-                       }
-
-                       lua_rawseti (L, -2, 4);
-
+                       rspamd_lua_push_full_word (L, w);
                        /* Push to the resulting vector */
                        lua_rawseti (L, -2, cnt ++);
                        break;
index 25f5b7ff4892368b89896161ca74c5a924dfbbd9..31d7f852b45cb20bbddd58ee6eaa1b7c09ef3598 100644 (file)
@@ -433,6 +433,13 @@ gboolean rspamd_lua_require_function (lua_State *L, const gchar *modname,
 gboolean rspamd_lua_try_load_redis (lua_State *L, const ucl_object_t *obj,
                struct rspamd_config *cfg, gint *ref_id);
 
+struct rspamd_stat_token_s;
+/**
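+ * Pushes a single word into Lua as a table of the form
+ * {stemmed, normalized, original, flags}, where flags is an array of strings
+ * @param L Lua state
+ * @param word token to push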
+ */
+void rspamd_lua_push_full_word (lua_State *L, struct rspamd_stat_token_s *word);
 
 enum rspamd_lua_words_type {
        RSPAMD_LUA_WORDS_STEM = 0,
index 6f4923dc8c99cfa2868b85811b5f33b0c068cc74..a8a53f517f4108b9462322bec3d5bb1fbcfed2d3 100644 (file)
 #include "unix-std.h"
 #include "libmime/smtp_parsers.h"
 #include "libserver/mempool_vars_internal.h"
+#include "libserver/dkim.h"
 #include "libserver/task.h"
 #include "libstat/stat_api.h"
+#include "libutil/map_helpers.h"
+
 #include <math.h>
-#include <src/libserver/task.h>
-#include <src/libserver/dkim.h>
 
 /***
  * @module rspamd_task
@@ -958,6 +959,17 @@ LUA_FUNCTION_DEF (task, get_newlines_type);
  */
 LUA_FUNCTION_DEF (task, get_stat_tokens);
 
+/***
+ * @method task:lookup_words(map, function({s, n, o, f}) ... end)
+ * Matches words in a task (including meta words), in their normalised form,
+ * against some map (set, regexp and so on) and calls the specified function
+ * for each matched word with a table containing 4 values:
+ *   - [1] - stemmed word
+ *   - [2] - normalised word
+ *   - [3] - raw word
+ *   - [4] - flags (table of strings)
+ * @return {number} number of matched words
+ */
+LUA_FUNCTION_DEF (task, lookup_words);
+
 static const struct luaL_reg tasklib_f[] = {
        LUA_INTERFACE_DEF (task, load_from_file),
        LUA_INTERFACE_DEF (task, load_from_string),
@@ -1060,6 +1072,7 @@ static const struct luaL_reg tasklib_m[] = {
        LUA_INTERFACE_DEF (task, get_newlines_type),
        LUA_INTERFACE_DEF (task, get_stat_tokens),
        LUA_INTERFACE_DEF (task, get_meta_words),
+       LUA_INTERFACE_DEF (task, lookup_words),
        {"__tostring", rspamd_lua_class_tostring},
        {NULL, NULL}
 };
@@ -5171,6 +5184,111 @@ lua_task_get_meta_words (lua_State *L)
        return 1;
 }
 
+/**
+ * Matches every token of `words` against `map` and invokes the Lua callback
+ * for each token that matches.
+ * Tokens with an empty normalized form are skipped; the normalized form is
+ * the key used for the lookup.
+ * @param L Lua state
+ * @param cbpos stack index of the Lua callback function
+ * @param task task used for error logging
+ * @param map map to match against (set, hash or regexp types are handled)
+ * @param words array of rspamd_stat_token_t elements
+ * @return number of matched tokens (counted even if the callback fails)
+ */
+static guint
+lua_lookup_words_array (lua_State *L,
+                                               gint cbpos,
+                                               struct rspamd_task *task,
+                                               struct rspamd_lua_map *map,
+                                               GArray *words)
+{
+       rspamd_stat_token_t *tok;
+       guint i, nmatched = 0;
+       gint err_idx;
+       gboolean matched;
+       const gchar *key;
+       gsize keylen;
+
+       for (i = 0; i < words->len; i ++) {
+               tok = &g_array_index (words, rspamd_stat_token_t, i);
+
+               matched = FALSE;
+
+               /* Nothing to look up for tokens without a normalized form */
+               if (tok->normalized.len == 0) {
+                       continue;
+               }
+
+               key = tok->normalized.begin;
+               keylen = tok->normalized.len;
+
+               switch (map->type) {
+               case RSPAMD_LUA_MAP_SET:
+               case RSPAMD_LUA_MAP_HASH:
+                       /* We know that tok->normalized is zero terminated in fact */
+                       if (rspamd_match_hash_map (map->data.hash, key)) {
+                               matched = TRUE;
+                       }
+                       break;
+               case RSPAMD_LUA_MAP_REGEXP:
+               case RSPAMD_LUA_MAP_REGEXP_MULTIPLE:
+                       if (rspamd_match_regexp_map_single (map->data.re_map, key,
+                                       keylen)) {
+                               matched = TRUE;
+                       }
+                       break;
+               default:
+                       /* Other map types are rejected by lua_task_lookup_words before this call */
+                       g_assert_not_reached ();
+                       break;
+               }
+
+               if (matched) {
+                       nmatched ++;
+
+                       /* Error handler goes first so that lua_pcall can refer to it by index */
+                       lua_pushcfunction (L, &rspamd_lua_traceback);
+                       err_idx = lua_gettop (L);
+                       lua_pushvalue (L, cbpos); /* Function */
+                       rspamd_lua_push_full_word (L, tok);
+
+                       if (lua_pcall (L, 1, 0, err_idx) != 0) {
+                               /*
+                                * NOTE(review): the error value is treated as a GString,
+                                * presumably the one produced by rspamd_lua_traceback - confirm
+                                */
+                               GString *tb = lua_touserdata (L, -1);
+                               msg_err_task ("cannot call callback function for lookup words: %s",
+                                               tb->str);
+                               g_string_free (tb, TRUE);
+                       }
+
+                       /* Drop the error handler and anything left above it */
+                       lua_settop (L, err_idx - 1);
+               }
+       }
+
+       return nmatched;
+}
+
+/**
+ * Lua binding for task:lookup_words(map, callback).
+ * Expects a task at stack index 1, a map at index 2 and a function at index 3;
+ * raises a Lua error on invalid arguments or on a map type other than
+ * set, hash, regexp or multiple regexp.
+ * Looks up the words of every text part and then the task meta words,
+ * and pushes the total number of matches.
+ * @param L Lua state
+ * @return number of values pushed onto the Lua stack (always 1 on success)
+ */
+static gint
+lua_task_lookup_words (lua_State *L)
+{
+       LUA_TRACE_POINT;
+       struct rspamd_task *task = lua_check_task (L, 1);
+       struct rspamd_lua_map *map = lua_check_map (L, 2);
+       struct rspamd_mime_text_part *tp;
+
+       guint i, matches = 0;
+
+       if (task == NULL || map == NULL || lua_type (L, 3) != LUA_TFUNCTION) {
+               return luaL_error (L, "invalid arguments");
+       }
+
+       /* Only these map types are handled by lua_lookup_words_array */
+       if (map->type != RSPAMD_LUA_MAP_SET &&
+               map->type != RSPAMD_LUA_MAP_REGEXP &&
+               map->type != RSPAMD_LUA_MAP_HASH &&
+               map->type != RSPAMD_LUA_MAP_REGEXP_MULTIPLE) {
+               return luaL_error (L, "invalid map type");
+       }
+
+       /* Words of each text part; parts without utf_words are skipped */
+       PTR_ARRAY_FOREACH (task->text_parts, i, tp) {
+               if (tp->utf_words) {
+                       matches += lua_lookup_words_array (L, 3, task, map, tp->utf_words);
+               }
+       }
+
+       /* Meta words are looked up as well when present */
+       if (task->meta_words) {
+               matches += lua_lookup_words_array (L, 3, task, map, task->meta_words);
+       }
+
+       lua_pushinteger (L, matches);
+
+       return 1;
+}
+
+
 /* Image functions */
 static gint
 lua_image_get_width (lua_State *L)