git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
[Rework] Further rework of lua urls extraction API
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Tue, 21 Apr 2020 15:07:40 +0000 (16:07 +0100)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Tue, 21 Apr 2020 15:07:40 +0000 (16:07 +0100)
src/lua/lua_task.c
src/lua/lua_url.c
src/lua/lua_url.h

index 5c7a8b0a427613ef01049ebda3be1d71a7e481b2..2ceb1c3c28b69b5426c0c07626f10f68eeb68d80 100644 (file)
@@ -2256,6 +2256,8 @@ lua_task_get_urls (lua_State * L)
        struct rspamd_task *task = lua_check_task (L, 1);
        struct lua_tree_cb_data cb;
        struct rspamd_url *u;
+       static const gint default_protocols_mask = PROTOCOL_HTTP|PROTOCOL_HTTPS|
+                                                                                          PROTOCOL_FILE|PROTOCOL_FTP;
        gsize sz, max_urls = 0;
 
        if (task) {
@@ -2269,15 +2271,15 @@ lua_task_get_urls (lua_State * L)
                        return 1;
                }
 
-               if (!lua_url_cbdata_fill (L, 2, &cb)) {
+               /* Exclude RSPAMD_URL_FLAG_CONTENT to preserve backward compatibility */
+               if (!lua_url_cbdata_fill (L, 2, &cb, default_protocols_mask,
+                               (~RSPAMD_URL_FLAG_CONTENT), max_urls)) {
                        return luaL_error (L, "invalid arguments");
                }
 
-               memset (&cb, 0, sizeof (cb));
-
                sz = kh_size (MESSAGE_FIELD (task, urls));
                sz = lua_url_adjust_skip_prob (task->task_timestamp,
-                               MESSAGE_FIELD (task, digest), &cb, sz, max_urls);
+                               MESSAGE_FIELD (task, digest), &cb, sz);
 
                lua_createtable (L, sz, 0);
 
@@ -2425,20 +2427,26 @@ lua_task_get_emails (lua_State * L)
        struct rspamd_task *task = lua_check_task (L, 1);
        struct lua_tree_cb_data cb;
        struct rspamd_url *u;
+       gsize max_urls = 0, sz;
 
        if (task) {
                if (task->message) {
-                       lua_createtable (L, kh_size (MESSAGE_FIELD (task, urls)), 0);
-                       memset (&cb, 0, sizeof (cb));
-                       cb.i = 1;
-                       cb.L = L;
-                       cb.mask = PROTOCOL_MAILTO;
+                       if (!lua_url_cbdata_fill (L, 2, &cb, PROTOCOL_MAILTO,
+                                       (~RSPAMD_URL_FLAG_CONTENT), max_urls)) {
+                               return luaL_error (L, "invalid arguments");
+                       }
+
+                       sz = kh_size (MESSAGE_FIELD (task, urls));
+                       sz = lua_url_adjust_skip_prob (task->task_timestamp,
+                                       MESSAGE_FIELD (task, digest), &cb, sz);
+
+                       lua_createtable (L, sz, 0);
 
                        kh_foreach_key (MESSAGE_FIELD (task, urls), u, {
-                               if ((u->protocol & PROTOCOL_MAILTO)) {
-                                       lua_tree_url_callback (u, u, &cb);
-                               }
+                               lua_tree_url_callback (u, u, &cb);
                        });
+
+                       lua_url_cbdata_dtor (&cb);
                }
                else {
                        lua_newtable (L);
index 65f0569a560e2d141e3d487f2ca8d82d060ddf73..45f9ab6831f0467260c9784c97e1a2de7375c53a 100644 (file)
@@ -933,10 +933,7 @@ lua_tree_url_callback (gpointer key, gpointer value, gpointer ud)
        struct rspamd_url *url = (struct rspamd_url *)value;
        struct lua_tree_cb_data *cb = ud;
 
-       if (url->protocol & cb->mask) {
-               if (!cb->need_images && (url->flags & RSPAMD_URL_FLAG_IMAGE)) {
-                       return;
-               }
+       if ((url->protocol & cb->protocols_mask) && (url->flags & cb->flags_mask)) {
 
                if (cb->skip_prob > 0) {
                        gdouble coin = rspamd_random_double_fast_seed (cb->xoroshiro_state);
@@ -955,35 +952,126 @@ lua_tree_url_callback (gpointer key, gpointer value, gpointer ud)
 }
 
 gboolean
-lua_url_cbdata_fill (lua_State *L, gint pos, struct lua_tree_cb_data *cbd)
+lua_url_cbdata_fill (lua_State *L,
+                                        gint pos,
+                                        struct lua_tree_cb_data *cbd,
+                                        guint default_protocols,
+                                        guint default_flags,
+                                        gsize max_urls)
 {
-       gboolean need_images = FALSE;
        gint protocols_mask = 0;
-       static const gint default_mask = PROTOCOL_HTTP|PROTOCOL_HTTPS|
-                                                                        PROTOCOL_FILE|PROTOCOL_FTP;
+
        gint pos_arg_type = lua_type (L, pos);
+       guint flags_mask = default_flags;
 
        if (pos_arg_type == LUA_TBOOLEAN) {
-               protocols_mask = default_mask;
+               protocols_mask = default_protocols;
                if (lua_toboolean (L, 2)) {
                        protocols_mask |= PROTOCOL_MAILTO;
                }
        }
        else if (pos_arg_type == LUA_TTABLE) {
-               for (lua_pushnil (L); lua_next (L, pos); lua_pop (L, 1)) {
-                       int nmask;
-                       const gchar *pname = lua_tostring (L, -1);
+               if (rspamd_lua_geti (L, 1, pos) == LUA_TNIL) {
+                       /* New method: indexed table */
+
+                       lua_getfield (L, pos, "flags");
+                       if (lua_istable (L, -1)) {
+                               for (lua_pushnil (L); lua_next (L, pos); lua_pop (L, 1)) {
+                                       int nmask = 0;
+                                       const gchar *fname = lua_tostring (L, -1);
+
+
+                                       if (rspamd_url_flag_from_string (fname, &nmask)) {
+                                               flags_mask |= nmask;
+                                       }
+                                       else {
+                                               msg_info ("bad url flag: %s", fname);
+                                               return FALSE;
+                                       }
+                               }
+                       }
+                       else {
+                               flags_mask |= default_flags;
+                       }
+                       lua_pop (L, 1);
+
+                       lua_getfield (L, pos, "protocols");
+                       if (lua_istable (L, -1)) {
+                               for (lua_pushnil (L); lua_next (L, pos); lua_pop (L, 1)) {
+                                       int nmask;
+                                       const gchar *pname = lua_tostring (L, -1);
+
+                                       nmask = rspamd_url_protocol_from_string (pname);
+
+                                       if (nmask != PROTOCOL_UNKNOWN) {
+                                               protocols_mask |= nmask;
+                                       }
+                                       else {
+                                               msg_info ("bad url protocol: %s", pname);
+                                               return FALSE;
+                                       }
+                               }
+                       }
+                       else {
+                               protocols_mask = default_protocols;
+                       }
+                       lua_pop (L, 1);
 
-                       nmask = rspamd_url_protocol_from_string (pname);
+                       lua_getfield (L, pos, "emails");
+                       if (lua_isboolean (L, -1)) {
+                               if (lua_toboolean (L, -1)) {
+                                       protocols_mask |= PROTOCOL_MAILTO;
+                               }
+                       }
+                       lua_pop (L, 1);
 
-                       if (nmask != PROTOCOL_UNKNOWN) {
-                               protocols_mask |= nmask;
+                       lua_getfield (L, pos, "images");
+                       if (lua_isboolean (L, -1)) {
+                               if (lua_toboolean (L, -1)) {
+                                       flags_mask |= RSPAMD_URL_FLAG_IMAGE;
+                               }
+                               else {
+                                       flags_mask &= ~RSPAMD_URL_FLAG_IMAGE;
+                               }
                        }
-                       else {
-                               msg_info ("bad url protocol: %s", pname);
-                               return FALSE;
+                       lua_pop (L, 1);
+
+                       lua_getfield (L, pos, "content");
+                       if (lua_isboolean (L, -1)) {
+                               if (lua_toboolean (L, -1)) {
+                                       flags_mask |= RSPAMD_URL_FLAG_CONTENT;
+                               }
+                               else {
+                                       flags_mask &= ~RSPAMD_URL_FLAG_CONTENT;
+                               }
                        }
+                       lua_pop (L, 1);
+
+                       lua_getfield (L, pos, "max_urls");
+                       if (lua_isnumber (L, -1)) {
+                               max_urls = lua_tonumber (L, -1);
+                       }
+                       lua_pop (L, 1);
                }
+               else {
+                       /* Plain table of the protocols */
+                       for (lua_pushnil (L); lua_next (L, pos); lua_pop (L, 1)) {
+                               int nmask;
+                               const gchar *pname = lua_tostring (L, -1);
+
+                               nmask = rspamd_url_protocol_from_string (pname);
+
+                               if (nmask != PROTOCOL_UNKNOWN) {
+                                       protocols_mask |= nmask;
+                               }
+                               else {
+                                       msg_info ("bad url protocol: %s", pname);
+                                       return FALSE;
+                               }
+                       }
+               }
+
+               lua_pop (L, 1); /* After rspamd_lua_geti */
        }
        else if (pos_arg_type == LUA_TSTRING) {
                const gchar *plist = lua_tostring (L, pos);
@@ -1012,22 +1100,29 @@ lua_url_cbdata_fill (lua_State *L, gint pos, struct lua_tree_cb_data *cbd)
                g_strfreev (strvec);
        }
        else if (pos_arg_type == LUA_TNONE || pos_arg_type == LUA_TNIL) {
-               protocols_mask = default_mask;
+               protocols_mask = default_protocols;
+               flags_mask = default_flags;
        }
        else {
                return FALSE;
        }
 
        if (lua_type (L, pos + 1) == LUA_TBOOLEAN) {
-               need_images = lua_toboolean (L, pos + 1);
+               if (lua_toboolean (L, pos + 1)) {
+                       flags_mask |= RSPAMD_URL_FLAG_IMAGE;
+               }
+               else {
+                       flags_mask &= ~RSPAMD_URL_FLAG_IMAGE;
+               }
        }
 
        memset (cbd, 0, sizeof (*cbd));
 
        cbd->i = 1;
        cbd->L = L;
-       cbd->mask = protocols_mask;
-       cbd->need_images = need_images;
+       cbd->max_urls = max_urls;
+       cbd->protocols_mask = protocols_mask;
+       cbd->flags_mask = flags_mask;
 
        /* This needs to be removed from the stack */
        rspamd_lua_class_metatable (L, "rspamd{url}");
@@ -1049,11 +1144,10 @@ gsize
 lua_url_adjust_skip_prob (gdouble timestamp,
                                                  guchar *digest,
                                                  struct lua_tree_cb_data *cb,
-                                                 gsize sz,
-                                                 gsize max_urls)
+                                                 gsize sz)
 {
-       if (max_urls > 0 && sz > max_urls) {
-               cb->skip_prob = 1.0 - ((gdouble)max_urls) / (gdouble)sz;
+       if (cb->max_urls > 0 && sz > cb->max_urls) {
+               cb->skip_prob = 1.0 - ((gdouble)cb->max_urls) / (gdouble)sz;
                /*
                 * Use task dependent probabilistic seed to ensure that
                 * consequent task:get_urls return the same list of urls
@@ -1062,7 +1156,7 @@ lua_url_adjust_skip_prob (gdouble timestamp,
                                MIN (sizeof (cb->xoroshiro_state[0]), sizeof (timestamp)));
                memcpy (&cb->xoroshiro_state[1], digest,
                                sizeof (cb->xoroshiro_state[1]) * 3);
-               sz = max_urls;
+               sz = cb->max_urls;
        }
 
        return sz;
index 57d20f9203d1d3b4f342e1b735331899b750c462..0ea2186d86ca6650ae1be261f2e0a7e31eea9f3b 100644 (file)
@@ -26,8 +26,9 @@ struct lua_tree_cb_data {
        lua_State *L;
        int i;
        int metatable_pos;
-       gint mask;
-       gint need_images;
+       guint flags_mask;
+       guint protocols_mask;
+       gsize max_urls;
        gdouble skip_prob;
        guint64 xoroshiro_state[4];
 };
@@ -41,7 +42,11 @@ void lua_tree_url_callback (gpointer key, gpointer value, gpointer ud);
  * @param cbd
  * @return
  */
-gboolean lua_url_cbdata_fill (lua_State *L, gint pos, struct lua_tree_cb_data *cbd);
+gboolean lua_url_cbdata_fill (lua_State *L, gint pos,
+                                                         struct lua_tree_cb_data *cbd,
+                                                         guint default_protocols,
+                                                         guint default_flags,
+                                                         gsize max_urls);
 
 /**
  * Cleanup url cbdata
@@ -61,8 +66,7 @@ void lua_url_cbdata_dtor (struct lua_tree_cb_data *cbd);
 gsize lua_url_adjust_skip_prob (gdouble timestamp,
                                                                guchar *digest,
                                                                struct lua_tree_cb_data *cb,
-                                                               gsize sz,
-                                                               gsize max_urls);
+                                                               gsize sz);
 
 #ifdef  __cplusplus
 }