From: Vsevolod Stakhov Date: Tue, 2 Jun 2026 18:04:34 +0000 (+0100) Subject: [Feature] protocol: Expose custom metadata for /checkv3 X-Git-Tag: 4.1.0~9^2 X-Git-Url: http://git.ipfire.org/gitweb/index.cgi?a=commitdiff_plain;h=ecd173f5a4806b10bb64bb107d37daad4769ba8b;p=thirdparty%2Frspamd.git [Feature] protocol: Expose custom metadata for /checkv3 Add two complementary ways to read custom fields sent with a /checkv3 multipart scan request, both free of the 80KB HTTP header limit that v2 hits, since the metadata travels in the multipart body: * A "headers" sub-object in the metadata part is injected into the task request headers, so task:get_request_header() works for custom fields exactly like v2 HTTP request headers. Reserved control-header names (shm/file/path/dictionary/Content-Encoding...) are skipped so client metadata cannot collide with the message-loading channel, and a repeated name (collapsed by UCL into an array) expands to a multi-valued request header. * The parsed metadata object is kept on task->meta and exposed to Lua via task:get_metadata() and task:get_metadata_field(key), mirroring get_settings()/lookup_settings(). The task now owns the object and frees it once in rspamd_task_free instead of via a pool destructor. rspamc gains a repeatable --metadata-header KEY=VALUE option that builds the metadata "headers" sub-object for v3 requests. Also drop a dead is_msgpack variable in the v3 request handler. Tests: functional cases in 430_checkv3.robot plus a checkv3_meta.lua plugin exercising both options via raw multipart and rspamc. --- diff --git a/src/client/rspamc.cxx b/src/client/rspamc.cxx index b7cffdf4ae..13fd507f30 100644 --- a/src/client/rspamc.cxx +++ b/src/client/rspamc.cxx @@ -65,6 +65,7 @@ static const char *local_addr = nullptr; static const char *execute = nullptr; static const char *sort = nullptr; static const char **http_headers = nullptr; +static const char **metadata_headers = nullptr; static const char **exclude_patterns = nullptr; static int weight = 0; static int flag = 0; @@ -177,6 +178,8 @@ static GOptionEntry entries[] = "Write mime body of message with headers instead of just a scan's result", nullptr}, {"header", 0, 0, G_OPTION_ARG_STRING_ARRAY, &http_headers, "Add custom HTTP header to query (can be repeated)", nullptr}, + {"metadata-header", 0, 0, G_OPTION_ARG_STRING_ARRAY, &metadata_headers, + "Add custom field to v3 metadata headers as KEY=VALUE or KEY:VALUE (can be repeated)", nullptr}, {"exclude", 0, 0, G_OPTION_ARG_STRING_ARRAY, &exclude_patterns, "Exclude specific glob patterns in file names (can be repeated)", nullptr}, {"sort", 0, 0, G_OPTION_ARG_STRING, &sort, @@ -2380,6 +2383,46 @@ rspamc_process_input(struct ev_loop *ev_base, const struct rspamc_command &cmd, ucl_object_unref(flags_arr); } + /* + * Custom metadata headers: carried in the metadata body part and + * exposed server-side via task:get_request_header(), free of the + * HTTP header size limit. + */ + if (metadata_headers) { + ucl_object_t *hdrs_obj = ucl_object_typed_new(UCL_OBJECT); + unsigned int nhdrs = 0; + + for (auto *mhdr = metadata_headers; *mhdr; mhdr++) { + std::string_view hdr_view{*mhdr}; + auto delim_pos = std::find_if(std::begin(hdr_view), std::end(hdr_view), + [](auto c) { return c == ':' || c == '='; }); + std::string key, val; + + if (delim_pos == std::end(hdr_view)) { + key = std::string{hdr_view}; + } + else { + auto off = std::distance(std::begin(hdr_view), delim_pos); + key = std::string{hdr_view.substr(0, off)}; + val = std::string{hdr_view.substr(off + 1)}; + } + + if (!key.empty()) { + ucl_object_insert_key(hdrs_obj, + ucl_object_fromstring(val.c_str()), + key.c_str(), 0, true); + nhdrs++; + } + } + + if (nhdrs > 0) { + ucl_object_insert_key(metadata, hdrs_obj, "headers", 0, false); + } + else { + ucl_object_unref(hdrs_obj); + } + } + rspamd_client_command_v3(conn, "checkv3", metadata, in, rspamc_client_cb, cbdata, compressed, msgpack_mode, diff --git a/src/libserver/protocol.c b/src/libserver/protocol.c index b795d9a2f6..896e965b85 100644 --- a/src/libserver/protocol.c +++ b/src/libserver/protocol.c @@ -2327,6 +2327,35 @@ void rspamd_protocol_write_log_pipe(struct rspamd_task *task) g_array_free(extra, TRUE); } +/* + * Inject a single metadata "headers" entry into the task request headers. + * The ftok structs point directly into the metadata UCL object (owned by + * task->meta for the whole task lifetime), so no copy of the bytes is needed. + * Lengths come from the UCL accessors, so embedded NULs (msgpack) are kept. + */ +static void +rspamd_protocol_metadata_add_header(struct rspamd_task *task, + const char *key, gsize klen, + const ucl_object_t *val_obj) +{ + gsize vlen; + const char *val = ucl_object_tolstring(val_obj, &vlen); + rspamd_ftok_t *name_tok, *val_tok; + + if (val == NULL) { + return; + } + + name_tok = rspamd_mempool_alloc(task->task_pool, sizeof(*name_tok)); + val_tok = rspamd_mempool_alloc(task->task_pool, sizeof(*val_tok)); + name_tok->begin = key; + name_tok->len = klen; + val_tok->begin = val; + val_tok->len = vlen; + + rspamd_task_add_request_header(task, name_tok, val_tok); +} + /* * Handle metadata from a parsed UCL object for v3 protocol. * Maps structured metadata fields to task fields. @@ -2582,6 +2611,72 @@ rspamd_protocol_handle_metadata(struct rspamd_task *task, } } + /* + * headers (object: header-name -> string value, or array of strings when a + * name is repeated) + * + * Custom fields carried in the metadata body part are exposed as task + * request headers, so they are retrievable via task:get_request_header() + * exactly like v2 HTTP request headers - but without the HTTP header size + * limit, since the metadata travels in the multipart body. + * + * NB: task->request_headers is also the control channel that + * rspamd_task_load_message consults for message-loading directives + * (shm/file/path/dictionary/Content-Encoding...). Those reserved names are + * skipped here so client-supplied metadata can never collide with them. + */ + elt = ucl_object_lookup(metadata, "headers"); + if (elt && ucl_object_type(elt) == UCL_OBJECT) { + static const char *reserved_hdrs[] = { + "shm", "shm-offset", "shm-length", "file", "path", + "dictionary", "compression", "content-encoding"}; + ucl_object_iter_t it = NULL; + + while ((cur = ucl_object_iterate(elt, &it, true)) != NULL) { + gsize klen; + const char *key = ucl_object_keyl(cur, &klen); + gboolean reserved = FALSE; + unsigned int i; + + if (key == NULL || klen == 0) { + continue; + } + + for (i = 0; i < G_N_ELEMENTS(reserved_hdrs); i++) { + if (strlen(reserved_hdrs[i]) == klen && + rspamd_lc_cmp(key, reserved_hdrs[i], klen) == 0) { + reserved = TRUE; + break; + } + } + + if (reserved) { + msg_info_protocol("ignore reserved metadata header '%*s'", + (int) klen, key); + continue; + } + + if (ucl_object_type(cur) == UCL_STRING) { + rspamd_protocol_metadata_add_header(task, key, klen, cur); + } + else if (ucl_object_type(cur) == UCL_ARRAY) { + /* + * A repeated header name is collapsed by the UCL parser into an + * array under that key; expand each string value into its own + * request header (request headers are multi-valued). + */ + ucl_object_iter_t ait = NULL; + const ucl_object_t *aval; + + while ((aval = ucl_object_iterate(cur, &ait, true)) != NULL) { + if (ucl_object_type(aval) == UCL_STRING) { + rspamd_protocol_metadata_add_header(task, key, klen, aval); + } + } + } + } + } + return TRUE; } @@ -2854,9 +2949,12 @@ rspamd_protocol_handle_v3_request(struct rspamd_task *task, return FALSE; } - rspamd_mempool_add_destructor(task->task_pool, - (rspamd_mempool_destruct_t) ucl_object_unref, - metadata_obj); + /* + * The task takes ownership of the metadata object; it is unref'd in + * rspamd_task_free. Keeping it alive for the whole task lifetime also + * exposes it to Lua via task:get_metadata()/get_metadata_field(). + */ + task->meta = metadata_obj; /* Apply metadata to task */ if (!rspamd_protocol_handle_metadata(task, metadata_obj)) { diff --git a/src/libserver/task.c b/src/libserver/task.c index 2b5a443cf8..31953cf16a 100644 --- a/src/libserver/task.c +++ b/src/libserver/task.c @@ -297,6 +297,10 @@ void rspamd_task_free(struct rspamd_task *task) ucl_object_unref(task->settings); } + if (task->meta != NULL) { + ucl_object_unref(task->meta); + } + if (task->settings_elt != NULL) { REF_RELEASE(task->settings_elt); } diff --git a/src/libserver/task.h b/src/libserver/task.h index fe8cbf9012..78cd9319ed 100644 --- a/src/libserver/task.h +++ b/src/libserver/task.h @@ -224,6 +224,7 @@ struct rspamd_task { const char *classifier; /**< Classifier to learn (if needed) */ struct rspamd_lang_detector *lang_det; /**< Languages detector */ struct rspamd_message *message; + ucl_object_t *meta; /**< custom metadata object from a checkv3 request (or NULL) */ /* ESMTP arguments from milter protocol */ GHashTable *mail_esmtp_args; /**< ESMTP arguments from MAIL FROM command */ diff --git a/src/lua/lua_task.c b/src/lua/lua_task.c index 7d70ff168f..44477f8fac 100644 --- a/src/lua/lua_task.c +++ b/src/lua/lua_task.c @@ -1061,6 +1061,22 @@ LUA_FUNCTION_DEF(task, get_settings); */ LUA_FUNCTION_DEF(task, lookup_settings); +/*** + * @method task:get_metadata() + * Gets the custom metadata object supplied with a /checkv3 multipart request. + * Returns nil for requests that carried no metadata part (e.g. /checkv2). + * @return {lua object|nil} lua object generated from the metadata UCL + */ +LUA_FUNCTION_DEF(task, get_metadata); + +/*** + * @method task:get_metadata_field(key) + * Gets a single top-level field from the /checkv3 metadata object. + * @param {string} key optional; if omitted the whole metadata object is returned (mirrors lookup_settings) + * @return {lua object|nil} lua object generated from the metadata field + */ +LUA_FUNCTION_DEF(task, get_metadata_field); + /*** * @method task:get_settings_id() * Get numeric hash of settings id if specified for this task. 0 is returned otherwise. @@ -1478,6 +1494,8 @@ static const struct luaL_reg tasklib_m[] = { LUA_INTERFACE_DEF(task, set_settings), LUA_INTERFACE_DEF(task, get_settings), LUA_INTERFACE_DEF(task, lookup_settings), + LUA_INTERFACE_DEF(task, get_metadata), + LUA_INTERFACE_DEF(task, get_metadata_field), LUA_INTERFACE_DEF(task, get_settings_id), LUA_INTERFACE_DEF(task, set_settings_id), LUA_INTERFACE_DEF(task, merge_and_apply_settings), @@ -6855,6 +6873,68 @@ lua_task_lookup_settings(lua_State *L) return 1; } +static int +lua_task_get_metadata(lua_State *L) +{ + LUA_TRACE_POINT; + struct rspamd_task *task = lua_check_task(L, 1); + + if (task != NULL) { + + if (task->meta) { + return ucl_object_push_lua(L, task->meta, true); + } + else { + lua_pushnil(L); + } + } + else { + return luaL_error(L, "invalid arguments"); + } + + return 1; +} + +static int +lua_task_get_metadata_field(lua_State *L) +{ + LUA_TRACE_POINT; + struct rspamd_task *task = lua_check_task(L, 1); + const char *key = NULL; + const ucl_object_t *elt; + + if (task != NULL) { + + if (lua_isstring(L, 2)) { + key = lua_tostring(L, 2); + } + + if (task->meta) { + if (key == NULL) { + return ucl_object_push_lua(L, task->meta, true); + } + else { + elt = ucl_object_lookup(task->meta, key); + + if (elt) { + return ucl_object_push_lua(L, elt, true); + } + else { + lua_pushnil(L); + } + } + } + else { + lua_pushnil(L); + } + } + else { + return luaL_error(L, "invalid arguments"); + } + + return 1; +} + static int lua_task_get_settings_id(lua_State *L) { diff --git a/test/functional/cases/001_merged/430_checkv3.robot b/test/functional/cases/001_merged/430_checkv3.robot index fc013c06a3..fc046f9b4d 100644 --- a/test/functional/cases/001_merged/430_checkv3.robot +++ b/test/functional/cases/001_merged/430_checkv3.robot @@ -88,3 +88,23 @@ checkv3 via rspamc encrypted with msgpack ${result} = Run Rspamc -p -h ${RSPAMD_LOCAL_ADDR}:${RSPAMD_PORT_NORMAL} --protocol-v3 ... --msgpack --key ${RSPAMD_KEY_PUB1} --settings=${SETTINGS_NOSYMBOLS} ${GTUBE} Check Rspamc ${result} GTUBE ( + +checkv3 custom metadata header via get_request_header + [Documentation] Custom field in the metadata "headers" sub-object is retrievable via task:get_request_header + &{V3_HDRS} = Create Dictionary X-V3-Custom=hello-from-meta + &{V3_META} = Create Dictionary headers=${V3_HDRS} + Scan File V3 ${MESSAGE} metadata=${V3_META} + Expect Symbol With Option TEST_V3_META_HEADER hello-from-meta + +checkv3 metadata fields via get_metadata and get_metadata_field + [Documentation] Arbitrary top-level metadata fields are readable via task:get_metadata()/get_metadata_field() + &{V3_META} = Create Dictionary custom_field=meta-value-42 + Scan File V3 ${MESSAGE} metadata=${V3_META} + Expect Symbol With Option TEST_V3_META_FIELD meta-value-42 + Expect Symbol With Option TEST_V3_META_FIELD_LOOKUP meta-value-42 + +checkv3 via rspamc with metadata-header + [Documentation] rspamc --metadata-header injects a metadata header retrievable via task:get_request_header + ${result} = Run Rspamc -p -h ${RSPAMD_LOCAL_ADDR}:${RSPAMD_PORT_NORMAL} --protocol-v3 + ... --metadata-header=X-V3-Custom=from-rspamc ${MESSAGE} + Check Rspamc ${result} TEST_V3_META_HEADER ( diff --git a/test/functional/configs/merged.conf b/test/functional/configs/merged.conf index 6e270432a5..4b03265876 100644 --- a/test/functional/configs/merged.conf +++ b/test/functional/configs/merged.conf @@ -43,6 +43,9 @@ lua = "{= env.TESTDIR =}/lua/external_relay.lua" # 109_composites_postfilter lua = "{= env.TESTDIR =}/lua/composites_postfilter.lua" +# 430_checkv3 custom metadata +lua = "{= env.TESTDIR =}/lua/checkv3_meta.lua" + .include(priority=1,duplicate=merge) "{= env.TESTDIR =}/configs/merged-local.conf" .include(priority=2,duplicate=replace) "{= env.TESTDIR =}/configs/merged-override.conf" diff --git a/test/functional/lua/checkv3_meta.lua b/test/functional/lua/checkv3_meta.lua new file mode 100644 index 0000000000..3e001db1aa --- /dev/null +++ b/test/functional/lua/checkv3_meta.lua @@ -0,0 +1,39 @@ +-- Symbols exercising the /checkv3 custom-metadata feature. +-- +-- Option A: a custom field carried in the metadata "headers" sub-object is +-- exposed as a task request header (task:get_request_header). +-- Option B: arbitrary metadata fields are readable via task:get_metadata() +-- and task:get_metadata_field(key). +-- +-- All callbacks are no-ops unless their specific field is present, so the +-- symbols stay inert for every other suite sharing the merged config. + +rspamd_config:register_symbol({ + name = 'TEST_V3_META_HEADER', + score = 1.0, + callback = function(task) + local h = task:get_request_header('X-V3-Custom') + if not h then return end + return true, tostring(h) + end +}) + +rspamd_config:register_symbol({ + name = 'TEST_V3_META_FIELD', + score = 1.0, + callback = function(task) + local meta = task:get_metadata() + if not meta or not meta.custom_field then return end + return true, tostring(meta.custom_field) + end +}) + +rspamd_config:register_symbol({ + name = 'TEST_V3_META_FIELD_LOOKUP', + score = 1.0, + callback = function(task) + local v = task:get_metadata_field('custom_field') + if not v then return end + return true, tostring(v) + end +})