From 1d95f1678637f30b6bf453f781b5938d64354228 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Mon, 12 Dec 2016 13:44:08 +0000 Subject: [PATCH] [Rework] Implement content type parser for mime --- src/CMakeLists.txt | 11 ++- src/libmime/CMakeLists.txt | 3 +- src/libmime/content_type.c | 79 ++++++++++++++++ src/libmime/content_type.h | 61 +++++++++++++ src/libmime/smtp_parsers.h | 4 + src/ragel/content_type.rl | 40 ++++++++ src/ragel/content_type_parser.rl | 152 +++++++++++++++++++++++++++++++ 7 files changed, 347 insertions(+), 3 deletions(-) create mode 100644 src/libmime/content_type.c create mode 100644 src/libmime/content_type.h create mode 100644 src/ragel/content_type.rl create mode 100644 src/ragel/content_type_parser.rl diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 943a4dc9e6..02cf7e7f10 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -105,7 +105,8 @@ SET(RAGEL_DEPENDS "${CMAKE_SOURCE_DIR}/src/ragel/smtp_address.rl" "${CMAKE_SOURCE_DIR}/src/ragel/smtp_date.rl" "${CMAKE_SOURCE_DIR}/src/ragel/smtp_ip.rl" "${CMAKE_SOURCE_DIR}/src/ragel/smtp_whitespace.rl" - "${CMAKE_SOURCE_DIR}/src/ragel/smtp_received.rl") + "${CMAKE_SOURCE_DIR}/src/ragel/smtp_received.rl" + "${CMAKE_SOURCE_DIR}/src/ragel/content_type.rl") RAGEL_TARGET(ragel_smtp_addr INPUTS ${CMAKE_SOURCE_DIR}/src/ragel/smtp_addr_parser.rl DEPENDS ${RAGEL_DEPENDS} @@ -121,6 +122,11 @@ RAGEL_TARGET(ragel_newlines_strip DEPENDS ${RAGEL_DEPENDS} COMPILE_FLAGS -G2 OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/newlines_strip.rl.c) +RAGEL_TARGET(ragel_content_type + INPUTS ${CMAKE_SOURCE_DIR}/src/ragel/content_type_parser.rl + DEPENDS ${RAGEL_DEPENDS} + COMPILE_FLAGS -G2 + OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/content_type.rl.c) ######################### LINK SECTION ############################### ADD_LIBRARY(rspamd-server STATIC @@ -134,7 +140,8 @@ ADD_LIBRARY(rspamd-server STATIC ${PLUGINSSRC} "${RAGEL_ragel_smtp_addr_OUTPUTS}" "${RAGEL_ragel_smtp_received_OUTPUTS}" - 
"${RAGEL_ragel_newlines_strip_OUTPUTS}") + "${RAGEL_ragel_newlines_strip_OUTPUTS}" + "${RAGEL_ragel_content_type_OUTPUTS}") TARGET_LINK_LIBRARIES(rspamd-server rspamd-http-parser) TARGET_LINK_LIBRARIES(rspamd-server rspamd-cdb) TARGET_LINK_LIBRARIES(rspamd-server rspamd-lpeg) diff --git a/src/libmime/CMakeLists.txt b/src/libmime/CMakeLists.txt index a4485461a9..39bd2d4026 100644 --- a/src/libmime/CMakeLists.txt +++ b/src/libmime/CMakeLists.txt @@ -5,6 +5,7 @@ SET(LIBRSPAMDMIMESRC ${CMAKE_CURRENT_SOURCE_DIR}/filter.c ${CMAKE_CURRENT_SOURCE_DIR}/images.c ${CMAKE_CURRENT_SOURCE_DIR}/message.c - ${CMAKE_CURRENT_SOURCE_DIR}/archives.c) + ${CMAKE_CURRENT_SOURCE_DIR}/archives.c + ${CMAKE_CURRENT_SOURCE_DIR}/content_type.c) SET(RSPAMD_MIME ${LIBRSPAMDMIMESRC} PARENT_SCOPE) \ No newline at end of file diff --git a/src/libmime/content_type.c b/src/libmime/content_type.c new file mode 100644 index 0000000000..9161850c9f --- /dev/null +++ b/src/libmime/content_type.c @@ -0,0 +1,79 @@ +/*- + * Copyright 2016 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include "libmime/content_type.h"
+#include "smtp_parsers.h"
+#include "utlist.h"
+
+/* Record name=value in ct->attrs; values of a repeated name share one list */
+void
+rspamd_content_type_add_param (rspamd_mempool_t *pool,
+		struct rspamd_content_type *ct,
+		const gchar *name_start, const gchar *name_end,
+		const gchar *value_start, const gchar *value_end)
+{
+	rspamd_ftok_t srch;
+	struct rspamd_content_type_param *found = NULL, *nparam;
+
+	g_assert (ct != NULL);
+	srch.begin = name_start;
+	srch.len = name_end - name_start;
+
+	if (ct->attrs) {
+		found = g_hash_table_lookup (ct->attrs, &srch);
+	}
+	else {
+		ct->attrs = g_hash_table_new (rspamd_ftok_icase_hash,
+				rspamd_ftok_icase_equal);
+	}
+
+	nparam = rspamd_mempool_alloc (pool, sizeof (*nparam));
+	nparam->name.begin = name_start;
+	nparam->name.len = name_end - name_start;
+	nparam->value.begin = value_start;
+	nparam->value.len = value_end - value_start;
+	/* Test before DL_APPEND: it turns a NULL head into nparam itself */
+	if (!found) {
+		g_hash_table_insert (ct->attrs, &nparam->name, nparam);
+	}
+	DL_APPEND (found, nparam);
+}
+
+/* Lowercase a pool-owned copy of `in` and parse it; NULL if parsing fails */
+struct rspamd_content_type *
+rspamd_content_type_parse (const gchar *in, gsize len, rspamd_mempool_t *pool)
+{
+	struct rspamd_content_type *res = NULL, val;
+	gchar *lc_data = rspamd_mempool_alloc (pool, len);
+
+	memcpy (lc_data, in, len);
+	rspamd_str_lc (lc_data, len);
+
+	if (rspamd_content_type_parser (lc_data, len, &val, pool)) {
+		res = rspamd_mempool_alloc (pool, sizeof (val));
+		memcpy (res, &val, sizeof (val));
+		res->lc_data = lc_data; /* the parser zeroed val.lc_data */
+		if (res->attrs) {
+			rspamd_mempool_add_destructor (pool,
+				(rspamd_mempool_destruct_t)g_hash_table_unref, res->attrs);
+		}
+	}
+	else {
+		msg_warn_pool ("cannot parse content type: %*s", (gint)len, lc_data);
+	}
+
+	return res;
+}
diff --git a/src/libmime/content_type.h b/src/libmime/content_type.h
new file mode 100644
index 0000000000..7c129936db
--- /dev/null
+++ b/src/libmime/content_type.h
@@ -0,0 +1,61 @@
+/*-
+ * Copyright 2016 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef SRC_LIBMIME_CONTENT_TYPE_H_
+#define SRC_LIBMIME_CONTENT_TYPE_H_
+
+#include "config.h"
+#include "libutil/fstring.h"
+#include "libutil/mem_pool.h"
+
+struct rspamd_content_type_param {
+	rspamd_ftok_t name;
+	rspamd_ftok_t value;
+	struct rspamd_content_type_param *prev, *next;
+};
+
+struct rspamd_content_type {
+	gchar *lc_data; /* Lowercased copy of the parsed input */
+	rspamd_ftok_t type;
+	rspamd_ftok_t subtype;
+	rspamd_ftok_t charset;
+	GHashTable *attrs; /* Can be NULL: created when the first param is added */
+};
+
+/**
+ * Adds new parameter to content type structure (storage comes from `pool`)
+ * @param ct content type to update, must not be NULL
+ * @param name_start first byte of the parameter name
+ * @param name_end one past the last byte of the parameter name
+ * @param value_start first byte of the parameter value
+ * @param value_end one past the last byte of the parameter value
+ */
+void
+rspamd_content_type_add_param (rspamd_mempool_t *pool,
+		struct rspamd_content_type *ct,
+		const gchar *name_start, const gchar *name_end,
+		const gchar *value_start, const gchar *value_end);
+
+/**
+ * Parse content type from the header (performs copy + lowercase)
+ * @param in header value to parse
+ * @param len number of bytes in `in`
+ * @param pool memory pool used for the copy and for the result
+ * @return parsed structure or NULL if the input cannot be parsed
+ */
+struct rspamd_content_type * rspamd_content_type_parse (const gchar *in,
+		gsize len, rspamd_mempool_t *pool);
+
+#endif /* SRC_LIBMIME_CONTENT_TYPE_H_ */
diff --git a/src/libmime/smtp_parsers.h b/src/libmime/smtp_parsers.h
index 07bd246885..0d6e234131 100644
--- a/src/libmime/smtp_parsers.h
+++ b/src/libmime/smtp_parsers.h
@@ -18,6 +18,7 @@
 
 #include "config.h"
 #include "email_addr.h"
+#include "content_type.h"
 #include "task.h"
 #include "message.h"
 
@@ -30,4 +31,7 @@ void rspamd_strip_newlines_parse (const gchar *begin, const gchar *pe,
 		GByteArray *data, gboolean is_html, guint *newlines_count,
 		GPtrArray *newlines);
 
+gboolean rspamd_content_type_parser (const char *data, size_t len,
+		struct rspamd_content_type *ct, rspamd_mempool_t *pool);
+
 #endif /* SRC_LIBMIME_SMTP_PARSERS_H_ */
diff --git a/src/ragel/content_type.rl b/src/ragel/content_type.rl
new file mode 100644
index 0000000000..d9c222e5c1
--- /dev/null
+++ b/src/ragel/content_type.rl
@@ -0,0 +1,40 @@
+%%{
+  machine content_type;
+  include smtp_whitespace "smtp_whitespace.rl";
+
+  # https://tools.ietf.org/html/rfc2045#section-5.1
+
+  ccontent = ctext | FWS | '(' @{ fcall balanced_ccontent; };
+  balanced_ccontent := ccontent* ')' @{ fret; };
+  comment = "(" (FWS? ccontent)* FWS? ")";
+  CFWS = ((FWS? comment)+ FWS?) | FWS;
+  qcontent = qtextSMTP | quoted_pairSMTP;
+  quoted_string = CFWS?
+                  (DQUOTE
+                    (((FWS? qcontent)* FWS?) >Quoted_Str_Start %Quoted_Str_End)
+                  DQUOTE) CFWS?;
+  token = 0x21..0x27 | 0x2a..0x2b | 0x2c..0x2e | 0x30..0x39 | 0x41..0x5a | 0x5e..0x7e;
+  value = (quoted_string | (token -- '"' | 0x3d)+) >Param_Value_Start %Param_Value_End;
+  attribute = (token+) >Param_Name_Start %Param_Name_End;
+  parameter = CFWS? attribute "=" value CFWS?;
+
+  ietf_token = token+;
+  custom_x_token = 'x'i "-" token+;
+  extension_token = ietf_token | custom_x_token;
+  discrete_type = 'text'i | 'image'i | 'audio'i | 'video'i |
+                  'application'i | extension_token;
+  composite_type = 'message'i | 'multipart'i | extension_token;
+  iana_token = token+;
+  main_type = (discrete_type | composite_type) >Type_Start %Type_End;
+  sub_type = (extension_token | iana_token) >Subtype_Start %Subtype_End;
+  content_type = main_type ("/" sub_type)? (((CFWS? ";"+) | CFWS) parameter CFWS?)*;
+
+  prepush {
+    if (top >= st_storage.size) {
+      st_storage.size = (top + 1) * 2;
+      st_storage.data = realloc (st_storage.data, st_storage.size * sizeof (int));
+      g_assert (st_storage.data != NULL);
+      stack = st_storage.data;
+    }
+  }
+}%%
\ No newline at end of file
diff --git a/src/ragel/content_type_parser.rl b/src/ragel/content_type_parser.rl
new file mode 100644
index 0000000000..aec3db2874
--- /dev/null
+++ b/src/ragel/content_type_parser.rl
@@ -0,0 +1,152 @@
+%%{
+  machine content_type_parser;
+
+  action Type_Start {
+    qstart = NULL;
+    qend = NULL;
+    ct->type.begin = p;
+  }
+
+  action Type_End {
+    if (qstart) {
+      ct->type.begin = qstart;
+    }
+    if (qend && qend >= qstart) {
+      ct->type.len = qend - qstart;
+    }
+    else if (p >= ct->type.begin) {
+      ct->type.len = p - ct->type.begin;
+    }
+    qstart = NULL;
+    qend = NULL;
+  }
+
+  action Subtype_Start {
+    qstart = NULL;
+    qend = NULL;
+    ct->subtype.begin = p;
+  }
+
+  action Subtype_End {
+    if (qstart) {
+      ct->subtype.begin = qstart;
+    }
+    if (qend && qend >= qstart) {
+      ct->subtype.len = qend - qstart;
+    }
+    else if (p >= ct->subtype.begin) {
+      ct->subtype.len = p - ct->subtype.begin;
+    }
+    qstart = NULL;
+    qend = NULL;
+  }
+
+  action Param_Name_Start {
+    qstart = NULL;
+    qend = NULL;
+    pname_start = p;
+    pname_end = NULL;
+  }
+
+  # Prefers the quoted-string bounds (qstart/qend) when they were recorded
+  action Param_Name_End {
+    if (qstart) {
+      pname_start = qstart;
+    }
+    if (qend && qend >= qstart) {
+      pname_end = qend;
+    }
+    else if (p >= pname_start) {
+      pname_end = p;
+    }
+    qstart = NULL;
+    qend = NULL;
+  }
+
+  # Value bounds are tracked only once a parameter name has been completed
+  action Param_Value_Start {
+    qstart = NULL;
+    qend = NULL;
+
+    if (pname_end) {
+      pvalue_start = p;
+      pvalue_end = NULL;
+    }
+  }
+
+  # Stores the parameter if name and value are non-empty, then resets state
+  action Param_Value_End {
+    if (pname_end) {
+      if (qstart) {
+        pvalue_start = qstart;
+      }
+      if (qend && qend >= qstart) {
+        pvalue_end = qend;
+      }
+      else if (p >= pvalue_start) {
+        pvalue_end = p;
+      }
+      qstart = NULL;
+      qend = NULL;
+
+      if (pvalue_end && pvalue_end > pvalue_start && pname_end > pname_start) {
+        rspamd_content_type_add_param (pool, ct, pname_start, pname_end, pvalue_start, pvalue_end);
+      }
+    }
+
+    pname_start = NULL;
+    pname_end = NULL;
+    pvalue_start = NULL;
+    pvalue_end = NULL;
+    qend = NULL;
+    qstart = NULL;
+  }
+
+  action Quoted_Str_Start {
+    qstart = p;
+    qend = NULL;
+  }
+
+  action Quoted_Str_End {
+    if (qstart) {
+      qend = p;
+    }
+  }
+
+  # Grammar rules that fire the actions defined above
+  include content_type "content_type.rl";
+
+  main := content_type;
+
+}%%
+
+#include "smtp_parsers.h"
+#include "content_type.h"
+
+%% write data;
+
+/* Run the content_type machine over data[0..len); *ct is zeroed first. */
+/* Returns TRUE only if a non-empty main type was stored in ct->type. */
+gboolean
+rspamd_content_type_parser (const char *data, size_t len, struct rspamd_content_type *ct, rspamd_mempool_t *pool)
+{
+  const char *p = data, *pe = data + len, *eof, *qstart = NULL, *qend = NULL,
+    *pname_start = NULL, *pname_end = NULL, *pvalue_start = NULL, *pvalue_end = NULL;
+  int cs, *stack = NULL;
+  gsize top = 0;
+  struct _ragel_st_storage {
+    int *data;
+    gsize size;
+  } st_storage;
+
+  memset (&st_storage, 0, sizeof (st_storage));
+  memset (ct, 0, sizeof (*ct));
+  eof = pe;
+
+  %% write init;
+  %% write exec;
+
+  free (st_storage.data);
+
+  return ct->type.len > 0;
+}
-- 
2.47.3