From 49ae6e798310e5c4b96709db435a3714ea6468a8 Mon Sep 17 00:00:00 2001
From: Aki Tuomi
Date: Wed, 23 Jul 2025 13:06:43 +0300
Subject: [PATCH] lib-regex: Add regex matching library

---
 configure.ac               |   1 +
 src/Makefile.am            |   1 +
 src/lib-regex/Makefile.am  |  41 +++
 src/lib-regex/dregex.h     | 163 ++++++++++++
 src/lib-regex/empty.c      |   0
 src/lib-regex/regex.c      | 547 +++++++++++++++++++++++++++++++++++++
 src/lib-regex/test-regex.c | 306 +++++++++++++++++++++++
 src/lib/lib-event.c        |   2 +
 8 files changed, 1061 insertions(+)
 create mode 100644 src/lib-regex/Makefile.am
 create mode 100644 src/lib-regex/dregex.h
 create mode 100644 src/lib-regex/empty.c
 create mode 100644 src/lib-regex/regex.c
 create mode 100644 src/lib-regex/test-regex.c

diff --git a/configure.ac b/configure.ac
index 73c2fa4970..c113617f5e 100644
--- a/configure.ac
+++ b/configure.ac
@@ -856,6 +856,7 @@ src/lib-settings/Makefile
 src/lib-smtp/Makefile
 src/lib-ssl-iostream/Makefile
 src/lib-test/Makefile
+src/lib-regex/Makefile
 src/lib-storage/Makefile
 src/lib-storage/list/Makefile
 src/lib-storage/index/Makefile
diff --git a/src/Makefile.am b/src/Makefile.am
index c7532536f5..85064133ae 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -44,6 +44,7 @@ SUBDIRS = \
 	lib-dovecot \
 	$(LIB_LDAP) \
 	$(LIB_LUA) \
+	lib-regex \
 	lib-language \
 	lib-imap-client \
 	lib-imap-urlauth \
diff --git a/src/lib-regex/Makefile.am b/src/lib-regex/Makefile.am
new file mode 100644
index 0000000000..17ce730a1d
--- /dev/null
+++ b/src/lib-regex/Makefile.am
@@ -0,0 +1,41 @@
+AM_CPPFLAGS = \
+	-I$(top_srcdir)/src/lib \
+	-I$(top_srcdir)/src/lib-test \
+	-I$(top_srcdir)/src/lib-dict \
+	-I$(top_srcdir)/src/lib-doveadm \
+	-I$(top_srcdir)/src/lib-dns-client \
+	-I$(top_srcdir)/src/lib-http \
+	-I$(top_srcdir)/src/lib-ssl-iostream \
+	-I$(top_srcdir)/src/lib-settings \
+	-I$(top_srcdir)/src/lib-master \
+	-I$(top_srcdir)/src/lib-var-expand \
+	$(LIBPCRE_CFLAGS)
+
+headers = \
+	dregex.h
+
+pkginc_libdir=$(pkgincludedir)
+pkginc_lib_HEADERS = $(headers)
+
+noinst_LTLIBRARIES=libdregex.la
+
+if BUILD_LIBREGEX
+libdregex_la_SOURCES = regex.c
+libdregex_la_LIBADD = $(LIBPCRE_LIBS)
+
+EXTRA_DIST =
+
+test_programs = test-regex
+
+noinst_PROGRAMS = $(test_programs)
+
+test_regex_SOURCES = test-regex.c
+test_regex_LDADD = libdregex.la \
+	../lib-test/libtest.la \
+	../lib/liblib.la \
+	$(LIBPCRE_LIBS)
+test_regex_DEPENDENCIES = libdregex.la $(LIBPCRE_LIBS)
+
+else
+libdregex_la_SOURCES = empty.c
+endif
diff --git a/src/lib-regex/dregex.h b/src/lib-regex/dregex.h
new file mode 100644
index 0000000000..a40ab5cfb5
--- /dev/null
+++ b/src/lib-regex/dregex.h
@@ -0,0 +1,163 @@
+#ifndef DREGEX_H
+#define DREGEX_H 1
+
+enum dregex_flags {
+	/* Match only at the first position */
+	DREGEX_ANCHORED = BIT(0),
+	/* Do not create automatic capture groups */
+	DREGEX_NOSUB = BIT(1),
+	/* Case insensitive matching */
+	DREGEX_ICASE = BIT(2),
+	/* ^ and $ match newlines within data */
+	DREGEX_NEWLINE = BIT(3),
+	/* Subject string is not the beginning of a line */
+	DREGEX_NOTBOL = BIT(4),
+	/* Subject string is not the end of a line */
+	DREGEX_NOTEOL = BIT(5),
+	/* Reject non-ascii strings */
+	DREGEX_ASCII_ONLY = BIT(6),
+	/* Extended regular expression, skip whitespace and ignore comments,
+	 * see https://www.pcre.org/current/doc/html/pcre2api.html */
+	DREGEX_EXTENDED = BIT(7),
+	/* Skip empty match groups */
+	DREGEX_NO_EMPTY_SUB = BIT(8),
+
+	/* Perform global replace */
+	DREGEX_REPLACE_ALL = BIT(9),
+	/* Replacement string is literal */
+	DREGEX_REPLACE_LITERAL = BIT(10),
+};
+
+struct dregex_params {
+	unsigned int max_cpu_seconds; /* maximum execution time, 1s default */
+	unsigned int max_capture_groups; /* passed to pcre2_set_match_limit(), 100 default */
+	unsigned int max_depth; /* maximum stack depth, 100 default */
+};
+
+/* Matches the given regular expression pattern against the subject string.
+ *
+ * Both pattern and subject are converted to UCS4 internally, making this UTF-8 safe.
+ *
+ * Returns:
+ * - -1 on error (with error_r optionally set to an error message)
+ * - 0 if the pattern does not match
+ * - 1 if the pattern matches
+ */
+int dregex_match(const char *pattern, const char *subject, enum dregex_flags flags,
+		 const char **error_r);
+
+/* Same as dregex_match(), but if groups_r is not NULL the matched capture
+ * groups are also appended to it as UTF-8 strings. */
+int dregex_match_groups(const char *pattern, const char *subject, enum dregex_flags flags,
+			ARRAY_TYPE(const_string) *groups_r, const char **error_r);
+
+/* Performs a regular expression-based substitution on the subject string.
+ * Replaces matches of 'pattern' with 'replace' and stores the result in *result_r.
+ *
+ * Both pattern, subject and replace are converted to UCS4 internally, making this UTF-8 safe.
+ * The result is appended to result_r.
+ *
+ * Returns:
+ * - -1 on error (with error_r optionally set to an error message)
+ * - 0 if no substitution was performed (no match)
+ * - 1 if substitution was successful
+ */
+int dregex_replace(const char *pattern, const char *subject, const char *replace,
+		   string_t *result_r, enum dregex_flags flags,
+		   const char **error_r);
+
+struct dregex_code;
+
+/* Creates a new regular expression context. This context
+ * can be reused by calling code_compile again, which will
+ * clear the old pattern.
+ *
+ * dregex_code_create_params() uses the given limits instead of the
+ * defaults. Fields that are left as 0 use the default value.
+ */
+struct dregex_code *dregex_code_create(void);
+struct dregex_code *dregex_code_create_params(const struct dregex_params *params);
+
+/* Frees the regular expression context. */
+void dregex_code_free(struct dregex_code **_code);
+
+/* Compiles the given pattern into reusable code.
+ *
+ * Pattern is converted to UCS4 internally, making this UTF-8 safe.
+ */
+int dregex_code_compile(struct dregex_code *code, const char *pattern,
+			enum dregex_flags flags, const char **error_r);
+
+/* Exports the compiled pattern into the given buffer. */
+void dregex_code_export(const struct dregex_code *code, buffer_t *buffer);
+
+/* Imports a compiled pattern from the given buffer. */
+int dregex_code_import(struct dregex_code *code, const buffer_t *buffer,
+		       const char **error_r);
+
+/* Executes regex matching with capture groups using precompiled code.
+ * Same as dregex_match_groups().
+ *
+ * Subject is converted to UCS4 internally, making this UTF-8 safe.
+ *
+ * Groups are converted from UCS4 to UTF-8 internally.
+ */
+int dregex_code_match_groups(struct dregex_code *code, const char *subject,
+			     ARRAY_TYPE(const_string) *groups_r, const char **error_r);
+
+/* Executes regex matching using precompiled code.
+ * Same as dregex_match().
+ *
+ * Subject is converted to UCS4 internally, making this UTF-8 safe.
+ */
+int dregex_code_match(struct dregex_code *code, const char *subject,
+		      const char **error_r);
+
+/* Performs regex replacement using precompiled code, starting at given offset.
+ * Same as dregex_replace(). The offset is counted in UCS4 characters, not bytes.
+ *
+ * Subject and replacement are converted to UCS4 internally, making this UTF-8 safe.
+ * The result is appended to result_r.
+ */
+int dregex_code_replace_full(struct dregex_code *code,
+			     const char *subject, size_t startoffset,
+			     const char *replacement,
+			     string_t *result_r, enum dregex_flags flags,
+			     const char **error_r);
+
+/* Performs regex replacement using precompiled code.
+ * Same as dregex_replace().
+ *
+ * Subject is converted to UCS4 internally, making this UTF-8 safe.
+ * The result is appended to result_r.
+ */
+int dregex_code_replace(struct dregex_code *code,
+			const char *subject, const char *replacement,
+			string_t *result_r, enum dregex_flags flags,
+			const char **error_r);
+
+#ifndef HAVE_LIBPCRE
+#  define NO_DREGEX_SUPPORT "Missing regular expression support"
+#  define NO_DREGEX_SUPPORT_CODE(error_r) \
+	({STMT_START { *(error_r) = NO_DREGEX_SUPPORT;} STMT_END; -1;})
+#  define dregex_match(pattern, subject, flags, error_r) NO_DREGEX_SUPPORT_CODE(error_r)
+#  define dregex_match_groups(pattern, subject, flags, groups_r, error_r) \
+	NO_DREGEX_SUPPORT_CODE(error_r)
+#  define dregex_replace(pattern, subject, replace, result_r, \
+	flags, error_r) NO_DREGEX_SUPPORT_CODE(error_r)
+#  define dregex_code_create() ({ NULL; })
+#  define dregex_code_create_params(params) ({ NULL; })
+#  define dregex_code_free(code)
+#  define dregex_code_compile(code, pattern, flags, error_r) NO_DREGEX_SUPPORT_CODE(error_r)
+#  define dregex_code_export(code, buffer)
+#  define dregex_code_import(code, buffer, error_r) NO_DREGEX_SUPPORT_CODE(error_r)
+#  define dregex_code_match_groups(code, subject, groups_r, error_r) \
+	NO_DREGEX_SUPPORT_CODE(error_r)
+#  define dregex_code_match(code, subject, error_r) NO_DREGEX_SUPPORT_CODE(error_r)
+#  define dregex_code_replace_full(code, subject, startoffset, replacement, result_r, \
+	flags, error_r) NO_DREGEX_SUPPORT_CODE(error_r)
+#  define dregex_code_replace(code, subject, replacement, result_r, flags, error_r) \
+	NO_DREGEX_SUPPORT_CODE(error_r)
+#endif
+
+#endif
diff --git a/src/lib-regex/empty.c b/src/lib-regex/empty.c
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/src/lib-regex/regex.c b/src/lib-regex/regex.c
new file mode 100644
index 0000000000..b8be11afa1
--- /dev/null
+++ b/src/lib-regex/regex.c
@@ -0,0 +1,547 @@
+/* Copyright (C) 2025 Dovecot authors, see the included COPYING file */
+
+#include "lib.h"
+#include "array.h"
+#include "buffer.h"
+#include "cpu-limit.h"
+#include "str.h"
+#include "unichar.h"
+#include "dregex.h"
+
+#ifdef HAVE_LIBPCRE
+
+#define PCRE2_CODE_UNIT_WIDTH 32
+#include "pcre2.h"
+
+#define DREGEX_MAX_DEPTH 100
+#define DREGEX_MAX_MATCHES 100
+#define DREGEX_MAX_CPU_SECONDS 1
+
+struct dregex_code {
+	pool_t pool;
+
+	pcre2_compile_context *cctx;
+	pcre2_general_context *gctx;
+	pcre2_match_context *mctx;
+	pcre2_code *pat;
+
+	struct cpu_limit *climit;
+
+	unsigned int max_depth;
+	unsigned int max_cpu_seconds;
+	unsigned int max_capture_groups;
+
+	enum dregex_flags flags;
+};
+
+/* Memory callbacks for PCRE2: everything is allocated from code->pool */
+static void *dregex_code_int_malloc(size_t amt, void *_ctx)
+{
+	struct dregex_code *ctx = _ctx;
+	return p_malloc(ctx->pool, amt);
+}
+
+static void dregex_code_int_free(void *ptr, void *_ctx)
+{
+	struct dregex_code *ctx = _ctx;
+	p_free(ctx->pool, ptr);
+}
+
+/* Callout that aborts matching once the CPU time limit has been exceeded */
+static int dregex_code_callout(pcre2_callout_block *block ATTR_UNUSED, void *ctx)
+{
+	struct dregex_code *code = ctx;
+	if (cpu_limit_exceeded(code->climit))
+		return PCRE2_ERROR_PATTERN_TOO_COMPLICATED;
+	return 0;
+}
+
+static int
+dregex_code_substitute_callout(pcre2_substitute_callout_block *block ATTR_UNUSED, void *ctx)
+{
+	return dregex_code_callout(NULL, ctx);
+}
+
+/* Compile recursion guard: rejects patterns nested deeper than max_depth */
+static int dregex_code_guard(uint depth, void *ctx)
+{
+	struct dregex_code *code = ctx;
+	if (code->max_depth < depth)
+		return PCRE2_ERROR_DEPTHLIMIT;
+	return 0;
+}
+
+/* Creates the PCRE2 contexts and installs the callbacks and limits */
+static void dregex_code_init(struct dregex_code *code)
+{
+	code->gctx = pcre2_general_context_create(dregex_code_int_malloc,
+						  dregex_code_int_free, code);
+	code->cctx = pcre2_compile_context_create(code->gctx);
+	code->mctx = pcre2_match_context_create(code->gctx);
+
+	pcre2_set_compile_recursion_guard(code->cctx, dregex_code_guard, code);
+	/* these are used to ensure that CPU time isn't exceeded */
+	pcre2_set_callout(code->mctx, dregex_code_callout, code);
+	pcre2_set_substitute_callout(code->mctx, dregex_code_substitute_callout, code);
+
+	/* Set some limits */
+	pcre2_set_match_limit(code->mctx, code->max_capture_groups);
+	pcre2_set_depth_limit(code->mctx, code->max_depth);
+}
+
+/* Creates a new regex context using the given limits. Fields that are left
+   as 0 fall back to the built-in defaults. */
+struct dregex_code *dregex_code_create_params(const struct dregex_params *params)
+{
+	i_assert(params != NULL);
+
+	pool_t pool = pool_allocfree_create("regex pool");
+	struct dregex_code *code = p_new(pool, struct dregex_code, 1);
+	code->pool = pool;
+	code->max_capture_groups = params->max_capture_groups != 0 ?
+		params->max_capture_groups : DREGEX_MAX_MATCHES;
+	code->max_cpu_seconds = params->max_cpu_seconds != 0 ?
+		params->max_cpu_seconds : DREGEX_MAX_CPU_SECONDS;
+	code->max_depth = params->max_depth != 0 ?
+		params->max_depth : DREGEX_MAX_DEPTH;
+	dregex_code_init(code);
+	return code;
+}
+
+static const struct dregex_params default_params = {
+	.max_depth = DREGEX_MAX_DEPTH,
+	.max_cpu_seconds = DREGEX_MAX_CPU_SECONDS,
+	.max_capture_groups = DREGEX_MAX_MATCHES,
+};
+
+struct dregex_code *dregex_code_create(void)
+{
+	/* dregex_code_create_params() already initializes the PCRE2 contexts,
+	   initializing them again here would orphan the first set. */
+	return dregex_code_create_params(&default_params);
+}
+
+static const PCRE2_SPTR empty_str = U"";
+
+/* Converts input into UCS4 code units allocated from the data stack.
+   With refuse_non_ascii each input byte is mapped to one code unit as-is.
+   Returns 0 on success, -1 if uni_utf8_to_ucs4() rejects the input. */
+static int convert_to_sptr(const char *input, PCRE2_SPTR *out_r, PCRE2_SIZE *len_r,
+			   bool refuse_non_ascii)
+{
+	if (*input == '\0') {
+		*len_r = 0;
+		*out_r = empty_str;
+		return 0;
+	}
+	ARRAY_TYPE(unichars) chars;
+	t_array_init(&chars, 128);
+	if (refuse_non_ascii) {
+		/* treat everything as ascii */
+		for (; *input != '\0'; input++) {
+			unichar_t chr = (unsigned char)*input;
+			array_push_back(&chars, &chr);
+		}
+	} else if (uni_utf8_to_ucs4(input, &chars) < 0)
+		return -1;
+	*len_r = array_count(&chars);
+	if (*len_r == 0)
+		*out_r = empty_str;
+	else
+		*out_r = array_idx(&chars, 0);
+	return 0;
+}
+
+/* Converts a PCRE2 error code into an error string allocated from the data
+   stack and stores it in *error_r. Out of memory is fatal. Always returns -1. */
+static int handle_error(int ret, const char *func, const char **error_r)
+{
+	PCRE2_UCHAR buf[256];
+	if (ret == PCRE2_ERROR_NOMEMORY)
+		i_fatal_status(FATAL_OUTOFMEM, "%s(): Out of memory", func);
+	/* The buffer length is given in code units, not in bytes */
+	int rc = pcre2_get_error_message(ret, buf, N_ELEMENTS(buf));
+	if (rc == PCRE2_ERROR_BADDATA) {
+		/* PCRE2 does not recognize the error code */
+		*error_r = t_strdup_printf("Unknown error %d occured", ret);
+	} else if (rc < 0) {
+		/* e.g. the message did not fit into the buffer */
+		*error_r = t_strdup_printf("Unknown error %d occured while handling %d",
+					   rc, ret);
+	} else {
+		/* rc is the length of the message in code units */
+		buffer_t *output = t_buffer_create(rc);
+		uni_ucs4_to_utf8(buf, rc, output);
+		*error_r = str_c(output);
+	}
+	return -1;
+}
+#define handle_error(ret, error_r) handle_error((ret), __func__, (error_r))
+
+int dregex_code_compile(struct dregex_code *code, const char *pattern,
+			enum dregex_flags flags, const char **error_r)
+{
+	i_assert(code != NULL);
+	i_assert(pattern != NULL);
+	int errcode;
+	PCRE2_SIZE erroffset;
+
+	if (code->pat != NULL) {
+		pcre2_code_free(code->pat);
+		code->pat = NULL;
+		code->flags = 0;
+	}
+
+	uint options = PCRE2_AUTO_CALLOUT |
+		PCRE2_NEVER_BACKSLASH_C | PCRE2_NO_UTF_CHECK;
+
+	if (HAS_ALL_BITS(flags, DREGEX_ICASE))
+		options |= PCRE2_CASELESS;
+	if (HAS_ALL_BITS(flags, DREGEX_NOSUB))
+		options |= PCRE2_NO_AUTO_CAPTURE;
+	if (HAS_ALL_BITS(flags, DREGEX_NEWLINE))
+		options |= PCRE2_MULTILINE;
+	if (HAS_ALL_BITS(flags, DREGEX_ANCHORED))
+		options |= PCRE2_ANCHORED;
+	if (HAS_ALL_BITS(flags, DREGEX_EXTENDED))
+		options |= PCRE2_EXTENDED;
+
+	bool refuse_non_ascii = HAS_ALL_BITS(flags, DREGEX_ASCII_ONLY);
+	code->flags = flags;
+
+	/* Use Unicode properties for character matching */
+	if (!refuse_non_ascii)
+		options |= (PCRE2_UCP | PCRE2_UTF);
+	else
+		options |= PCRE2_NEVER_UTF;
+
+	T_BEGIN {
+		PCRE2_SIZE slen;
+		PCRE2_SPTR32 pattern32;
+		if (convert_to_sptr(pattern, &pattern32, &slen, refuse_non_ascii) < 0) {
+			errcode = PCRE2_ERROR_BADDATA;
+			code->pat = NULL;
+		} else {
+			code->pat = pcre2_compile(pattern32, slen, options, &errcode,
+						  &erroffset, code->cctx);
+		}
+	} T_END;
+
+	i_assert(code->pat != NULL || errcode != 0);
+
+	if (code->pat == NULL)
+		return handle_error(errcode, error_r);
+
+	return 0;
+}
+
+void dregex_code_export(const struct dregex_code *code, buffer_t *buffer)
+{
+	PCRE2_SIZE size;
+	uint8_t *bytes;
+
+	const pcre2_code *codes[] = {
+		code->pat,
+	};
+
+	int ret = pcre2_serialize_encode(codes, N_ELEMENTS(codes), &bytes,
+					 &size, code->gctx);
+	if (ret < 0) {
+		const char *error;
+		(void)handle_error(ret, &error);
+		i_panic("BUG: dregex_code_export(): %s", error);
+	}
+
+	/* There must be only one pattern */
+	i_assert(ret == 1);
+
+	buffer_append(buffer, bytes, size);
+	pcre2_serialize_free(bytes);
+}
+
+int dregex_code_import(struct dregex_code *code, const buffer_t *buffer,
+		       const char **error_r)
+{
+	i_assert(code != NULL);
+	i_assert(buffer != NULL);
+
+	/* Release any previously compiled or imported pattern, otherwise
+	   it would be overwritten below and never freed. */
+	if (code->pat != NULL) {
+		pcre2_code_free(code->pat);
+		code->pat = NULL;
+		code->flags = 0;
+	}
+
+	int ret = pcre2_serialize_decode(&code->pat, 1, buffer->data,
+					 code->gctx);
+	if (ret < 0)
+		return handle_error(ret, error_r);
+	i_assert(ret > 0);
+
+	return 0;
+}
+
+static const char *empty_match_str = "";
+
+/* Appends the first 'count' groups of the match (group 0 is the whole
+   match) to groups_r as UTF-8 strings allocated from the data stack. */
+static void extract_matches(uint count, pcre2_match_data *mdata,
+			    bool skip_empty, ARRAY_TYPE(const_string) *groups_r)
+{
+	/* we don't actually want matches */
+	if (groups_r == NULL)
+		return;
+	for (uint i = 0; i < count; i++) {
+		PCRE2_UCHAR32 *buf;
+		PCRE2_SIZE bsize;
+		int rc = pcre2_substring_length_bynumber(mdata, i, &bsize);
+		if (rc == PCRE2_ERROR_NOSUBSTRING)
+			break;
+		else if (rc == PCRE2_ERROR_UNSET) {
+			if (!skip_empty)
+				array_push_back(groups_r, &empty_match_str);
+			continue;
+		} else if (rc == PCRE2_ERROR_UNAVAILABLE)
+			continue;
+		pcre2_substring_get_bynumber(mdata, i, &buf, &bsize);
+		buffer_t *output = t_buffer_create(bsize);
+		uni_ucs4_to_utf8(buf, bsize, output);
+		pcre2_substring_free(buf);
+		const char *substr = str_c(output);
+		array_push_back(groups_r, &substr);
+	}
+}
+
+static int dregex_code_match_int(struct dregex_code *code, const char *subject,
+				 pcre2_match_data *mdata, const char **error_r)
+{
+	i_assert(code != NULL);
+	i_assert(code->pat != NULL);
+	i_assert(subject != NULL);
+
+	PCRE2_SIZE slen;
+	PCRE2_SPTR subject32;
+
+	bool refuse_non_ascii = HAS_ALL_BITS(code->flags, DREGEX_ASCII_ONLY);
+	if (convert_to_sptr(subject, &subject32, &slen, refuse_non_ascii) < 0)
+		return handle_error(PCRE2_ERROR_BADDATA, error_r);
+
+	/* Empty string is not a match */
+	uint options = PCRE2_NOTEMPTY;
+
+	if (HAS_ALL_BITS(code->flags, DREGEX_NOTBOL))
+		options |= PCRE2_NOTBOL;
+	if (HAS_ALL_BITS(code->flags, DREGEX_NOTEOL))
+		options |= PCRE2_NOTEOL;
+	if (HAS_ALL_BITS(code->flags, DREGEX_ANCHORED))
+		options |= PCRE2_ANCHORED;
+
+	code->climit = cpu_limit_init(code->max_cpu_seconds, CPU_LIMIT_TYPE_ALL);
+	int ret = pcre2_match(code->pat, subject32, slen, 0, options,
+			      mdata, code->mctx);
+	cpu_limit_deinit(&code->climit);
+
+	if (ret == PCRE2_ERROR_NOMATCH) {
+		/* did not match */
+		ret = 0;
+	} else if (ret < 0) {
+		return handle_error(ret, error_r);
+	}
+
+	return ret;
+}
+
+int dregex_code_match_groups(struct dregex_code *code, const char *subject,
+			     ARRAY_TYPE(const_string) *groups_r, const char **error_r)
+{
+	i_assert(code != NULL);
+	i_assert(code->pat != NULL);
+	int ret;
+
+	/* No data stack frame here: the strings added to groups_r (and
+	   possibly the array itself) are allocated from the caller's data
+	   stack frame and must stay valid after returning. */
+	pcre2_match_data *mdata =
+		pcre2_match_data_create_from_pattern(code->pat, code->gctx);
+	ret = dregex_code_match_int(code, subject, mdata, error_r);
+	if (ret > 1) {
+		bool skip_empty = HAS_ALL_BITS(code->flags, DREGEX_NO_EMPTY_SUB);
+		/* ret is number of groups */
+		extract_matches((uint32_t)ret, mdata, skip_empty, groups_r);
+		ret = 1;
+	}
+	pcre2_match_data_free(mdata);
+	return ret;
+}
+
+int dregex_code_match(struct dregex_code *code, const char *subject,
+		      const char **error_r)
+{
+	int ret;
+
+	T_BEGIN {
+		ret = dregex_code_match_groups(code, subject, NULL, error_r);
+	} T_END_PASS_STR_IF(ret < 0, error_r);
+	return ret;
+}
+
+int dregex_code_replace_full(struct dregex_code *code,
+			     const char *subject, size_t startoffset,
+			     const char *replacement, string_t *result_r,
+			     enum dregex_flags flags, const char **error_r)
+{
+	i_assert(code != NULL);
+	i_assert(code->pat != NULL);
+	i_assert(subject != NULL);
+	i_assert(replacement != NULL);
+	i_assert(result_r != NULL);
+
+	uint options = PCRE2_NOTEMPTY;
+	if (HAS_ALL_BITS(flags, DREGEX_ANCHORED))
+		options |= PCRE2_ANCHORED;
+	if (HAS_ALL_BITS(flags, DREGEX_REPLACE_ALL))
+		options |= PCRE2_SUBSTITUTE_GLOBAL;
+	if (HAS_ALL_BITS(flags, DREGEX_REPLACE_LITERAL))
+		options |= PCRE2_SUBSTITUTE_LITERAL;
+
+	PCRE2_UCHAR *result32 = U"";
+	PCRE2_UCHAR *result_buf = NULL;
+	PCRE2_SIZE result_len = 0;
+
+	int ret;
+	bool refuse_non_ascii = HAS_ALL_BITS(flags, DREGEX_ASCII_ONLY) ||
+				HAS_ALL_BITS(code->flags, DREGEX_ASCII_ONLY);
+
+	T_BEGIN do {
+		PCRE2_SIZE slen;
+		PCRE2_SPTR subject32;
+		PCRE2_SIZE rlen;
+		PCRE2_SPTR replacement32;
+
+		if (convert_to_sptr(subject, &subject32, &slen, refuse_non_ascii) < 0 ||
+		    convert_to_sptr(replacement, &replacement32, &rlen, refuse_non_ascii) < 0) {
+			ret = PCRE2_ERROR_BADDATA;
+			break;
+		}
+
+		pcre2_match_data *mdata =
+			pcre2_match_data_create_from_pattern(code->pat, code->gctx);
+
+		code->climit = cpu_limit_init(code->max_cpu_seconds,
+					      CPU_LIMIT_TYPE_ALL);
+		ret = pcre2_substitute(code->pat, subject32, slen, startoffset,
+				       options|PCRE2_SUBSTITUTE_OVERFLOW_LENGTH,
+				       mdata, code->mctx, replacement32, rlen,
+				       result32, &result_len);
+		cpu_limit_deinit(&code->climit);
+		/* Ignore NOMEMORY error here, it's because we asked how long
+		   the result would be. */
+		if (ret != PCRE2_ERROR_NOMEMORY && ret < 0) {
+			pcre2_match_data_free(mdata);
+			break;
+		}
+
+		if (result_len > 0) {
+			/* Allocated outside the data stack, because the result
+			   is still needed after the frame below has ended. */
+			result_buf = i_new(PCRE2_UCHAR, result_len);
+			result32 = result_buf;
+		}
+
+		/* Run it again as we know the buffer size now */
+		code->climit = cpu_limit_init(code->max_cpu_seconds,
+					      CPU_LIMIT_TYPE_ALL);
+		ret = pcre2_substitute(code->pat, subject32, slen, startoffset, options,
+				       mdata, code->mctx, replacement32, rlen,
+				       result32, &result_len);
+		cpu_limit_deinit(&code->climit);
+		pcre2_match_data_free(mdata);
+	} while(0); T_END;
+
+	if (ret > 0)
+		uni_ucs4_to_utf8(result32, result_len, result_r);
+	i_free(result_buf);
+
+	if (ret < 0)
+		return handle_error(ret, error_r);
+	return ret > 0 ? 1 : 0;
+}
+
+int dregex_code_replace(struct dregex_code *code, const char *subject,
+			const char *replacement, string_t *result_r,
+			enum dregex_flags flags, const char **error_r)
+{
+	return dregex_code_replace_full(code, subject, 0, replacement, result_r,
+					flags, error_r);
+}
+
+void dregex_code_free(struct dregex_code **_code)
+{
+	struct dregex_code *code = *_code;
+	*_code = NULL;
+	if (code == NULL)
+		return;
+
+	if (code->pat != NULL)
+		pcre2_code_free(code->pat);
+	pcre2_match_context_free(code->mctx);
+	pcre2_compile_context_free(code->cctx);
+	pcre2_general_context_free(code->gctx);
+	pool_unref(&code->pool);
+}
+
+int dregex_match_groups(const char *pattern, const char *subject, enum dregex_flags flags,
+			ARRAY_TYPE(const_string) *groups_r, const char **error_r)
+{
+	struct dregex_code *code = dregex_code_create();
+	int ret;
+
+	T_BEGIN {
+		ret = dregex_code_compile(code, pattern, flags, error_r);
+	} T_END_PASS_STR_IF(ret < 0, error_r);
+
+	if (ret >= 0) {
+		/* Matching is done outside the data stack frame, so that
+		   the strings added to groups_r stay valid for the caller. */
+		ret = dregex_code_match_groups(code, subject, groups_r,
+					       error_r);
+	}
+	dregex_code_free(&code);
+
+	return ret;
+}
+
+int dregex_match(const char *pattern, const char *subject, enum dregex_flags flags,
+		 const char **error_r)
+{
+	int ret;
+
+	T_BEGIN {
+		ret = dregex_match_groups(pattern, subject, flags, NULL, error_r);
+	} T_END_PASS_STR_IF(ret < 0, error_r);
+	return ret;
+}
+
+int dregex_replace(const char *pattern, const char *subject, const char *replace,
+		   string_t *result_r, enum dregex_flags flags,
+		   const char **error_r)
+{
+	struct dregex_code *code = dregex_code_create();
+	int ret;
+
+	T_BEGIN {
+		ret = dregex_code_compile(code, pattern, flags, error_r);
+	} T_END_PASS_STR_IF(ret < 0, error_r);
+
+	if (ret >= 0) {
+		ret = dregex_code_replace(code, subject, replace, result_r,
+					  flags, error_r);
+	}
+
+	dregex_code_free(&code);
+
+	return ret;
+}
+
+#endif
diff --git a/src/lib-regex/test-regex.c b/src/lib-regex/test-regex.c
new file mode 100644
index 0000000000..ef2d3bd8cf
--- /dev/null
+++ b/src/lib-regex/test-regex.c
@@ -0,0 +1,306 @@
+/* Copyright (C) 2025 Dovecot authors, see the included COPYING file */
+
+#include "lib.h"
+#include "test-common.h"
+#include "array.h"
+#include "str.h"
+#include "dregex.h"
+
+#ifdef HAVE_LIBPCRE
+
+static const bool debug = FALSE;
+
+struct test_case {
+	const char *subject;
+	const char *pattern;
+	const char *replacement;
+	const char *result;
+	const char *error;
+	enum dregex_flags flags;
+	int compile_ret;
+	int match_ret;
+};
+
+static void run_match_tests(const struct test_case *cases)
+{
+	unsigned int idx;
+	struct dregex_code *code = dregex_code_create();
+
+	for(idx = 0; cases[idx].pattern != NULL; idx++) {
+		const char *error = NULL;
+		const struct test_case *test = &cases[idx];
+
+		if (debug) {
+			i_debug("pattern = %s, subject = %s", test->pattern,
+				test->subject);
+		}
+
+		/* compile pattern */
+		int ret = dregex_code_compile(code, test->pattern, test->flags,
+					      &error);
+		test_assert_cmp_idx(test->compile_ret, ==, ret, idx);
+		if (test->compile_ret < 0) {
+			test_assert_strcmp_idx(test->error, error, idx);
+			continue;
+		} else if (ret < 0) {
+			error = t_strdup_printf("Unexpected error: %s", error);
+			test_assert_failed_idx(error, __FILE__, __LINE__, idx);
+			continue;
+		}
+
+		ret = dregex_code_match(code, test->subject, &error);
+
+		test_assert_cmp_idx(test->match_ret, ==, ret, idx);
+		if (test->match_ret < 0)
+			test_assert_strcmp_idx(test->error, error, idx);
+		else if (ret < 0) {
+			error = t_strdup_printf("Unexpected error: %s", error);
+			test_assert_failed_idx(error, __FILE__, __LINE__, idx);
+			continue;
+		}
+	}
+	dregex_code_free(&code);
+}
+
+#define MATCH_CASE_FULL(pat, sub, err, cret, mret) \
+	{ \
+		.pattern = (pat), \
+		.subject = (sub), \
+		.replacement = NULL, \
+		.result = NULL, \
+		.error = (err), \
+		.compile_ret = (cret), \
+		.match_ret = (mret) \
+	}
+#define MATCH_CASE(pattern, subject) MATCH_CASE_FULL(pattern, subject, NULL, 0, 1)
+#define MATCH_CASE_END { .pattern = NULL }
+
+#define STR(x) x
+#define REP(x) STR(x) STR(x) STR(x) STR(x) STR(x) STR(x) STR(x) STR(x) STR(x) STR(x)
+#define REP10(x) REP(x) REP(x) REP(x) REP(x) REP(x) REP(x) REP(x) REP(x) REP(x) REP(x)
+
+static void test_dregex_match(void)
+{
+	const struct test_case cases[] = {
+		/* simple test case */
+		MATCH_CASE(".*", "hello world"),
+		/* .* could match an empty string, but empty matches are rejected */
+		MATCH_CASE_FULL(".*", "", NULL, 0, 0),
+		/* for the same reason an empty pattern does not match either */
+		MATCH_CASE_FULL("", "", NULL, 0, 0),
+		/* Match any single character except newline. */
+		MATCH_CASE(".", "a"),
+		MATCH_CASE_FULL(".", "\n", NULL, 0, 0),
+		/* Bracket expression. Match any one of the enclosed
+		   characters. A hyphen (-) indicates a range of
+		   consecutive characters. */
+		MATCH_CASE("[a-z]", "a"),
+		MATCH_CASE_FULL("[a-z]", "A", NULL, 0, 0),
+		/* Negated bracket expression. */
+		MATCH_CASE("[^a-z]", "A"),
+		MATCH_CASE_FULL("[^a-z]", "a", NULL, 0, 0),
+		/* Character class */
+		MATCH_CASE("^[[:alnum:]]+$", "abc123"),
+		MATCH_CASE_FULL("^[[^:alnum:]]+$", "abc123", NULL, 0, 0),
+		/* Unicode properties */
+		MATCH_CASE("^\\p{L}$", "\xc3\xab"),
+		MATCH_CASE("^\\pL$", "\xc3\xab"),
+		/* Quantifiers */
+		MATCH_CASE("^.$", "h"),
+		MATCH_CASE("^.{2}$", "he"),
+		MATCH_CASE("^.{2,3}$", "he"),
+		MATCH_CASE("^.{2,3}$", "hel"),
+		MATCH_CASE("^.+$", "hello"),
+		MATCH_CASE_FULL("^.+$", "", NULL, 0, 0),
+		/* Alternation and grouping */
+		MATCH_CASE("^(hello|world)$", "hello"),
+		MATCH_CASE("^(hello|world)$", "world"),
+		MATCH_CASE_FULL("^(hello|world)$", "hi", NULL, 0, 0),
+		/* test that we can find 'mojiretsu' (test string) from
+		   'Kore wa tesuto mojiretsudesu.' (this is a test string) */
+		MATCH_CASE(
+			"\xe6\x96\x87\xe5\xad\x97\xe5\x88\x97",
+			"\xe3\x81\x93\xe3\x82\x8c\xe3\x81\xaf\xe3\x83\x86\xe3"
+			"\x82\xb9\xe3\x83\x88\xe6\x96\x87\xe5\xad\x97\xe5\x88"
+			"\x97\xe3\x81\xa7\xe3\x81\x99\xe3\x80\x82"
+		),
+		/* test that we can find U+F160 'A' from U+F160 '0' U+F160 'A' */
+		MATCH_CASE("\xef\x85\xa0""A", "\xef\x85\xa0""0\xef\x85\xa0""A"),
+		/* binary matching */
+		{
+			.pattern = "\xef\x85\xa0""A",
+			.subject = "\xef\x85\xa0""0\xef\x85\xa0""A",
+			.error = "",
+			.flags = DREGEX_ASCII_ONLY,
+			.compile_ret = 0,
+			.match_ret = 1,
+		},
+		{
+			.pattern = ".*",
+			.subject = "\xef\x85\xa0""0\xef\x85\xa0""A",
+			.error = "",
+			.flags = DREGEX_ASCII_ONLY,
+			.compile_ret = 0,
+			.match_ret = 1,
+		},
+		/* invalid utf-8 */
+		MATCH_CASE_FULL(".*", "\xc2\xc2", "bad data value", 0, -1),
+		/* two evil patterns */
+		MATCH_CASE_FULL(
+			"^([a-zA-Z0-9])(([\\-.]|[_]+)?([a-zA-Z0-9]+))*(@)"
+			"{1}[a-z0-9]+[.]{1}(([a-z]{2,3})|([a-z]{2,3}[.]"
+			"{1}[a-z]{2,3}))$",
+			"thisisabstractly.andtotally.long.email@"
+			REP10("a") "." REP10("a") "." REP10("a")
+			".has",
+			"match limit exceeded",
+			0,
+			-1
+		),
+		MATCH_CASE_FULL(
+			"(a|a?)+",
+			REP10("a") REP10("a"),
+			"match limit exceeded",
+			0,
+			-1
+		),
+		/* IEEE.1003-2.1992 */
+		MATCH_CASE("me(\\+.*)?@company\\.com",
+			   "me+hello@company.com"),
+		MATCH_CASE("^[^[:lower:]]+$", "HELLO"),
+		MATCH_CASE_FULL(
+			"^[^[:lower:]]+$",
+			"hello",
+			NULL,
+			0,
+			0
+		),
+		MATCH_CASE("<(.*)@", "<acme-users@example.com>"),
+		MATCH_CASE("^\\[(.*)\\] (.*)$", "[acme-users] [fwd]: hello, world"),
+		MATCH_CASE_END
+	};
+
+	test_begin("matching");
+
+	run_match_tests(cases);
+
+	test_end();
+}
+
+static void run_replace_tests(const struct test_case *cases)
+{
+	unsigned int idx;
+	struct dregex_code *code = dregex_code_create();
+	string_t *dest = t_str_new(32);
+
+	for(idx = 0; cases[idx].pattern != NULL; idx++) {
+		const char *error = NULL;
+		const struct test_case *test = &cases[idx];
+		str_truncate(dest, 0);
+
+		if (debug) {
+			i_debug("pattern = %s, subject = %s, "
+				"replacement = %s, result = %s",
+				test->pattern, test->subject,
+				test->replacement, test->result);
+		}
+
+		/* compile pattern */
+		int ret = dregex_code_compile(code, test->pattern, test->flags,
+					      &error);
+		test_assert_cmp_idx(test->compile_ret, ==, ret, idx);
+		if (test->compile_ret < 0) {
+			test_assert_strcmp_idx(test->error, error, idx);
+			continue;
+		} else if (ret < 0) {
+			error = t_strdup_printf("Unexpected error: %s", error);
+			test_assert_failed_idx(error, __FILE__, __LINE__, idx);
+			continue;
+		}
+
+		ret = dregex_code_replace(code, test->subject, test->replacement,
+					  dest, test->flags, &error);
+
+		test_assert_cmp_idx(test->match_ret, ==, ret, idx);
+		if (test->match_ret < 0) {
+			test_assert_strcmp_idx(test->error, error, idx);
+			continue;
+		} else if (ret < 0) {
+			error = t_strdup_printf("Unexpected error: %s", error);
+			test_assert_failed_idx(error, __FILE__, __LINE__, idx);
+			continue;
+		}
+		test_assert_strcmp_idx(test->result, str_c(dest), idx);
+	}
+	dregex_code_free(&code);
+}
+
+#define REP_CASE_FULL(pat, sub, rep, res, err, cret, mret) \
+	{ \
+		.pattern = (pat), \
+		.subject = (sub), \
+		.replacement = (rep), \
+		.result = (res), \
+		.error = (err), \
+		.compile_ret = (cret), \
+		.match_ret = (mret) \
+	}
+#define REP_CASE(pattern, subject, replacement, result) \
+	REP_CASE_FULL(pattern, subject, replacement, result, NULL, 0, 1)
+#define REP_CASE_END { .pattern = NULL }
+
+static void test_dregex_replace(void)
+{
+	const struct test_case cases[] = {
+		/* simple replacement */
+		REP_CASE(".*", "hello world", "world hello", "world hello"),
+		/* simple swap */
+		REP_CASE("(.*) (.*)", "hello world", "$2 $1", "world hello"),
+		/* partial replace */
+		REP_CASE("hello .*", "hello world", "$0", "hello world"),
+		/* simple utf-8 test,
+		 * swaps two words that both start with U+F160 */
+		REP_CASE(
+			"(.*) (.*)",
+			"\xef\x85\xa0""0 \xef\x85\xa0""A",
+			"$2 $1",
+			"\xef\x85\xa0""A \xef\x85\xa0""0"
+		),
+		/* Invalid back reference */
+		REP_CASE_FULL(
+			"hello .*",
+			"hello world",
+			"$5",
+			"",
+			"unknown substring",
+			0,
+			-1
+		),
+		REP_CASE_END
+	};
+
+	test_begin("replacing");
+
+	run_replace_tests(cases);
+
+	test_end();
+}
+
+int main(void)
+{
+	void (*const tests[])(void) = {
+		test_dregex_match,
+		test_dregex_replace,
+		NULL
+	};
+
+	return test_run(tests);
+}
+
+#else
+
+int main(void) {
+	return 0;
+}
+
+#endif
diff --git a/src/lib/lib-event.c b/src/lib/lib-event.c
index cc1b7d71e5..1d5220222e 100644
--- a/src/lib/lib-event.c
+++ b/src/lib/lib-event.c
@@ -133,6 +133,8 @@ event_call_callbacks(struct event *event, enum event_callback_type type,
 
 	if (event->disable_callbacks)
 		return TRUE;
+	if (!array_is_created(&event_handlers))
+		return TRUE;
 
 	array_foreach_elem(&event_handlers, callback) {
 		bool ret;
-- 
2.47.3