]> git.ipfire.org Git - thirdparty/dovecot/core.git/commitdiff
lib-regex: Add regex matching library
authorAki Tuomi <aki.tuomi@open-xchange.com>
Wed, 23 Jul 2025 10:06:43 +0000 (13:06 +0300)
committerAki Tuomi <aki.tuomi@open-xchange.com>
Fri, 29 Aug 2025 08:42:35 +0000 (11:42 +0300)
configure.ac
src/Makefile.am
src/lib-regex/Makefile.am [new file with mode: 0644]
src/lib-regex/dregex.h [new file with mode: 0644]
src/lib-regex/empty.c [new file with mode: 0644]
src/lib-regex/regex.c [new file with mode: 0644]
src/lib-regex/test-regex.c [new file with mode: 0644]
src/lib/lib-event.c

index 73c2fa4970490745142362a461390b0be4633334..c113617f5e7acab0791950a1eb1e8686f3395b53 100644 (file)
@@ -856,6 +856,7 @@ src/lib-settings/Makefile
 src/lib-smtp/Makefile
 src/lib-ssl-iostream/Makefile
 src/lib-test/Makefile
+src/lib-regex/Makefile
 src/lib-storage/Makefile
 src/lib-storage/list/Makefile
 src/lib-storage/index/Makefile
index c7532536f5ed6e40db8e62fb8b29c4086c86e3e6..85064133ae02d8b124131a3be1261d57675839b6 100644 (file)
@@ -44,6 +44,7 @@ SUBDIRS = \
        lib-dovecot \
        $(LIB_LDAP) \
        $(LIB_LUA) \
+       lib-regex \
        lib-language \
        lib-imap-client \
        lib-imap-urlauth \
diff --git a/src/lib-regex/Makefile.am b/src/lib-regex/Makefile.am
new file mode 100644 (file)
index 0000000..17ce730
--- /dev/null
@@ -0,0 +1,41 @@
+# Header search paths used when compiling the sources in this directory,
+# followed by the compiler flags required by the PCRE library.
+AM_CPPFLAGS = \
+       -I$(top_srcdir)/src/lib \
+       -I$(top_srcdir)/src/lib-test \
+       -I$(top_srcdir)/src/lib-dict \
+       -I$(top_srcdir)/src/lib-doveadm \
+       -I$(top_srcdir)/src/lib-dns-client \
+       -I$(top_srcdir)/src/lib-http \
+       -I$(top_srcdir)/src/lib-ssl-iostream \
+       -I$(top_srcdir)/src/lib-settings \
+       -I$(top_srcdir)/src/lib-master \
+       -I$(top_srcdir)/src/lib-var-expand \
+       $(LIBPCRE_CFLAGS)
+
+# Public header, installed into $(pkgincludedir).
+headers = \
+       dregex.h
+
+pkginc_libdir=$(pkgincludedir)
+pkginc_lib_HEADERS = $(headers)
+
+# The library is always built, but it is not installed on its own.
+noinst_LTLIBRARIES=libdregex.la
+
+# With BUILD_LIBREGEX the real implementation and its unit test program are
+# built; otherwise the library consists of the empty placeholder source only.
+if BUILD_LIBREGEX
+libdregex_la_SOURCES = regex.c
+libdregex_la_LIBADD = $(LIBPCRE_LIBS)
+
+EXTRA_DIST =
+
+test_programs = test-regex
+
+noinst_PROGRAMS = $(test_programs)
+
+test_regex_SOURCES = test-regex.c
+test_regex_LDADD = libdregex.la \
+                  ../lib-test/libtest.la \
+                  ../lib/liblib.la \
+                  $(LIBPCRE_LIBS)
+test_regex_DEPENDENCIES = libdregex.la $(LIBPCRE_LIBS)
+
+else
+libdregex_la_SOURCES = empty.c
+endif
diff --git a/src/lib-regex/dregex.h b/src/lib-regex/dregex.h
new file mode 100644 (file)
index 0000000..a40ab5c
--- /dev/null
@@ -0,0 +1,157 @@
+#ifndef DREGEX_H
+#define DREGEX_H 1
+
+/* Option bits for the dregex API. The matching related bits are given when
+ * the pattern is compiled and are remembered in the compiled code; the
+ * DREGEX_REPLACE_* bits are given to the replace functions. */
+enum dregex_flags {
+       /* Match only at the first position */
+       DREGEX_ANCHORED = BIT(0),
+       /* Do not create automatic capture groups */
+       DREGEX_NOSUB = BIT(1),
+       /* Case insensitive matching */
+       DREGEX_ICASE = BIT(2),
+       /* ^ and $ match newlines within data */
+       DREGEX_NEWLINE = BIT(3),
+       /* Subject string is not the beginning of a line */
+       DREGEX_NOTBOL = BIT(4),
+       /* Subject string is not the end of a line */
+       DREGEX_NOTEOL = BIT(5),
+       /* Disable UTF-8 handling: pattern and subject are handed to PCRE2
+        * one code unit per input byte and PCRE2_NEVER_UTF is set.
+        * NOTE(review): the implementation does not reject bytes >= 0x80,
+        * it only skips UTF-8 decoding - confirm the intended behaviour. */
+       DREGEX_ASCII_ONLY = BIT(6),
+       /* Extended regular expression, skip whitespace and ignore comments,
+        * see https://www.pcre.org/current/doc/html/pcre2api.html */
+       DREGEX_EXTENDED = BIT(7),
+       /* Leave unset capture groups out of the returned groups. Without
+        * this flag they are returned as empty strings. */
+       DREGEX_NO_EMPTY_SUB = BIT(8),
+
+       /* Perform global replace */
+       DREGEX_REPLACE_ALL = BIT(9),
+       /* Replacement string is literal */
+       DREGEX_REPLACE_LITERAL = BIT(10),
+};
+
+/* Limits for dregex_code_create_params(). The values are used exactly as
+ * given; the defaults mentioned below are the ones dregex_code_create()
+ * uses.
+ * NOTE(review): max_capture_groups is handed to pcre2_set_match_limit()
+ * by the implementation - confirm that this is the intended limit. */
+struct dregex_params {
+       unsigned int max_cpu_seconds; /* maximum execution time, 1s default */
+       unsigned int max_capture_groups; /* maximum number of capture groups, 100 default */
+       unsigned int max_depth; /* maximum stack depth, 100 default */
+};
+
+/* Matches the given regular expression pattern against the subject string.
+ *
+ * Both pattern and subject are converted to UCS4 internally, making this UTF-8 safe.
+ *
+ * Returns:
+ *  - -1 on error (error_r is set to an error message and must not be NULL)
+ *  -  0 if the pattern does not match
+ *  -  1 if the pattern matches
+ */
+int dregex_match(const char *pattern, const char *subject, enum dregex_flags flags,
+                const char **error_r);
+
+/* Same as dregex_match(), but the matched groups are additionally appended
+ * to groups_r, starting with the whole match. groups_r may be NULL, in which
+ * case no groups are returned.
+ *
+ * NOTE(review): the implementation only extracts groups when PCRE2 reports
+ * more than one group for the match - confirm whether a pattern without
+ * capture groups should still return the whole match.
+ */
+int dregex_match_groups(const char *pattern, const char *subject, enum dregex_flags flags,
+                       ARRAY_TYPE(const_string) *groups_r, const char **error_r);
+
+/* Performs a regular expression-based substitution on the subject string.
+ * Replaces matches of 'pattern' with 'replace' and writes the result to
+ * result_r, which is provided by the caller.
+ *
+ * Both pattern, subject and replace are converted to UCS4 internally, making this UTF-8 safe.
+ * result_r is only written to when a substitution was performed.
+ *
+ * Returns:
+ *  - -1 on error (error_r is set to an error message and must not be NULL)
+ *  -  0 if no substitution was performed (no match)
+ *  -  1 if substitution was successful
+ */
+int dregex_replace(const char *pattern, const char *subject, const char *replace,
+                  string_t *result_r, enum dregex_flags flags,
+                  const char **error_r);
+
+struct dregex_code;
+
+/* Creates a new regular expression context. This context
+ * can be reused by calling dregex_code_compile() again, which will
+ * clear the old pattern.
+ *
+ * dregex_code_create() uses the default limits, while
+ * dregex_code_create_params() uses the limits given in params, which
+ * must not be NULL.
+*/
+struct dregex_code *dregex_code_create(void);
+struct dregex_code *dregex_code_create_params(const struct dregex_params *params);
+
+/* Frees the regular expression context and sets *_code to NULL.
+ * Does nothing if *_code is already NULL. */
+void dregex_code_free(struct dregex_code **_code);
+
+/* Compiles the given pattern into reusable code. A previously compiled
+ * pattern in the same context is freed first.
+ *
+ * Pattern is converted to UCS4 internally, making this UTF-8 safe.
+ *
+ * Returns 0 on success, -1 on error with error_r set to an error message.
+ */
+int dregex_code_compile(struct dregex_code *code, const char *pattern,
+                       enum dregex_flags flags, const char **error_r);
+
+/* Exports the compiled pattern into the given buffer. The serialized
+ * pattern is appended to the buffer. Failure to serialize is treated as a
+ * bug and panics. */
+void dregex_code_export(const struct dregex_code *code, buffer_t *buffer);
+
+/* Imports a compiled pattern from the given buffer.
+ * Returns 0 on success, -1 on error with error_r set to an error message. */
+int dregex_code_import(struct dregex_code *code, const buffer_t *buffer,
+                      const char **error_r);
+
+/* Executes regex matching with capture groups using precompiled code.
+ * Same as dregex_match_groups(). The code must contain a compiled pattern.
+ *
+ * Subject is converted to UCS4 internally, making this UTF-8 safe.
+ *
+ * Groups are converted from UCS4 to UTF-8 internally. groups_r may be NULL
+ * if the groups are not wanted.
+ */
+int dregex_code_match_groups(struct dregex_code *code, const char *subject,
+                            ARRAY_TYPE(const_string) *groups_r, const char **error_r);
+
+/* Executes regex matching using precompiled code.
+ * Same as dregex_match(). The code must contain a compiled pattern.
+ *
+ * Subject is converted to UCS4 internally, making this UTF-8 safe.
+ */
+int dregex_code_match(struct dregex_code *code, const char *subject,
+                     const char **error_r);
+
+/* Performs regex replacement using precompiled code, starting at given offset.
+ * Same as dregex_replace(). The code must contain a compiled pattern.
+ *
+ * Subject and replacement are converted to UCS4 internally, making this UTF-8 safe.
+ * The result is written to result_r, which is provided by the caller, and
+ * only when a substitution was performed.
+ * NOTE(review): startoffset is passed to PCRE2 unchanged after the subject
+ * has been converted, so it presumably counts characters rather than
+ * bytes - confirm.
+ */
+int dregex_code_replace_full(struct dregex_code *code,
+                            const char *subject, size_t startoffset,
+                            const char *replacement,
+                            string_t *result_r, enum dregex_flags flags,
+                            const char **error_r);
+
+/* Performs regex replacement using precompiled code, starting from the
+ * beginning of the subject.
+ * Same as dregex_replace().
+ *
+ * Subject is converted to UCS4 internally, making this UTF-8 safe.
+ * The result is written to result_r, which is provided by the caller.
+ */
+int dregex_code_replace(struct dregex_code *code,
+                       const char *subject, const char *replacement,
+                       string_t *result_r, enum dregex_flags flags,
+                       const char **error_r);
+
+/* Without PCRE support every API call is replaced by a macro: calls that can
+ * report an error set *error_r to NO_DREGEX_SUPPORT and evaluate to -1, the
+ * create calls evaluate to NULL and the remaining calls expand to nothing. */
+#ifndef HAVE_LIBPCRE
+#  define NO_DREGEX_SUPPORT "Missing regular expression support"
+#  define NO_DREGEX_SUPPORT_CODE(error_r) \
+       ({STMT_START { *(error_r) = NO_DREGEX_SUPPORT;} STMT_END; -1;})
+#  define dregex_match(pattern, subject, flags, error_r) NO_DREGEX_SUPPORT_CODE(error_r)
+#  define dregex_match_groups(pattern, subject, flags, groups_r, error_r) \
+       NO_DREGEX_SUPPORT_CODE(error_r)
+#  define dregex_replace(pattern, subject, replace, result_r, \
+                       flags, error_r) NO_DREGEX_SUPPORT_CODE(error_r)
+#  define dregex_code_create() ({ NULL; })
+/* Keep in sync with dregex_code_create(): without this fallback a caller of
+ * dregex_code_create_params() would fail to link when PCRE is missing. */
+#  define dregex_code_create_params(params) ({ NULL; })
+#  define dregex_code_free(code)
+#  define dregex_code_compile(code, pattern, flags, error_r) NO_DREGEX_SUPPORT_CODE(error_r)
+#  define dregex_code_export(code, buffer)
+#  define dregex_code_import(code, buffer, error_r) NO_DREGEX_SUPPORT_CODE(error_r)
+#  define dregex_code_match_groups(code, subject, groups_r, error_r) \
+       NO_DREGEX_SUPPORT_CODE(error_r)
+#  define dregex_code_match(code, subject, error_r) NO_DREGEX_SUPPORT_CODE(error_r)
+#  define dregex_code_replace_full(code, subject, startoffset, replacement, result_r, \
+                                 flags, error_r) NO_DREGEX_SUPPORT_CODE(error_r)
+#  define dregex_code_replace(code, subject, replacement, result_r, flags, error_r) \
+       NO_DREGEX_SUPPORT_CODE(error_r)
+#endif
+
+#endif
diff --git a/src/lib-regex/empty.c b/src/lib-regex/empty.c
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/src/lib-regex/regex.c b/src/lib-regex/regex.c
new file mode 100644 (file)
index 0000000..b8be11a
--- /dev/null
@@ -0,0 +1,497 @@
+/* Copyright (C) 2025 Dovecot authors, see the included COPYING file */
+
+#include "lib.h"
+#include "array.h"
+#include "buffer.h"
+#include "cpu-limit.h"
+#include "str.h"
+#include "unichar.h"
+#include "dregex.h"
+
+#ifdef HAVE_LIBPCRE
+
+#define PCRE2_CODE_UNIT_WIDTH 32
+#include "pcre2.h"
+
+#define DREGEX_MAX_DEPTH 100
+#define DREGEX_MAX_MATCHES 100
+#define DREGEX_MAX_CPU_SECONDS 1
+
+/* Reusable regular expression context: the PCRE2 contexts, the currently
+ * compiled pattern (if any) and the limits that are applied to it. */
+struct dregex_code {
+       /* Pool holding this struct; also used by the PCRE2 allocator hooks */
+       pool_t pool;
+
+       pcre2_compile_context *cctx;
+       pcre2_general_context *gctx;
+       pcre2_match_context *mctx;
+       /* Compiled pattern, NULL until a pattern is compiled or imported */
+       pcre2_code *pat;
+
+       /* CPU limit, set only for the duration of a match or substitute call */
+       struct cpu_limit *climit;
+
+       unsigned int max_depth;
+       unsigned int max_cpu_seconds;
+       unsigned int max_capture_groups;
+
+       /* Flags given to the latest dregex_code_compile() call */
+       enum dregex_flags flags;
+};
+
+/* PCRE2 allocator hook: allocates amt bytes from the pool of the
+ * dregex_code that was registered as the allocator context. */
+static void *dregex_code_int_malloc(size_t amt, void *_ctx)
+{
+       pool_t pool = ((struct dregex_code *)_ctx)->pool;
+
+       return p_malloc(pool, amt);
+}
+
+/* PCRE2 allocator hook: returns ptr to the pool of the dregex_code that
+ * was registered as the allocator context. */
+static void dregex_code_int_free(void *ptr, void *_ctx)
+{
+       struct dregex_code *owner = _ctx;
+
+       p_free(owner->pool, ptr);
+}
+
+/* Match callout: aborts the match with PCRE2_ERROR_PATTERN_TOO_COMPLICATED
+ * once the CPU limit of the running call has been exceeded. */
+static int dregex_code_callout(pcre2_callout_block *block ATTR_UNUSED, void *ctx)
+{
+       struct dregex_code *code = ctx;
+
+       return cpu_limit_exceeded(code->climit) ?
+               PCRE2_ERROR_PATTERN_TOO_COMPLICATED : 0;
+}
+
+/* Substitute callout: performs the same CPU limit check as the match
+ * callout.
+ * NOTE(review): verify against the PCRE2 documentation how a negative
+ * return value from a substitute callout is reported to the caller. */
+static int
+dregex_code_substitute_callout(pcre2_substitute_callout_block *block ATTR_UNUSED, void *ctx)
+{
+       struct dregex_code *code = ctx;
+
+       if (cpu_limit_exceeded(code->climit))
+               return PCRE2_ERROR_PATTERN_TOO_COMPLICATED;
+       return 0;
+}
+
+/* Compile recursion guard: rejects patterns that nest deeper than the
+ * max_depth configured for the context. */
+static int dregex_code_guard(uint depth, void *ctx)
+{
+       const struct dregex_code *code = ctx;
+
+       return depth > code->max_depth ? PCRE2_ERROR_DEPTHLIMIT : 0;
+}
+
+/* Creates the PCRE2 general, compile and match contexts of code and installs
+ * the configured limits and callbacks on them. code->pool and the limits
+ * must have been set before calling this. */
+static void dregex_code_init(struct dregex_code *code)
+{
+       /* All PCRE2 allocations go through the pool backed allocator hooks */
+       pcre2_general_context *general =
+               pcre2_general_context_create(dregex_code_int_malloc,
+                                            dregex_code_int_free, code);
+
+       code->gctx = general;
+       code->cctx = pcre2_compile_context_create(general);
+       code->mctx = pcre2_match_context_create(general);
+
+       /* Set some limits.
+          NOTE(review): max_capture_groups is used as the PCRE2 match
+          limit here - confirm that this is intended. */
+       pcre2_set_depth_limit(code->mctx, code->max_depth);
+       pcre2_set_match_limit(code->mctx, code->max_capture_groups);
+       pcre2_set_compile_recursion_guard(code->cctx, dregex_code_guard, code);
+
+       /* these are used to ensure that CPU time isn't exceeded */
+       pcre2_set_callout(code->mctx, dregex_code_callout, code);
+       pcre2_set_substitute_callout(code->mctx, dregex_code_substitute_callout, code);
+}
+
+/* Allocates a regex context from its own pool, copies the limits from
+ * params as they are and initializes the PCRE2 contexts. */
+struct dregex_code *dregex_code_create_params(const struct dregex_params *params)
+{
+       pool_t regex_pool = pool_allocfree_create("regex pool");
+       struct dregex_code *code = p_new(regex_pool, struct dregex_code, 1);
+
+       code->pool = regex_pool;
+       code->max_depth = params->max_depth;
+       code->max_cpu_seconds = params->max_cpu_seconds;
+       code->max_capture_groups = params->max_capture_groups;
+
+       dregex_code_init(code);
+       return code;
+}
+
+/* Limits used by dregex_code_create() */
+static const struct dregex_params default_params = {
+       .max_depth = DREGEX_MAX_DEPTH,
+       .max_cpu_seconds = DREGEX_MAX_CPU_SECONDS,
+       .max_capture_groups = DREGEX_MAX_MATCHES,
+};
+
+/* Creates a regex context with the default limits. */
+struct dregex_code *dregex_code_create(void)
+{
+       /* dregex_code_create_params() already initializes the PCRE2
+          contexts. Initializing them a second time here would replace
+          the context pointers and orphan the first set. */
+       return dregex_code_create_params(&default_params);
+}
+
+static const PCRE2_SPTR empty_str = U"";
+
+/* Convert input into unichars.
+ *
+ * With refuse_non_ascii every input byte becomes one code unit, otherwise
+ * the input is decoded as UTF-8. The converted string is stored in *out_r
+ * and its length in code units in *len_r. Unless the input is empty, the
+ * result is allocated from the data stack, so it is only valid within the
+ * caller's current data stack frame.
+ *
+ * Returns 0 on success, -1 if the input could not be converted. */
+static int convert_to_sptr(const char *input, PCRE2_SPTR *out_r, PCRE2_SIZE *len_r,
+                          bool refuse_non_ascii)
+{
+       if (*input == '\0') {
+               /* Nothing to convert, avoid allocating an array for it */
+               *len_r = 0;
+               *out_r = empty_str;
+               return 0;
+       }
+       ARRAY_TYPE(unichars) chars;
+       t_array_init(&chars, 128);
+       if (refuse_non_ascii) {
+               /* treat everything as ascii */
+               for (; *input != '\0'; input++) {
+                       unichar_t chr = (unsigned char)*input;
+                       array_push_back(&chars, &chr);
+               }
+       } else if (uni_utf8_to_ucs4(input, &chars) < 0)
+               return -1;
+       *len_r = array_count(&chars);
+       /* array_idx() must not be used on an empty array */
+       if (*len_r == 0)
+               *out_r = empty_str;
+       else
+               *out_r = array_idx(&chars, 0);
+       return 0;
+}
+
+/* Handle error */
+static int handle_error(int ret, const char *func, const char **error_r)
+{
+       PCRE2_UCHAR buf[256];
+       if (ret == PCRE2_ERROR_NOMEMORY)
+               i_fatal_status(FATAL_OUTOFMEM, "%s(): Out of memory", func);
+       int rc = pcre2_get_error_message(ret, buf, sizeof(buf));
+               /* Ignore, the error didn't fit to buffer */
+       if (rc == PCRE2_ERROR_BADDATA) {
+               *error_r = t_strdup_printf("Unknown error %d occured", ret);
+       } else if (rc < 0) {
+               *error_r = t_strdup_printf("Unknown error %d occured while handling %d",
+                                          rc, ret);
+       } else {
+               /* we are ignoring PCRE2_ERROR_NOMEMORY here because it
+                * likely means the output did not fit in 256 characters. */
+               buffer_t *output = t_buffer_create(rc);
+               uni_ucs4_to_utf8(buf, rc, output);
+               *error_r = str_c(output);
+       }
+       return -1;
+}
+#define handle_error(ret, error_r) handle_error((ret), __func__, (error_r))
+
+/* Compiles pattern into code, replacing any previously compiled pattern.
+ * The flags are remembered in the context. Returns 0 on success and -1
+ * with error_r set on failure. */
+int dregex_code_compile(struct dregex_code *code, const char *pattern,
+                       enum dregex_flags flags, const char **error_r)
+{
+       /* dregex flags that map directly to a PCRE2 compile option */
+       static const struct {
+               enum dregex_flags flag;
+               uint32_t option;
+       } option_map[] = {
+               { DREGEX_ICASE, PCRE2_CASELESS },
+               { DREGEX_NOSUB, PCRE2_NO_AUTO_CAPTURE },
+               { DREGEX_NEWLINE, PCRE2_MULTILINE },
+               { DREGEX_ANCHORED, PCRE2_ANCHORED },
+               { DREGEX_EXTENDED, PCRE2_EXTENDED },
+       };
+
+       i_assert(code != NULL);
+       i_assert(pattern != NULL);
+       int errcode;
+       PCRE2_SIZE erroffset;
+
+       /* Drop the pattern left behind by an earlier compile */
+       if (code->pat != NULL) {
+               pcre2_code_free(code->pat);
+               code->pat = NULL;
+               code->flags = 0;
+       }
+
+       uint options = PCRE2_AUTO_CALLOUT |
+               PCRE2_NEVER_BACKSLASH_C | PCRE2_NO_UTF_CHECK;
+       for (unsigned int i = 0; i < N_ELEMENTS(option_map); i++) {
+               if (HAS_ALL_BITS(flags, option_map[i].flag))
+                       options |= option_map[i].option;
+       }
+
+       bool ascii_only = HAS_ALL_BITS(flags, DREGEX_ASCII_ONLY);
+       /* Use Unicode properties for character matching, unless the
+          caller asked for ASCII only handling. */
+       if (ascii_only)
+               options |= PCRE2_NEVER_UTF;
+       else
+               options |= (PCRE2_UCP | PCRE2_UTF);
+
+       code->flags = flags;
+
+       T_BEGIN {
+               PCRE2_SIZE pattern_len;
+               PCRE2_SPTR32 pattern32;
+               if (convert_to_sptr(pattern, &pattern32, &pattern_len, ascii_only) >= 0) {
+                       code->pat = pcre2_compile(pattern32, pattern_len, options,
+                                                 &errcode, &erroffset, code->cctx);
+               } else {
+                       /* the pattern could not be converted */
+                       errcode = PCRE2_ERROR_BADDATA;
+                       code->pat = NULL;
+               }
+       } T_END;
+
+       if (code->pat != NULL)
+               return 0;
+
+       /* a failed compile must always provide an error code */
+       i_assert(errcode != 0);
+       return handle_error(errcode, error_r);
+}
+
+/* Serializes the compiled pattern of code and appends the bytes to buffer.
+ * A serialization failure is considered a bug and panics. */
+void dregex_code_export(const struct dregex_code *code, buffer_t *buffer)
+{
+       const pcre2_code *patterns[1] = { code->pat };
+       uint8_t *serialized;
+       PCRE2_SIZE serialized_size;
+
+       int count = pcre2_serialize_encode(patterns, N_ELEMENTS(patterns),
+                                          &serialized, &serialized_size,
+                                          code->gctx);
+       if (count < 0) {
+               const char *error;
+               (void)handle_error(count, &error);
+               i_panic("BUG: dregex_code_export(): %s", error);
+       }
+
+       /* There must be only one pattern */
+       i_assert(count == 1);
+
+       buffer_append(buffer, serialized, serialized_size);
+       pcre2_serialize_free(serialized);
+}
+
+/* Imports a pattern that was serialized with dregex_code_export() and makes
+ * it the compiled pattern of code. A previously compiled pattern is freed
+ * only after the import has succeeded, so the context is left untouched on
+ * failure.
+ *
+ * The flags stored in the context are not part of the serialized data and
+ * are left as they are.
+ *
+ * Returns 0 on success, -1 on error with error_r set to an error message. */
+int dregex_code_import(struct dregex_code *code, const buffer_t *buffer,
+                      const char **error_r)
+{
+       i_assert(code != NULL);
+       i_assert(buffer != NULL);
+
+       pcre2_code *imported = NULL;
+       int ret = pcre2_serialize_decode(&imported, 1, buffer->data,
+                                        code->gctx);
+       if (ret < 0)
+               return handle_error(ret, error_r);
+       i_assert(ret > 0);
+
+       /* Decoding directly into code->pat would overwrite the pointer of
+          an already compiled pattern without freeing it. */
+       if (code->pat != NULL)
+               pcre2_code_free(code->pat);
+       code->pat = imported;
+
+       return 0;
+}
+
+static const char *empty_match_str = "";
+
+/* Appends the first count groups of the match in mdata to groups_r as UTF-8
+ * strings allocated from the data stack. Unset groups are added as empty
+ * strings, unless skip_empty is set. Does nothing if groups_r is NULL. */
+static void extract_matches(uint count, pcre2_match_data *mdata,
+                           bool skip_empty, ARRAY_TYPE(const_string) *groups_r)
+{
+       /* we don't actually want matches */
+       if (groups_r == NULL)
+               return;
+       for (uint i = 0; i < count; i++) {
+               PCRE2_UCHAR32 *buf = NULL;
+               PCRE2_SIZE bsize;
+               int rc = pcre2_substring_length_bynumber(mdata, i, &bsize);
+               if (rc == PCRE2_ERROR_NOSUBSTRING)
+                       break;
+               else if (rc == PCRE2_ERROR_UNSET) {
+                       if (!skip_empty)
+                               array_push_back(groups_r, &empty_match_str);
+                       continue;
+               } else if (rc == PCRE2_ERROR_UNAVAILABLE)
+                       continue;
+               /* buf is only valid if the substring could be fetched */
+               if (pcre2_substring_get_bynumber(mdata, i, &buf, &bsize) < 0)
+                       continue;
+               buffer_t *output = t_buffer_create(bsize);
+               uni_ucs4_to_utf8(buf, bsize, output);
+               /* The substring was allocated by PCRE2 for us and has to
+                  be released once it has been converted. */
+               pcre2_substring_free(buf);
+               const char *substr = str_c(output);
+               array_push_back(groups_r, &substr);
+       }
+}
+
+/* Runs the compiled pattern of code against subject and stores the match
+ * in mdata. The match is limited by the CPU time limit of the context.
+ *
+ * Returns -1 on error (error_r set), 0 if the pattern did not match and
+ * otherwise the positive return value of pcre2_match(). */
+static int dregex_code_match_int(struct dregex_code *code, const char *subject,
+                                pcre2_match_data *mdata, const char **error_r)
+{
+       i_assert(code != NULL);
+       i_assert(code->pat != NULL);
+       i_assert(subject != NULL);
+
+       /* Empty string is not a match */
+       uint match_options = PCRE2_NOTEMPTY;
+       if (HAS_ALL_BITS(code->flags, DREGEX_NOTBOL))
+               match_options |= PCRE2_NOTBOL;
+       if (HAS_ALL_BITS(code->flags, DREGEX_NOTEOL))
+               match_options |= PCRE2_NOTEOL;
+       if (HAS_ALL_BITS(code->flags, DREGEX_ANCHORED))
+               match_options |= PCRE2_ANCHORED;
+
+       PCRE2_SIZE subject_len;
+       PCRE2_SPTR subject32;
+       bool ascii_only = HAS_ALL_BITS(code->flags, DREGEX_ASCII_ONLY);
+
+       if (convert_to_sptr(subject, &subject32, &subject_len, ascii_only) < 0)
+               return handle_error(PCRE2_ERROR_BADDATA, error_r);
+
+       /* the callout checks this limit while the match is running */
+       code->climit = cpu_limit_init(code->max_cpu_seconds, CPU_LIMIT_TYPE_ALL);
+       int rc = pcre2_match(code->pat, subject32, subject_len, 0,
+                            match_options, mdata, code->mctx);
+       cpu_limit_deinit(&code->climit);
+
+       if (rc == PCRE2_ERROR_NOMATCH) {
+               /* did not match */
+               return 0;
+       }
+       if (rc < 0)
+               return handle_error(rc, error_r);
+       return rc;
+}
+
+/* Matches subject against the compiled pattern and, if groups_r is not
+ * NULL, appends the matched groups to it. The group strings are allocated
+ * from the data stack of the caller.
+ *
+ * Returns -1 on error (error_r set), 0 on no match and 1 on match. */
+int dregex_code_match_groups(struct dregex_code *code, const char *subject,
+                            ARRAY_TYPE(const_string) *groups_r, const char **error_r)
+{
+       i_assert(code != NULL);
+       i_assert(code->pat != NULL);
+       int ret;
+
+       pcre2_match_data *mdata =
+               pcre2_match_data_create_from_pattern(code->pat, code->gctx);
+
+       if (groups_r == NULL) {
+               /* Only the error string has to outlive the match, so the
+                  temporary allocations can be released right away. */
+               T_BEGIN {
+                       ret = dregex_code_match_int(code, subject, mdata, error_r);
+               } T_END_PASS_STR_IF(ret < 0, error_r);
+       } else {
+               /* The group strings are allocated from the data stack and
+                  handed over to the caller. They must not be created
+                  inside a data stack frame of our own, because they would
+                  be released together with it before we return. */
+               ret = dregex_code_match_int(code, subject, mdata, error_r);
+               if (ret > 1) {
+                       bool skip_empty = HAS_ALL_BITS(code->flags, DREGEX_NO_EMPTY_SUB);
+                       /* ret is number of groups */
+                       extract_matches((uint32_t)ret, mdata, skip_empty, groups_r);
+               }
+       }
+       /* the match data is not needed after the groups were extracted */
+       pcre2_match_data_free(mdata);
+
+       return ret > 1 ? 1 : ret;
+}
+
+/* Matches subject against the compiled pattern without returning any
+ * groups. */
+int dregex_code_match(struct dregex_code *code, const char *subject,
+                     const char **error_r)
+{
+       /* no group output is wanted */
+       ARRAY_TYPE(const_string) *no_groups = NULL;
+
+       return dregex_code_match_groups(code, subject, no_groups, error_r);
+}
+
+/* Replaces matches of the compiled pattern in subject with replacement,
+ * starting at startoffset, and writes the result to result_r.
+ *
+ * The substitution is run twice: the first run only asks PCRE2 how large
+ * the result is going to be, the second run writes it into a buffer of
+ * that size.
+ *
+ * Returns -1 on error (error_r set), 0 if nothing was replaced and 1 if a
+ * substitution was performed. result_r is only written to in the last
+ * case. */
+int dregex_code_replace_full(struct dregex_code *code,
+                            const char *subject, size_t startoffset,
+                            const char *replacement, string_t *result_r,
+                            enum dregex_flags flags, const char **error_r)
+{
+       i_assert(code != NULL);
+       i_assert(code->pat != NULL);
+       i_assert(subject != NULL);
+       i_assert(replacement != NULL);
+       i_assert(result_r != NULL);
+
+       uint options = PCRE2_NOTEMPTY;
+       /* flags contains dregex flags, so it has to be tested against
+          DREGEX_ANCHORED and not against the PCRE2 option. */
+       if (HAS_ALL_BITS(flags, DREGEX_ANCHORED))
+               options |= PCRE2_ANCHORED;
+       if (HAS_ALL_BITS(flags, DREGEX_REPLACE_ALL))
+               options |= PCRE2_SUBSTITUTE_GLOBAL;
+       if (HAS_ALL_BITS(flags, DREGEX_REPLACE_LITERAL))
+               options |= PCRE2_SUBSTITUTE_LITERAL;
+
+       /* Placeholder output buffer for the size query. Its length is
+          given as zero, so nothing is written to it. */
+       PCRE2_UCHAR probe[1] = { 0 };
+       PCRE2_UCHAR *result32 = probe;
+       /* The real output buffer. It is used after the data stack frame
+          below has been closed, so it can't be allocated from the data
+          stack. */
+       PCRE2_UCHAR *result_alloc = NULL;
+       PCRE2_SIZE result_len = 0;
+
+       int ret;
+       bool refuse_non_ascii = HAS_ALL_BITS(flags, DREGEX_ASCII_ONLY) ||
+                               HAS_ALL_BITS(code->flags, DREGEX_ASCII_ONLY);
+
+       T_BEGIN do {
+               PCRE2_SIZE slen;
+               PCRE2_SPTR subject32;
+               PCRE2_SIZE rlen;
+               PCRE2_SPTR replacement32;
+
+               if (convert_to_sptr(subject, &subject32, &slen, refuse_non_ascii) < 0 ||
+                   convert_to_sptr(replacement, &replacement32, &rlen, refuse_non_ascii) < 0) {
+                       ret = PCRE2_ERROR_BADDATA;
+                       break;
+               }
+
+               pcre2_match_data *mdata =
+                       pcre2_match_data_create_from_pattern(code->pat, code->gctx);
+
+               code->climit = cpu_limit_init(code->max_cpu_seconds,
+                                             CPU_LIMIT_TYPE_ALL);
+               ret = pcre2_substitute(code->pat, subject32, slen, startoffset,
+                                      options|PCRE2_SUBSTITUTE_OVERFLOW_LENGTH,
+                                      mdata, code->mctx, replacement32, rlen,
+                                      result32, &result_len);
+               cpu_limit_deinit(&code->climit);
+               /* Ignore NOMEMORY error here, it's because we asked how long
+                  the result would be. */
+               if (ret != PCRE2_ERROR_NOMEMORY && ret < 0) {
+                       pcre2_match_data_free(mdata);
+                       break;
+               }
+
+               if (result_len > 0) {
+                       result_alloc = i_new(PCRE2_UCHAR, result_len);
+                       result32 = result_alloc;
+               }
+
+               /* Run it again as we know the buffer size now */
+               code->climit = cpu_limit_init(code->max_cpu_seconds,
+                                             CPU_LIMIT_TYPE_ALL);
+               ret = pcre2_substitute(code->pat, subject32, slen, startoffset, options,
+                                      mdata, code->mctx, replacement32, rlen,
+                                      result32, &result_len);
+               cpu_limit_deinit(&code->climit);
+               pcre2_match_data_free(mdata);
+       } while(0); T_END;
+
+       if (ret < 0) {
+               i_free(result_alloc);
+               return handle_error(ret, error_r);
+       }
+       /* result_r belongs to the caller, so it is written only after our
+          own data stack frame has been closed. */
+       if (ret > 0)
+               uni_ucs4_to_utf8(result32, result_len, result_r);
+       i_free(result_alloc);
+
+       return ret > 0 ? 1 : 0;
+}
+
+/* Same as dregex_code_replace_full(), but always starts from the beginning
+ * of the subject. */
+int dregex_code_replace(struct dregex_code *code, const char *subject,
+                       const char *replacement, string_t *result_r,
+                       enum dregex_flags flags, const char **error_r)
+{
+       const size_t subject_start = 0;
+
+       return dregex_code_replace_full(code, subject, subject_start,
+                                       replacement, result_r, flags, error_r);
+}
+
+/* Frees the compiled pattern, the PCRE2 contexts and finally the pool of
+ * the context. *_code is set to NULL; a NULL context is ignored. */
+void dregex_code_free(struct dregex_code **_code)
+{
+       struct dregex_code *code = *_code;
+
+       if (code == NULL)
+               return;
+       *_code = NULL;
+
+       if (code->pat != NULL)
+               pcre2_code_free(code->pat);
+       pcre2_match_context_free(code->mctx);
+       pcre2_compile_context_free(code->cctx);
+       pcre2_general_context_free(code->gctx);
+       /* the context itself lives in the pool, so this has to be last */
+       pool_unref(&code->pool);
+}
+
+/* Compiles pattern, matches it against subject and, if groups_r is not
+ * NULL, returns the matched groups in it. The group strings and the error
+ * message are allocated from the data stack of the caller.
+ *
+ * Returns -1 on error (error_r set), 0 on no match and 1 on match. */
+int dregex_match_groups(const char *pattern, const char *subject, enum dregex_flags flags,
+                       ARRAY_TYPE(const_string) *groups_r, const char **error_r)
+{
+       struct dregex_code *code = dregex_code_create();
+       int ret;
+
+       /* No data stack frame is opened here: the strings returned in
+          groups_r are allocated from the data stack, so a frame around
+          the match would release them before the caller can use them. */
+       if (dregex_code_compile(code, pattern, flags, error_r) < 0)
+               ret = -1;
+       else {
+               ret = dregex_code_match_groups(code, subject, groups_r,
+                                              error_r);
+       }
+       dregex_code_free(&code);
+
+       return ret;
+}
+
+/* Compiles pattern and matches it against subject without returning any
+ * groups. */
+int dregex_match(const char *pattern, const char *subject, enum dregex_flags flags,
+                const char **error_r)
+{
+       /* no group output is wanted */
+       ARRAY_TYPE(const_string) *no_groups = NULL;
+
+       return dregex_match_groups(pattern, subject, flags, no_groups, error_r);
+}
+
+/* Compiles pattern and replaces its matches in subject with replace. The
+ * same flags are used for compiling and for replacing. */
+int dregex_replace(const char *pattern, const char *subject, const char *replace,
+                  string_t *result_r, enum dregex_flags flags,
+                  const char **error_r)
+{
+       struct dregex_code *code = dregex_code_create();
+       int ret;
+
+       T_BEGIN {
+               ret = dregex_code_compile(code, pattern, flags, error_r);
+       } T_END_PASS_STR_IF(ret < 0, error_r);
+
+       if (ret < 0) {
+               /* the pattern could not be compiled */
+               dregex_code_free(&code);
+               return ret;
+       }
+
+       /* replace from the beginning of the subject */
+       ret = dregex_code_replace_full(code, subject, 0, replace, result_r,
+                                      flags, error_r);
+       dregex_code_free(&code);
+
+       return ret;
+}
+
+#endif
diff --git a/src/lib-regex/test-regex.c b/src/lib-regex/test-regex.c
new file mode 100644 (file)
index 0000000..ef2d3bd
--- /dev/null
@@ -0,0 +1,306 @@
+/* Copyright (C) 2025 Dovecot authors, see the included COPYING file */
+
+#include "lib.h"
+#include "test-common.h"
+#include "array.h"
+#include "str.h"
+#include "dregex.h"
+
+#ifdef HAVE_LIBPCRE
+
+/* Set to TRUE to log every test case with i_debug() before it is run. */
+static const bool debug = FALSE;
+
+/* One table entry, shared by the match and the replace tests. */
+struct test_case {
+       /* input string that is matched against / replaced in */
+       const char *subject;
+       /* regular expression; a NULL pattern terminates the table */
+       const char *pattern;
+       /* replacement string, used only by run_replace_tests() */
+       const char *replacement;
+       /* expected output of a replace that did not fail */
+       const char *result;
+       /* expected error text when compile_ret or match_ret is negative */
+       const char *error;
+       /* flags given to dregex_code_compile() (and dregex_code_replace()) */
+       enum dregex_flags flags;
+       /* expected return value of dregex_code_compile() */
+       int compile_ret;
+       /* expected return value of dregex_code_match() or
+          dregex_code_replace() */
+       int match_ret;
+};
+
+/* Runs every entry of cases, up to the first one whose pattern is NULL,
+   through dregex_code_compile() and dregex_code_match() using a single
+   shared dregex_code. The return values, and for expected failures the
+   error text, are checked against the entry. */
+static void run_match_tests(const struct test_case *cases)
+{
+       struct dregex_code *re = dregex_code_create();
+       unsigned int idx;
+
+       for (idx = 0; cases[idx].pattern != NULL; idx++) {
+               const struct test_case *test = &cases[idx];
+               const char *error = NULL;
+               int ret;
+
+               if (debug) {
+                       i_debug("pattern = %s, subject = %s", test->pattern,
+                               test->subject);
+               }
+
+               /* step 1: compile the pattern */
+               ret = dregex_code_compile(re, test->pattern, test->flags,
+                                         &error);
+               test_assert_cmp_idx(test->compile_ret, ==, ret, idx);
+               if (test->compile_ret < 0) {
+                       /* failure was expected, verify only the message */
+                       test_assert_strcmp_idx(test->error, error, idx);
+                       continue;
+               }
+               if (ret < 0) {
+                       error = t_strdup_printf("Unexpected error: %s", error);
+                       test_assert_failed_idx(error, __FILE__, __LINE__, idx);
+                       continue;
+               }
+
+               /* step 2: match the subject */
+               ret = dregex_code_match(re, test->subject, &error);
+               test_assert_cmp_idx(test->match_ret, ==, ret, idx);
+               if (test->match_ret < 0) {
+                       test_assert_strcmp_idx(test->error, error, idx);
+               } else if (ret < 0) {
+                       error = t_strdup_printf("Unexpected error: %s", error);
+                       test_assert_failed_idx(error, __FILE__, __LINE__, idx);
+               }
+       }
+       dregex_code_free(&re);
+}
+
+/* Builds a struct test_case for run_match_tests(). replacement and result
+   are set to NULL; flags is not listed, so it is left zero-initialized. */
+#define MATCH_CASE_FULL(pat, sub, err, cret, mret) \
+       { \
+               .pattern = (pat), \
+               .subject = (sub), \
+               .replacement = NULL, \
+               .result = NULL, \
+               .error = (err), \
+               .compile_ret = (cret), \
+               .match_ret = (mret) \
+       }
+/* Shorthand for the common case: compile returns 0, match returns 1 and
+   no error text is expected. */
+#define MATCH_CASE(pattern, subject) MATCH_CASE_FULL(pattern, subject, NULL, 0, 1)
+/* Table terminator: run_match_tests() stops at the first NULL pattern. */
+#define MATCH_CASE_END { .pattern = NULL }
+
+/* Helpers for building long string literals out of adjacent literals:
+   REP(x) expands to 10 copies of x and REP10(x) to 10 copies of REP(x),
+   i.e. 100 copies of x. */
+#define STR(x) x
+#define REP(x) STR(x) STR(x) STR(x) STR(x) STR(x) STR(x) STR(x) STR(x) STR(x) STR(x)
+#define REP10(x) REP(x) REP(x) REP(x) REP(x) REP(x) REP(x) REP(x) REP(x) REP(x) REP(x)
+
+/* Table-driven tests for pattern compilation and matching. Every entry is
+   executed by run_match_tests(); see MATCH_CASE_FULL() for the meaning of
+   the positional arguments. */
+static void test_dregex_match(void)
+{
+       const struct test_case cases[] = {
+               /* simple test case */
+               MATCH_CASE(".*", "hello world"),
+               /* an empty subject is expected to give "no match" (0),
+                  even for .* */
+               MATCH_CASE_FULL(".*", "", NULL, 0, 0),
+               /* and the same is expected for an empty pattern */
+               MATCH_CASE_FULL("", "", NULL, 0, 0),
+               /* Match any single character except newline. */
+               MATCH_CASE(".", "a"),
+               MATCH_CASE_FULL(".", "\n", NULL, 0, 0),
+               /* Bracket expression.  Match any one of the enclosed
+                  characters.  A hyphen (-) indicates a range of
+                  consecutive characters. */
+               MATCH_CASE("[a-z]", "a"),
+               MATCH_CASE_FULL("[a-z]", "A", NULL, 0, 0),
+               /* Negated bracket expression. */
+               MATCH_CASE("[^a-z]", "A"),
+               MATCH_CASE_FULL("[^a-z]", "a", NULL, 0, 0),
+               /* Character class and its negation. The negation has to be
+                  written [^[:alnum:]]; [[^:alnum:]] would be an ordinary
+                  set followed by a literal ']'. */
+               MATCH_CASE("^[[:alnum:]]+$", "abc123"),
+               MATCH_CASE_FULL("^[^[:alnum:]]+$", "abc123", NULL, 0, 0),
+               /* Unicode properties */
+               MATCH_CASE("^\\p{L}$", "\xc3\xab"),
+               MATCH_CASE("^\\pL$", "\xc3\xab"),
+               /* Quantifiers */
+               MATCH_CASE("^.$", "h"),
+               MATCH_CASE("^.{2}$", "he"),
+               MATCH_CASE("^.{2,3}$", "he"),
+               MATCH_CASE("^.{2,3}$", "hel"),
+               MATCH_CASE("^.+$", "hello"),
+               MATCH_CASE_FULL("^.+$", "", NULL, 0, 0),
+               /* Alternation and grouping */
+               MATCH_CASE("^(hello|world)$", "hello"),
+               MATCH_CASE("^(hello|world)$", "world"),
+               MATCH_CASE_FULL("^(hello|world)$", "hi", NULL, 0, 0),
+               /* test that we can find 'mojiretsu' (test string) from
+                 'Kore wa tesuto mojiretsudesu.' (this is a test string) */
+               MATCH_CASE(
+                       "\xe6\x96\x87\xe5\xad\x97\xe5\x88\x97",
+                       "\xe3\x81\x93\xe3\x82\x8c\xe3\x81\xaf\xe3\x83\x86\xe3"
+                       "\x82\xb9\xe3\x83\x88\xe6\x96\x87\xe5\xad\x97\xe5\x88"
+                       "\x97\xe3\x81\xa7\xe3\x81\x99\xe3\x80\x82"
+               ),
+               /* test that we can find <U+1F60A> from <U+1F600><U+1F60A>.
+                  These are 4-byte UTF-8 sequences: U+1F600 is
+                  f0 9f 98 80 and U+1F60A is f0 9f 98 8a. */
+               MATCH_CASE("\xf0\x9f\x98\x8a",
+                          "\xf0\x9f\x98\x80\xf0\x9f\x98\x8a"),
+               /* binary matching: same data with DREGEX_ASCII_ONLY */
+               {
+                       .pattern = "\xf0\x9f\x98\x8a",
+                       .subject = "\xf0\x9f\x98\x80\xf0\x9f\x98\x8a",
+                       .error = "",
+                       .flags = DREGEX_ASCII_ONLY,
+                       .compile_ret = 0,
+                       .match_ret = 1,
+               },
+               {
+                       .pattern = ".*",
+                       .subject = "\xf0\x9f\x98\x80\xf0\x9f\x98\x8a",
+                       .error = "",
+                       .flags = DREGEX_ASCII_ONLY,
+                       .compile_ret = 0,
+                       .match_ret = 1,
+               },
+               /* invalid utf-8 */
+               MATCH_CASE_FULL(".*", "\xc2\xc2", "bad data value", 0, -1),
+               /* two evil patterns, expected to hit the match limit */
+               MATCH_CASE_FULL(
+                       "^([a-zA-Z0-9])(([\\-.]|[_]+)?([a-zA-Z0-9]+))*(@)"
+                       "{1}[a-z0-9]+[.]{1}(([a-z]{2,3})|([a-z]{2,3}[.]"
+                       "{1}[a-z]{2,3}))$",
+                       "thisisabstractly.andtotally.long.email@"
+                       REP10("a") "." REP10("a") "." REP10("a")
+                       ".has",
+                       "match limit exceeded",
+                       0,
+                       -1
+               ),
+               MATCH_CASE_FULL(
+                       "(a|a?)+",
+                       REP10("a") REP10("a"),
+                       "match limit exceeded",
+                       0,
+                       -1
+               ),
+               /* IEEE.1003-2.1992 */
+               MATCH_CASE("me(\\+.*)?@company\\.com",
+                       "me+hello@company.com"),
+               MATCH_CASE("^[^[:lower:]]+$", "HELLO"),
+               MATCH_CASE_FULL(
+                       "^[^[:lower:]]+$",
+                       "hello",
+                       NULL,
+                       0,
+                       0
+               ),
+               MATCH_CASE("<(.*)@", "<simple-list@test.invalid>"),
+               MATCH_CASE("^\\[(.*)\\] (.*)$", "[acme-users] [fwd]: hello, world"),
+               MATCH_CASE_END
+       };
+
+       test_begin("matching");
+
+       run_match_tests(cases);
+
+       test_end();
+}
+
+/* Runs every entry of cases, up to the first one whose pattern is NULL,
+   through dregex_code_compile() and dregex_code_replace() using a single
+   shared dregex_code and a single output string that is emptied before
+   each entry. Return values are checked against the entry; on expected
+   failures the error text is compared, otherwise the produced string is
+   compared with the entry's result. */
+static void run_replace_tests(const struct test_case *cases)
+{
+       struct dregex_code *re = dregex_code_create();
+       string_t *dest = t_str_new(32);
+       unsigned int idx;
+
+       for (idx = 0; cases[idx].pattern != NULL; idx++) {
+               const struct test_case *test = &cases[idx];
+               const char *error = NULL;
+               int ret;
+
+               str_truncate(dest, 0);
+
+               if (debug) {
+                       i_debug("pattern = %s, subject = %s, "
+                               "replacement = %s, result = %s",
+                               test->pattern, test->subject,
+                               test->replacement, test->result);
+               }
+
+               /* step 1: compile the pattern */
+               ret = dregex_code_compile(re, test->pattern, test->flags,
+                                         &error);
+               test_assert_cmp_idx(test->compile_ret, ==, ret, idx);
+               if (test->compile_ret < 0) {
+                       /* failure was expected, verify only the message */
+                       test_assert_strcmp_idx(test->error, error, idx);
+                       continue;
+               }
+               if (ret < 0) {
+                       error = t_strdup_printf("Unexpected error: %s", error);
+                       test_assert_failed_idx(error, __FILE__, __LINE__, idx);
+                       continue;
+               }
+
+               /* step 2: run the replacement into dest */
+               ret = dregex_code_replace(re, test->subject, test->replacement,
+                                         dest, test->flags, &error);
+               test_assert_cmp_idx(test->match_ret, ==, ret, idx);
+               if (test->match_ret < 0) {
+                       test_assert_strcmp_idx(test->error, error, idx);
+               } else if (ret < 0) {
+                       error = t_strdup_printf("Unexpected error: %s", error);
+                       test_assert_failed_idx(error, __FILE__, __LINE__, idx);
+               } else {
+                       /* step 3: only now is the output meaningful */
+                       test_assert_strcmp_idx(test->result, str_c(dest), idx);
+               }
+       }
+       dregex_code_free(&re);
+}
+
+/* Builds a struct test_case for run_replace_tests(). flags is not listed,
+   so it is left zero-initialized. */
+#define REP_CASE_FULL(pat, sub, rep, res, err, cret, mret) \
+       { \
+               .pattern = (pat), \
+               .subject = (sub), \
+               .replacement = (rep), \
+               .result = (res), \
+               .error = (err), \
+               .compile_ret = (cret), \
+               .match_ret = (mret) \
+       }
+/* Shorthand for the common case: compile returns 0, replace returns 1 and
+   no error text is expected. */
+#define REP_CASE(pattern, subject, replacement, result) \
+       REP_CASE_FULL(pattern, subject, replacement, result, NULL, 0, 1)
+/* Table terminator: run_replace_tests() stops at the first NULL pattern. */
+#define REP_CASE_END { .pattern = NULL }
+
+/* Table-driven tests for replacing. Every entry is executed by
+   run_replace_tests(); see REP_CASE_FULL() for the meaning of the
+   positional arguments. */
+static void test_dregex_replace(void)
+{
+       const struct test_case cases[] = {
+               /* simple replacement */
+               REP_CASE(".*", "hello world", "world hello", "world hello"),
+               /* simple swap */
+               REP_CASE("(.*) (.*)", "hello world", "$2 $1", "world hello"),
+               /* $0 refers to the whole match, so the text is unchanged */
+               REP_CASE("hello .*", "hello world", "$0", "hello world"),
+               /* simple utf-8 test,
+                * '<U+1F600> <U+1F60A>' to '<U+1F60A> <U+1F600>'.
+                * These are 4-byte UTF-8 sequences: U+1F600 is
+                * f0 9f 98 80 and U+1F60A is f0 9f 98 8a. */
+               REP_CASE(
+                       "(.*) (.*)",
+                       "\xf0\x9f\x98\x80 \xf0\x9f\x98\x8a",
+                       "$2 $1",
+                       "\xf0\x9f\x98\x8a \xf0\x9f\x98\x80"
+               ),
+               /* Invalid back reference */
+               REP_CASE_FULL(
+                       "hello .*",
+                       "hello world",
+                       "$5",
+                       "",
+                       "unknown substring",
+                       0,
+                       -1
+               ),
+               REP_CASE_END
+       };
+
+       test_begin("replacing");
+
+       run_replace_tests(cases);
+
+       test_end();
+}
+
+/* Test binary entry point: hands the NULL-terminated list of test
+   functions to test_run() and returns its result. */
+int main(void)
+{
+       void (*const test_functions[])(void) = {
+               test_dregex_match,
+               test_dregex_replace,
+               NULL
+       };
+       int ret = test_run(test_functions);
+
+       return ret;
+}
+
+#else
+
+/* HAVE_LIBPCRE is not defined: there is nothing to test, so just report
+   success. */
+int main(void)
+{
+       return 0;
+}
+
+#endif
index cc1b7d71e5662dcc1e248701b8abcdf96601ae27..1d5220222eeeaa97fdbad6dd1bc5ff676b8d4875 100644 (file)
@@ -133,6 +133,8 @@ event_call_callbacks(struct event *event, enum event_callback_type type,
 
        if (event->disable_callbacks)
                return TRUE;
+       if (!array_is_created(&event_handlers))
+               return TRUE;
 
        array_foreach_elem(&event_handlers, callback) {
                bool ret;