git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
[Project] Create an isolated API for external tokenizers
author Vsevolod Stakhov <vsevolod@rspamd.com>
Fri, 20 Jun 2025 15:50:49 +0000 (16:50 +0100)
committer Vsevolod Stakhov <vsevolod@rspamd.com>
Fri, 20 Jun 2025 15:50:49 +0000 (16:50 +0100)
src/libmime/message.c
src/libstat/tokenizers/custom_tokenizer.h
src/libstat/tokenizers/rspamd_tokenizer_types.h [new file with mode: 0644]

index bac67fb079bbe7d46f610af032d195dfc7bf4885..8442c80ac82a5597034fb4bbad3f57ac982f4f8a 100644 (file)
@@ -40,6 +40,7 @@
 #include "contrib/uthash/utlist.h"
 #include "contrib/t1ha/t1ha.h"
 #include "received.h"
+#define RSPAMD_TOKENIZER_INTERNAL
 #include "libstat/tokenizers/custom_tokenizer.h"
 
 #define GTUBE_SYMBOL "GTUBE"
index addf089200562812014714c56263f12e46debd87..bc173a1da4c2eb40ac5b659aa624b95d212a9aa6 100644 (file)
 #ifndef RSPAMD_CUSTOM_TOKENIZER_H
 #define RSPAMD_CUSTOM_TOKENIZER_H
 
+/* Check if we're being included by internal Rspamd code or external plugins */
+#ifdef RSPAMD_TOKENIZER_INTERNAL
+/* Internal Rspamd usage - use the full headers */
 #include "config.h"
 #include "ucl.h"
 #include "libserver/word.h"
+#else
+/* External plugin usage - use standalone types */
+#include "rspamd_tokenizer_types.h"
+/* Forward declaration for UCL object - plugins should include ucl.h if needed */
+typedef struct ucl_object_s ucl_object_t;
+#endif
 
 #ifdef __cplusplus
 extern "C" {
@@ -28,8 +37,7 @@ extern "C" {
 #define RSPAMD_CUSTOM_TOKENIZER_API_VERSION 1
 
 /**
- * Tokenization result - kvec of rspamd_word_t
- * Uses kvec to avoid exposing GLIB structures to external API
+ * Tokenization result - compatible with both internal and external usage
  */
 typedef rspamd_words_t rspamd_tokenizer_result_t;
 
diff --git a/src/libstat/tokenizers/rspamd_tokenizer_types.h b/src/libstat/tokenizers/rspamd_tokenizer_types.h
new file mode 100644 (file)
index 0000000..eb85182
--- /dev/null
@@ -0,0 +1,89 @@
+/*
+ * Copyright 2025 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef RSPAMD_TOKENIZER_TYPES_H
+#define RSPAMD_TOKENIZER_TYPES_H
+
+/*
+ * Standalone type definitions for custom tokenizers.
+ * This header is self-contained: it depends only on the standard <stdint.h> and <stddef.h> headers.
+ * Custom tokenizers should include only this header to get access to all necessary types.
+ */
+
+#include <stdint.h>
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Basic string token structure: a (length, pointer) pair over read-only char data
+ */
+typedef struct rspamd_ftok {
+       size_t len; /* presumably the length of the data at `begin`, counted in chars - confirm */
+       const char *begin; /* start of the token data; const, so not writable through this pointer */
+} rspamd_ftok_t;
+
+/**
+ * Unicode string token structure: a (length, pointer) pair over read-only uint32_t data
+ */
+typedef struct rspamd_ftok_unicode {
+       size_t len; /* presumably the number of uint32_t elements at `begin`, not bytes - confirm */
+       const uint32_t *begin; /* start of the token data; const, so not writable through this pointer */
+} rspamd_ftok_unicode_t;
+
+/* Word flags: distinct single-bit unsigned masks (bits 0-13); presumably OR-ed into rspamd_word_t.flags - confirm */
+#define RSPAMD_WORD_FLAG_TEXT (1u << 0u)
+#define RSPAMD_WORD_FLAG_META (1u << 1u)
+#define RSPAMD_WORD_FLAG_LUA_META (1u << 2u)
+#define RSPAMD_WORD_FLAG_EXCEPTION (1u << 3u)
+#define RSPAMD_WORD_FLAG_HEADER (1u << 4u)
+#define RSPAMD_WORD_FLAG_UNIGRAM (1u << 5u)
+#define RSPAMD_WORD_FLAG_UTF (1u << 6u)
+#define RSPAMD_WORD_FLAG_NORMALISED (1u << 7u)
+#define RSPAMD_WORD_FLAG_STEMMED (1u << 8u)
+#define RSPAMD_WORD_FLAG_BROKEN_UNICODE (1u << 9u)
+#define RSPAMD_WORD_FLAG_STOP_WORD (1u << 10u)
+#define RSPAMD_WORD_FLAG_SKIPPED (1u << 11u)
+#define RSPAMD_WORD_FLAG_INVISIBLE_SPACES (1u << 12u)
+#define RSPAMD_WORD_FLAG_EMOJI (1u << 13u)
+
+/**
+ * Word structure: three char tokens, one uint32_t token and a flags field
+ */
+typedef struct rspamd_word {
+       rspamd_ftok_t original; /* char token; presumably the word as found in the input - confirm */
+       rspamd_ftok_unicode_t unicode; /* uint32_t-based token (see rspamd_ftok_unicode_t) */
+       rspamd_ftok_t normalized; /* char token; presumably the normalised form of the word - confirm */
+       rspamd_ftok_t stemmed; /* char token; presumably the stemmed form of the word - confirm */
+       unsigned int flags; /* presumably a bitwise OR of RSPAMD_WORD_FLAG_* values - confirm */
+} rspamd_word_t;
+
+/**
+ * Array of words: an element pointer plus two size_t counters
+ */
+typedef struct rspamd_words {
+       rspamd_word_t *a; /* pointer to the rspamd_word_t elements */
+       size_t n; /* NOTE(review): looks like a kvec-style layout where n is the used element count - confirm */
+       size_t m; /* NOTE(review): presumably the allocated capacity, in elements - confirm */
+} rspamd_words_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* RSPAMD_TOKENIZER_TYPES_H */