git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
[Test] Use real corpus and filter small files
author Vsevolod Stakhov <vsevolod@rspamd.com>
Fri, 17 Oct 2025 14:49:38 +0000 (15:49 +0100)
committer Vsevolod Stakhov <vsevolod@rspamd.com>
Fri, 17 Oct 2025 14:49:38 +0000 (15:49 +0100)
- Mount data/corpus in docker instead of functional/messages
- Filter emails by minimum size (200 bytes) for adequate tokens
- Remove CORPUS_DIR override in workflow (auto-detected)

.github/workflows/integration-test.yml
test/integration/docker-compose.yml
test/integration/scripts/integration-test.sh

index a96c01d1449c0be22862acb5b3428c3dbbcd4a02..4b0c0b893a3759ae4bdda617a864e572adb01077 100644 (file)
@@ -220,19 +220,18 @@ jobs:
           export PROXY_PORT=50004
           export PASSWORD=q1
           export TEST_PROXY=true
-          export CORPUS_DIR=data/corpus
 
           # Verify corpus exists
-          if [ ! -d "$CORPUS_DIR/spam" ] || [ ! -d "$CORPUS_DIR/ham" ]; then
+          if [ ! -d "data/corpus/spam" ] || [ ! -d "data/corpus/ham" ]; then
             echo "ERROR: Corpus directories not found"
-            echo "Expected: $CORPUS_DIR/spam and $CORPUS_DIR/ham"
+            echo "Expected: data/corpus/spam and data/corpus/ham"
             ls -la data/
             ls -la data/corpus/ || true
             exit 1
           fi
 
-          echo "Using corpus: $CORPUS_DIR"
-          ls -lh "$CORPUS_DIR"
+          echo "Corpus downloaded successfully:"
+          ls -lh data/corpus/
 
           ./scripts/integration-test.sh
 
index 4109cd808c83b4b2b0e228f2ee438d73c7144e23..63a4a064e98674fe6391796351f37c530df7aa65 100644 (file)
@@ -33,7 +33,7 @@ services:
       - ./configs/statistic.conf:/etc/rspamd/local.d/statistic.conf:ro
       - ./configs/lsan.supp:/etc/rspamd/lsan.supp:ro
       - ./data:/data
-      - ../functional/messages:/corpus:ro
+      - ./data/corpus:/corpus:ro
       - rspamd-db:/var/lib/rspamd
     env_file:
       - .env.keys
index c8c7806758028592651a164053686f0fee076ee8..076e236afd9fbef6e3fab7b80e8460181a2db954 100755 (executable)
@@ -54,17 +54,20 @@ fi
 echo "✓ Rspamd is running"
 echo ""
 
-# Find all email files
+# Find all email files, filtering by size (min 200 bytes to have enough tokens)
 echo "Finding email files in $CORPUS_DIR..."
-EMAIL_FILES=($(find "$CORPUS_DIR" -type f \( -name "*.eml" -o -name "*.msg" -o -name "*.txt" \)))
+MIN_SIZE=200  # bytes, roughly 11+ tokens for Bayes
+
+# Find files with minimum size
+mapfile -t EMAIL_FILES < <(find "$CORPUS_DIR" -type f -size +${MIN_SIZE}c \( -name "*.eml" -o -name "*.msg" -o -name "*.txt" \))
 TOTAL_EMAILS=${#EMAIL_FILES[@]}
 
 if [ $TOTAL_EMAILS -eq 0 ]; then
-    echo "ERROR: No email files found in $CORPUS_DIR"
+    echo "ERROR: No email files found in $CORPUS_DIR (min size: $MIN_SIZE bytes)"
     exit 1
 fi
 
-echo "Found $TOTAL_EMAILS email files"
+echo "Found $TOTAL_EMAILS email files (filtered by min size: $MIN_SIZE bytes)"
 echo ""
 
 # Calculate split sizes (using bash arithmetic)