From: Vsevolod Stakhov Date: Fri, 17 Oct 2025 14:49:38 +0000 (+0100) Subject: [Test] Use real corpus and filter small files X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=2e6907dc040e58b5586692271dd827174b8aa475;p=thirdparty%2Frspamd.git [Test] Use real corpus and filter small files - Mount data/corpus in docker instead of functional/messages - Filter emails by minimum size (200 bytes) for adequate tokens - Remove CORPUS_DIR override in workflow (auto-detected) --- diff --git a/.github/workflows/integration-test.yml b/.github/workflows/integration-test.yml index a96c01d144..4b0c0b893a 100644 --- a/.github/workflows/integration-test.yml +++ b/.github/workflows/integration-test.yml @@ -220,19 +220,18 @@ jobs: export PROXY_PORT=50004 export PASSWORD=q1 export TEST_PROXY=true - export CORPUS_DIR=data/corpus # Verify corpus exists - if [ ! -d "$CORPUS_DIR/spam" ] || [ ! -d "$CORPUS_DIR/ham" ]; then + if [ ! -d "data/corpus/spam" ] || [ ! -d "data/corpus/ham" ]; then echo "ERROR: Corpus directories not found" - echo "Expected: $CORPUS_DIR/spam and $CORPUS_DIR/ham" + echo "Expected: data/corpus/spam and data/corpus/ham" ls -la data/ ls -la data/corpus/ || true exit 1 fi - echo "Using corpus: $CORPUS_DIR" - ls -lh "$CORPUS_DIR" + echo "Corpus downloaded successfully:" + ls -lh data/corpus/ ./scripts/integration-test.sh diff --git a/test/integration/docker-compose.yml b/test/integration/docker-compose.yml index 4109cd808c..63a4a064e9 100644 --- a/test/integration/docker-compose.yml +++ b/test/integration/docker-compose.yml @@ -33,7 +33,7 @@ services: - ./configs/statistic.conf:/etc/rspamd/local.d/statistic.conf:ro - ./configs/lsan.supp:/etc/rspamd/lsan.supp:ro - ./data:/data - - ../functional/messages:/corpus:ro + - ./data/corpus:/corpus:ro - rspamd-db:/var/lib/rspamd env_file: - .env.keys diff --git a/test/integration/scripts/integration-test.sh b/test/integration/scripts/integration-test.sh index c8c7806758..076e236afd 100755 --- a/test/integration/scripts/integration-test.sh +++ b/test/integration/scripts/integration-test.sh @@ -54,17 +54,20 @@ fi echo "✓ Rspamd is running" echo "" -# Find all email files +# Find all email files, filtering by size (min 200 bytes to have enough tokens) echo "Finding email files in $CORPUS_DIR..." -EMAIL_FILES=($(find "$CORPUS_DIR" -type f \( -name "*.eml" -o -name "*.msg" -o -name "*.txt" \))) +MIN_SIZE=200 # bytes, roughly 11+ tokens for Bayes + +# Find files with minimum size +mapfile -t EMAIL_FILES < <(find "$CORPUS_DIR" -type f -size +${MIN_SIZE}c \( -name "*.eml" -o -name "*.msg" -o -name "*.txt" \)) TOTAL_EMAILS=${#EMAIL_FILES[@]} if [ $TOTAL_EMAILS -eq 0 ]; then - echo "ERROR: No email files found in $CORPUS_DIR" + echo "ERROR: No email files found in $CORPUS_DIR (min size: $MIN_SIZE bytes)" exit 1 fi -echo "Found $TOTAL_EMAILS email files" +echo "Found $TOTAL_EMAILS email files (filtered by min size: $MIN_SIZE bytes)" echo "" # Calculate split sizes (using bash arithmetic)