export PROXY_PORT=50004
export PASSWORD=q1
export TEST_PROXY=true
- export CORPUS_DIR=data/corpus
# Verify that both corpus subdirectories (spam and ham) exist before running the tests
- if [ ! -d "$CORPUS_DIR/spam" ] || [ ! -d "$CORPUS_DIR/ham" ]; then
+ if [ ! -d "data/corpus/spam" ] || [ ! -d "data/corpus/ham" ]; then
echo "ERROR: Corpus directories not found"
- echo "Expected: $CORPUS_DIR/spam and $CORPUS_DIR/ham"
+ echo "Expected: data/corpus/spam and data/corpus/ham"
ls -la data/
ls -la data/corpus/ || true
exit 1
fi
- echo "Using corpus: $CORPUS_DIR"
- ls -lh "$CORPUS_DIR"
+ echo "Corpus downloaded successfully:"
+ ls -lh data/corpus/
./scripts/integration-test.sh
- ./configs/statistic.conf:/etc/rspamd/local.d/statistic.conf:ro
- ./configs/lsan.supp:/etc/rspamd/lsan.supp:ro
- ./data:/data
- - ../functional/messages:/corpus:ro
+ - ./data/corpus:/corpus:ro
- rspamd-db:/var/lib/rspamd
env_file:
- .env.keys
echo "✓ Rspamd is running"
echo ""
-# Find all email files
+# Find all email files, keeping only those larger than 200 bytes (so each has enough tokens)
echo "Finding email files in $CORPUS_DIR..."
-EMAIL_FILES=($(find "$CORPUS_DIR" -type f \( -name "*.eml" -o -name "*.msg" -o -name "*.txt" \)))
+MIN_SIZE=200 # bytes, roughly 11+ tokens for Bayes
+
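+# Collect matching files strictly larger than MIN_SIZE bytes, one path per array element
+# (find's "-size +Nc" excludes files of exactly N bytes)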
+mapfile -t EMAIL_FILES < <(find "$CORPUS_DIR" -type f -size +${MIN_SIZE}c \( -name "*.eml" -o -name "*.msg" -o -name "*.txt" \))
TOTAL_EMAILS=${#EMAIL_FILES[@]}
if [ $TOTAL_EMAILS -eq 0 ]; then
- echo "ERROR: No email files found in $CORPUS_DIR"
+ echo "ERROR: No email files found in $CORPUS_DIR (min size: $MIN_SIZE bytes)"
exit 1
fi
-echo "Found $TOTAL_EMAILS email files"
+echo "Found $TOTAL_EMAILS email files (filtered by min size: $MIN_SIZE bytes)"
echo ""
# Calculate split sizes (using bash arithmetic)