--- /dev/null
+name: Dead Domain Cleanup
+
+on:
+ workflow_dispatch:
+ inputs:
+ sample_size:
+ description: 'Domains to sample per list'
+ required: false
+ default: '500'
+ create_issue:
+ description: 'Create issue with results'
+ required: false
+ type: boolean
+ default: true
+ schedule:
+ # Monthly on the 1st at 3 AM UTC
+ - cron: '0 3 1 * *'
+
+permissions:
+ contents: read
+ issues: write
+
+jobs:
+ check-dead-domains:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v4
+
+ - name: Set up Python
+ uses: actions/setup-python@v5
+ with:
+ python-version: '3.13'
+ cache: 'pip'
+
+ - name: Install dependencies
+ run: |
+ python -m pip install --upgrade pip
+ pip install -e .
+
+ - name: Check for dead domains
+ id: check
+ run: |
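+          # inputs are empty on scheduled runs, so fall back to the default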
+ SAMPLE_SIZE="${{ github.event.inputs.sample_size || '500' }}"
+
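+          # NB: piping through tee masks the script's exit code (the default
+          # shell here sets -e but not pipefail), so a >50% dead rate does not
+          # fail this step; the issue created below is the signal instead.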
+ python scripts/check-dead-domains.py \
+ --all \
+            --sample "$SAMPLE_SIZE" \
+ --output dead-domains.txt \
+ --verbose \
+ 2>&1 | tee check-output.txt
+
+ # Extract summary for issue
+ DEAD_COUNT=$(grep -oP 'Total dead domains: \K[\d,]+' check-output.txt || echo "0")
+ CHECKED_COUNT=$(grep -oP 'Total domains checked: \K[\d,]+' check-output.txt || echo "0")
+
+ echo "dead_count=$DEAD_COUNT" >> $GITHUB_OUTPUT
+ echo "checked_count=$CHECKED_COUNT" >> $GITHUB_OUTPUT
+
+ - name: Upload results
+ uses: actions/upload-artifact@v4
+ with:
+ name: dead-domain-report
+ path: |
+ dead-domains.txt
+ check-output.txt
+ retention-days: 90
+
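+      # On scheduled runs inputs are empty, so create_issue ('') != 'false'
+      # and an issue is created whenever dead domains were found.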
+ - name: Create issue with results
+ if: ${{ github.event.inputs.create_issue != 'false' && steps.check.outputs.dead_count != '0' }}
+ uses: actions/github-script@v7
+ with:
+ script: |
+ const fs = require('fs');
+
+ const deadDomains = fs.readFileSync('dead-domains.txt', 'utf8');
+ const output = fs.readFileSync('check-output.txt', 'utf8');
+
+ // Extract summary section
+ const summaryMatch = output.match(/SUMMARY[\s\S]*$/);
+ const summary = summaryMatch ? summaryMatch[0] : 'See attached artifacts.';
+
+ const deadCount = '${{ steps.check.outputs.dead_count }}';
+ const checkedCount = '${{ steps.check.outputs.checked_count }}';
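+            // the expressions above were expanded by Actions before this
+            // script ran, so deadCount/checkedCount are plain string literals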
+
+ const body = `## Dead Domain Report
+
+ **Date:** ${new Date().toISOString().split('T')[0]}
+ **Domains Checked:** ${checkedCount}
+ **Dead Domains Found:** ${deadCount}
+
+ ### Summary
+
+ \`\`\`
+ ${summary}
+ \`\`\`
+
+ ### Dead Domains Sample
+
+ <details>
+<summary>Click to expand (first 100 lines)</summary>
+
+ \`\`\`
+ ${deadDomains.split('\n').slice(0, 100).join('\n')}
+ \`\`\`
+
+ </details>
+
+ ### Action Items
+
+ - [ ] Review dead domains for potential removal
+ - [ ] Check if domains are intentionally parked/blocked
+ - [ ] Update lists as needed
+
+ ---
+ *This issue was automatically generated by the Dead Domain Cleanup workflow.*
+ `;
+
+ await github.rest.issues.create({
+ owner: context.repo.owner,
+ repo: context.repo.repo,
+ title: `[Maintenance] Dead Domain Report - ${new Date().toISOString().split('T')[0]}`,
+ body: body,
+ labels: ['maintenance', 'automated']
+ });
--- /dev/null
+name: Release
+
+on:
+ workflow_dispatch:
+ inputs:
+ version:
+ description: 'Version tag (e.g., v2.1.0)'
+ required: true
+ type: string
+ prerelease:
+ description: 'Mark as pre-release'
+ required: false
+ type: boolean
+ default: false
+ schedule:
+ # Weekly release every Monday at 6 AM UTC
+ - cron: '0 6 * * 1'
+
+permissions:
+ contents: write
+
+jobs:
+ release:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v4
+ with:
+ fetch-depth: 0 # Full history for changelog
+
+ - name: Set up Python
+ uses: actions/setup-python@v5
+ with:
+ python-version: '3.13'
+ cache: 'pip'
+
+ - name: Install dependencies
+ run: |
+ python -m pip install --upgrade pip
+ pip install -e .
+
+ - name: Determine version
+ id: version
+ run: |
+ if [ -n "${{ github.event.inputs.version }}" ]; then
+ VERSION="${{ github.event.inputs.version }}"
+ else
+ # Auto-generate version for scheduled releases
+ VERSION="v$(date +%Y.%m.%d)"
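+            # NB: reruns on the same day produce the same date-based tag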
+ fi
+ echo "version=$VERSION" >> $GITHUB_OUTPUT
+ echo "Version: $VERSION"
+
+ - name: Get previous release tag
+ id: previous
+ run: |
+ PREV_TAG=$(git describe --tags --abbrev=0 2>/dev/null || echo "")
+ echo "tag=$PREV_TAG" >> $GITHUB_OUTPUT
+ echo "Previous tag: $PREV_TAG"
+
+ - name: Generate changelog
+ id: changelog
+ run: |
+ python scripts/generate-changelog.py \
+ --since "${{ steps.previous.outputs.tag }}" \
+ --output changelog.md
+
+ echo "## Changelog" >> $GITHUB_STEP_SUMMARY
+ cat changelog.md >> $GITHUB_STEP_SUMMARY
+
+ - name: Build all lists
+ run: python build.py --validate
+
+ - name: Generate statistics
+ run: |
+ python build.py stats > release-stats.txt
+ echo "" >> changelog.md
+ echo "## Statistics" >> changelog.md
+ cat release-stats.txt >> changelog.md
+
+ - name: Create release archive
+ run: |
+ # Create archives for easy download
+ tar -czvf blocklists-all.tar.gz *.txt adguard/ alt-version/ dnsmasq-version/
+ zip -r blocklists-all.zip *.txt adguard/ alt-version/ dnsmasq-version/
+
+ - name: Create GitHub Release
+ uses: softprops/action-gh-release@v1
+ with:
+ tag_name: ${{ steps.version.outputs.version }}
+ name: Block List Project ${{ steps.version.outputs.version }}
+ body_path: changelog.md
+ prerelease: ${{ github.event.inputs.prerelease || false }}
+ files: |
+ blocklists-all.tar.gz
+ blocklists-all.zip
+ release-stats.txt
+ env:
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
--- /dev/null
+++ b/STATS.md
+# Block List Project Statistics
+
+*Last updated: 2025-12-19 17:17 UTC*
+
+## Summary
+
+| Metric | Value |
+|--------|------:|
+| Total Lists | 25 |
+| Unique Domains | 1,709,017 |
+| Total Entries (with overlap) | 2,224,422 |
+| Unique TLDs | 921 |
+
+## Domains by List
+
+| List | Domains | % of Total |
+|------|--------:|-----------:|
+| porn | 500,282 | 29.3% |
+| malware | 435,220 | 25.5% |
+| abuse | 435,155 | 25.5% |
+| fraud | 196,082 | 11.5% |
+| phishing | 190,222 | 11.1% |
+| ads | 154,554 | 9.0% |
+| redirect | 108,684 | 6.4% |
+| basic | 76,459 | 4.5% |
+| drugs | 26,031 | 1.5% |
+| youtube | 24,280 | 1.4% |
+| crypto | 23,761 | 1.4% |
+| facebook | 22,459 | 1.3% |
+| tracking | 15,070 | 0.9% |
+| tiktok | 3,699 | 0.2% |
+| torrent | 2,624 | 0.2% |
+| gambling | 2,500 | 0.1% |
+| piracy | 2,153 | 0.1% |
+| ransomware | 1,904 | 0.1% |
+| scam | 1,274 | 0.1% |
+| twitter | 1,193 | 0.1% |
+| adobe | 399 | 0.0% |
+| whatsapp | 226 | 0.0% |
+| vaping | 108 | 0.0% |
+| smart-tv | 78 | 0.0% |
+| fortnite | 5 | 0.0% |
+
+## Top TLDs Blocked
+
+| TLD | Count | % |
+|-----|------:|--:|
+| .com | 897,810 | 52.5% |
+| .net | 119,867 | 7.0% |
+| .stream | 80,114 | 4.7% |
+| .tk | 58,200 | 3.4% |
+| .info | 50,864 | 3.0% |
+| .org | 35,310 | 2.1% |
+| .ru | 32,227 | 1.9% |
+| .nl | 25,409 | 1.5% |
+| .de | 22,011 | 1.3% |
+| .pl | 16,111 | 0.9% |
+| .icu | 14,404 | 0.8% |
+| .br | 13,625 | 0.8% |
+| .cc | 13,612 | 0.8% |
+| .uk | 13,289 | 0.8% |
+| .top | 12,820 | 0.8% |
+| .us | 11,347 | 0.7% |
+| .win | 10,907 | 0.6% |
+| .fr | 10,467 | 0.6% |
+| .xyz | 9,722 | 0.6% |
+| .in | 8,983 | 0.5% |
+
+## Recent History (ads.txt as sample)
+
+| Date | Commit | Domains |
+|------|--------|--------:|
+| 2025-12-19 | 5945121 | 154,554 |
+| 2025-05-18 | 5fb69c9 | 154,554 |
+| 2025-05-18 | d2e66c5 | 154,554 |
+| 2025-05-18 | c208ed4 | 154,554 |
+| 2025-05-18 | f8e46fd | 154,554 |
+| 2025-05-18 | d6715e1 | 154,554 |
+| 2025-05-18 | c7cdeef | 154,554 |
+| 2025-05-18 | 3a43854 | 154,554 |
+| 2024-11-05 | c6c26be | 154,554 |
+| 2024-11-05 | 4fbe4d2 | 154,554 |
+
+## List Overlap
+
+Many domains appear in multiple lists. The table below shows how many domains appear in exactly N lists:
+
+| Appears in N Lists | Domains |
+|-------------------:|--------:|
+| 1 | 1,249,256 |
+| 2 | 417,760 |
+| 3 | 29,028 |
+| 4 | 12,323 |
+| 5 | 630 |
+| 6 | 20 |
+
+---
+
+*Generated by `scripts/generate-stats.py`*
--- /dev/null
+++ b/scripts/check-dead-domains.py
+#!/usr/bin/env python3
+"""Check for dead domains that no longer resolve.
+
+This script checks if blocked domains still have DNS records.
+Domains that don't resolve are candidates for removal.
+
+Usage:
+ python scripts/check-dead-domains.py --list ads --sample 1000
+ python scripts/check-dead-domains.py --all --sample 500 --output dead-domains.txt
+"""
+
+import argparse
+import random
+import socket
+import sys
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from datetime import datetime
+from pathlib import Path
+from typing import NamedTuple
+
+# Import from the repo root; add it to sys.path when the script is run directly
+try:
+ from src.normalize import parse_file_to_set
+except ImportError:
+ sys.path.insert(0, str(Path(__file__).parent.parent))
+ from src.normalize import parse_file_to_set
+
+
+class DomainCheckResult(NamedTuple):
+ """Result of checking a domain."""
+ domain: str
+ resolves: bool
+ error: str | None = None
+
+
+def check_domain_resolves(domain: str, timeout: float = 2.0) -> DomainCheckResult:
+ """Check if a domain has any DNS records.
+
+ Args:
+ domain: Domain name to check
+ timeout: Socket timeout in seconds
+
+ Returns:
+ DomainCheckResult with resolution status
+ """
+    socket.setdefaulttimeout(timeout)
+
+    try:
+        # Try to resolve the domain (A record). Note that gethostbyname() goes
+        # through the OS resolver, which applies its own timeout; the value set
+        # above only bounds socket objects, so `timeout` is best-effort here.
+        socket.gethostbyname(domain)
+        return DomainCheckResult(domain=domain, resolves=True)
+    except socket.gaierror as e:
+        # Name resolution failed. This also covers transient errors such as
+        # EAI_AGAIN, so a domain reported dead here may deserve a re-check.
+        return DomainCheckResult(domain=domain, resolves=False, error=str(e))
+ except socket.timeout:
+ return DomainCheckResult(domain=domain, resolves=False, error="timeout")
+ except Exception as e:
+ return DomainCheckResult(domain=domain, resolves=False, error=str(e))
+
+
+def check_domains_parallel(
+ domains: list[str],
+ max_workers: int = 50,
+ timeout: float = 2.0,
+ progress_callback=None,
+) -> list[DomainCheckResult]:
+ """Check multiple domains in parallel.
+
+ Args:
+ domains: List of domains to check
+ max_workers: Number of parallel threads
+ timeout: Socket timeout per domain
+ progress_callback: Optional callback for progress updates
+
+ Returns:
+ List of DomainCheckResult objects
+ """
+ results: list[DomainCheckResult] = []
+ checked = 0
+ total = len(domains)
+
+ with ThreadPoolExecutor(max_workers=max_workers) as executor:
+ futures = {
+ executor.submit(check_domain_resolves, domain, timeout): domain
+ for domain in domains
+ }
+
+ for future in as_completed(futures):
+ result = future.result()
+ results.append(result)
+ checked += 1
+
+ if progress_callback and checked % 100 == 0:
+ progress_callback(checked, total)
+
+ return results
+
+
+def load_domains_from_list(list_name: str) -> set[str]:
+ """Load domains from a blocklist file."""
+ root = Path(".")
+
+ # Try root .txt file
+ txt_path = root / f"{list_name}.txt"
+ if txt_path.exists():
+ return parse_file_to_set(txt_path)
+
+ return set()
+
+
+def get_all_list_names() -> list[str]:
+ """Get names of all blocklists."""
+ root = Path(".")
+ names = []
+
+    for txt_file in root.glob("*.txt"):
+        # Skip the aggregated everything.txt and dotfiles. README.md and
+        # LICENSE can never match the *.txt glob, so no explicit check needed.
+        if txt_file.name == "everything.txt" or txt_file.name.startswith("."):
+            continue
+        names.append(txt_file.stem)
+
+ return sorted(names)
+
+
+def main():
+ parser = argparse.ArgumentParser(
+        description="Find blocked domains that no longer resolve"
+ )
+ parser.add_argument(
+ "--list", "-l",
+ help="Specific list to check (e.g., 'ads', 'malware')"
+ )
+ parser.add_argument(
+ "--all", "-a",
+ action="store_true",
+ help="Check all lists"
+ )
+ parser.add_argument(
+ "--sample", "-s",
+ type=int,
+ default=100,
+ help="Number of domains to sample per list (default: 100)"
+ )
+ parser.add_argument(
+ "--workers", "-w",
+ type=int,
+ default=50,
+ help="Number of parallel workers (default: 50)"
+ )
+ parser.add_argument(
+ "--timeout", "-t",
+ type=float,
+ default=2.0,
+ help="DNS timeout in seconds (default: 2.0)"
+ )
+ parser.add_argument(
+ "--output", "-o",
+ help="Output file for dead domains"
+ )
+ parser.add_argument(
+ "--verbose", "-v",
+ action="store_true",
+ help="Show detailed output"
+ )
+
+ args = parser.parse_args()
+
+ if not args.list and not args.all:
+ parser.error("Either --list or --all is required")
+
+ # Determine which lists to check
+ if args.all:
+ list_names = get_all_list_names()
+ else:
+ list_names = [args.list]
+
+ all_dead_domains: dict[str, list[str]] = {}
+ total_checked = 0
+ total_dead = 0
+
+ for list_name in list_names:
+ print(f"\n{'='*50}")
+ print(f"Checking list: {list_name}")
+ print(f"{'='*50}")
+
+ domains = load_domains_from_list(list_name)
+ if not domains:
+ print(f" No domains found for {list_name}")
+ continue
+
+ print(f" Total domains: {len(domains):,}")
+
+ # Sample domains
+ sample_size = min(args.sample, len(domains))
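+        # Sort to a list first: random.sample() rejects sets on Python 3.11+,
+        # and sorting keeps the sampling population deterministic.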
+ sampled = random.sample(sorted(domains), sample_size)
+ print(f" Sampling: {sample_size:,} domains")
+
+ # Check domains
+ def progress(checked, total):
+ print(f" Progress: {checked}/{total}", end="\r")
+
+ results = check_domains_parallel(
+ sampled,
+ max_workers=args.workers,
+ timeout=args.timeout,
+ progress_callback=progress if args.verbose else None,
+ )
+
+ # Analyze results
+ dead = [r for r in results if not r.resolves]
+ alive = [r for r in results if r.resolves]
+
+ total_checked += len(results)
+ total_dead += len(dead)
+
+ print("\n Results:")
+ print(f" Resolving: {len(alive):,} ({len(alive)/len(results)*100:.1f}%)")
+ print(f" Dead: {len(dead):,} ({len(dead)/len(results)*100:.1f}%)")
+
+ if dead:
+ all_dead_domains[list_name] = [r.domain for r in dead]
+
+ if args.verbose:
+ print("\n Dead domains (sample):")
+ for r in dead[:10]:
+ print(f" - {r.domain}: {r.error}")
+ if len(dead) > 10:
+ print(f" ... and {len(dead) - 10} more")
+
+ # Summary
+ print(f"\n{'='*50}")
+ print("SUMMARY")
+ print(f"{'='*50}")
+    print(f"Total domains checked: {total_checked:,}")
+    if total_checked:
+        print(f"Total dead domains: {total_dead:,} ({total_dead/total_checked*100:.1f}%)")
+    else:
+        print("Total dead domains: 0 (no domains were checked)")
+
+    # The counts come from a sample, not from scanning the full lists
+    if args.sample < 10000:
+        print("\nNote: Based on sampling. Actual dead domain count may vary.")
+
+ # Output dead domains
+ if args.output and all_dead_domains:
+ with open(args.output, "w") as f:
+ f.write("# Dead domains found by check-dead-domains.py\n")
+            f.write(f"# Checked on: {datetime.now().isoformat()}\n")
+ f.write(f"# Sample size per list: {args.sample}\n\n")
+
+ for list_name, dead in sorted(all_dead_domains.items()):
+ f.write(f"\n# List: {list_name} ({len(dead)} dead)\n")
+ for domain in sorted(dead):
+ f.write(f"{domain}\n")
+
+ print(f"\nDead domains written to: {args.output}")
+
+ # Exit with error if significant dead domains found
+ dead_rate = total_dead / total_checked if total_checked > 0 else 0
+ if dead_rate > 0.5:
+ print("\nWarning: Over 50% of sampled domains are dead!")
+ sys.exit(1)
+
+
+if __name__ == "__main__":
+ main()
--- /dev/null
+++ b/scripts/generate-changelog.py
+#!/usr/bin/env python3
+"""Generate changelog showing domains added/removed since last release."""
+
+import argparse
+import subprocess
+from pathlib import Path
+
+
+def get_changed_files(since_tag: str | None) -> list[str]:
+ """Get list of .txt files changed since the given tag."""
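+    # NOTE: defined for ad-hoc use; generate_changelog() below does not call it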
+ if since_tag:
+ cmd = ["git", "diff", "--name-only", since_tag, "HEAD", "--", "*.txt"]
+ else:
+ # No previous tag, consider all files as new
+ cmd = ["git", "ls-files", "*.txt"]
+
+ result = subprocess.run(cmd, capture_output=True, text=True)
+ return [f for f in result.stdout.strip().split("\n") if f and f.endswith(".txt")]
+
+
+def _parse_domains(text: str) -> set[str]:
+    """Extract domains from blocklist text (hosts or domain-only format)."""
+    domains = set()
+    for line in text.split("\n"):
+        line = line.strip()
+        if not line or line.startswith("#") or line.startswith("!"):
+            continue
+
+        # Handle hosts format
+        if line.startswith("0.0.0.0 ") or line.startswith("127.0.0.1 "):
+            parts = line.split()
+            if len(parts) >= 2:
+                domains.add(parts[1].lower())
+        # Handle domain-only format
+        elif "." in line and not line.startswith("server="):
+            domains.add(line.lower())
+
+    return domains
+
+
+def get_file_at_revision(filepath: str, revision: str) -> set[str]:
+    """Get domains from a file at a specific git revision."""
+    cmd = ["git", "show", f"{revision}:{filepath}"]
+    result = subprocess.run(cmd, capture_output=True, text=True)
+
+    if result.returncode != 0:
+        return set()
+
+    return _parse_domains(result.stdout)
+
+
+def get_current_domains(filepath: str) -> set[str]:
+    """Get domains from a file in the working directory."""
+    path = Path(filepath)
+    if not path.exists():
+        return set()
+
+    return _parse_domains(path.read_text(encoding="utf-8"))
+
+
+def generate_changelog(since_tag: str | None, output_path: str) -> None:
+ """Generate a changelog showing domains added/removed."""
+    # Only look at root .txt files (the canonical source); skip the aggregated
+    # everything.txt so its entries aren't double-counted in the totals
+    root_txt_files = [
+        f for f in Path(".").glob("*.txt")
+        if f.name != "everything.txt" and not f.name.startswith(".")
+    ]
+
+ total_added = 0
+ total_removed = 0
+ changes_by_list: dict[str, dict] = {}
+
+ for txt_file in sorted(root_txt_files):
+ list_name = txt_file.stem
+ filepath = str(txt_file)
+
+ current = get_current_domains(filepath)
+
+ if since_tag:
+ previous = get_file_at_revision(filepath, since_tag)
+ else:
+ previous = set()
+
+ added = current - previous
+ removed = previous - current
+
+ if added or removed:
+ changes_by_list[list_name] = {
+ "added": len(added),
+ "removed": len(removed),
+ "total": len(current),
+ "added_examples": sorted(added)[:5],
+ "removed_examples": sorted(removed)[:5],
+ }
+ total_added += len(added)
+ total_removed += len(removed)
+
+ # Write changelog
+ with open(output_path, "w", encoding="utf-8") as f:
+ if since_tag:
+ f.write(f"Changes since {since_tag}\n\n")
+ else:
+ f.write("Initial release\n\n")
+
+ f.write(f"**Summary:** +{total_added:,} added, -{total_removed:,} removed\n\n")
+
+ if changes_by_list:
+ f.write("### Changes by List\n\n")
+ f.write("| List | Added | Removed | Total |\n")
+ f.write("|------|------:|--------:|------:|\n")
+
+ for name, data in sorted(changes_by_list.items()):
+ f.write(f"| {name} | +{data['added']:,} | -{data['removed']:,} | {data['total']:,} |\n")
+
+ f.write("\n")
+
+ # Show example domains for significant changes
+ f.write("### Notable Changes\n\n")
+ for name, data in sorted(changes_by_list.items(), key=lambda x: x[1]["added"], reverse=True)[:5]:
+ if data["added"] > 0:
+ examples = ", ".join(f"`{d}`" for d in data["added_examples"][:3])
+ f.write(f"**{name}**: +{data['added']:,} domains (e.g., {examples})\n\n")
+ else:
+ f.write("No changes to blocklists.\n")
+
+ print(f"Changelog written to {output_path}")
+ print(f"Total: +{total_added:,} added, -{total_removed:,} removed")
+
+
+def main():
+ parser = argparse.ArgumentParser(description="Generate release changelog")
+ parser.add_argument("--since", help="Previous git tag to compare against")
+ parser.add_argument("--output", default="changelog.md", help="Output file path")
+
+ args = parser.parse_args()
+
+ generate_changelog(args.since, args.output)
+
+
+if __name__ == "__main__":
+ main()
--- /dev/null
+++ b/scripts/generate-stats.py
+#!/usr/bin/env python3
+"""Generate statistics dashboard for the blocklist project.
+
+Creates a markdown stats page with:
+- Domain count per list and total
+- TLD distribution
+- Category breakdown
+- Historical trends (if git history available)
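+
+Usage:
+    python scripts/generate-stats.py --output STATS.md --json stats.json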
+"""
+
+import json
+import subprocess
+from collections import Counter, defaultdict
+from datetime import datetime, timezone
+from pathlib import Path
+
+# Import from the repo root; add it to sys.path when the script is run directly
+try:
+ from src.normalize import parse_file_to_set
+except ImportError:
+ import sys
+ sys.path.insert(0, str(Path(__file__).parent.parent))
+ from src.normalize import parse_file_to_set
+
+
+def get_all_domains() -> dict[str, set[str]]:
+ """Load all domains from root .txt files."""
+ domains_by_list: dict[str, set[str]] = {}
+
+ root = Path(".")
+    for txt_file in sorted(root.glob("*.txt")):
+        # Skip the aggregated everything.txt and dotfiles. README.md and
+        # LICENSE can never match the *.txt glob, so no explicit check needed.
+        if txt_file.name == "everything.txt" or txt_file.name.startswith("."):
+            continue
+
+ list_name = txt_file.stem
+ domains = parse_file_to_set(txt_file)
+ if domains:
+ domains_by_list[list_name] = domains
+
+ return domains_by_list
+
+
+def extract_tld(domain: str) -> str:
+    """Return the last label of a domain (naive TLD: 'a.co.uk' -> 'uk').
+
+    A public-suffix-aware version would need a library such as tldextract.
+    """
+    parts = domain.rsplit(".", 1)
+    return parts[-1] if len(parts) > 1 else domain
+
+
+def extract_sld(domain: str) -> str:
+ """Extract the second-level domain (e.g., 'example' from 'sub.example.com')."""
+ parts = domain.split(".")
+ if len(parts) >= 2:
+ return parts[-2]
+ return domain
+
+
+def count_tlds(all_domains: set[str]) -> Counter:
+ """Count TLD distribution across all domains."""
+ return Counter(extract_tld(d) for d in all_domains)
+
+
+def get_category_mapping() -> dict[str, list[str]]:
+ """Map categories to list names based on config."""
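+    # Assumed shape of config/lists.yml, inferred from the keys read below
+    # (list and category names are illustrative):
+    #
+    #   lists:
+    #     ads:
+    #       categories: [advertising]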
+ config_path = Path("config/lists.yml")
+ if not config_path.exists():
+ return {}
+
+ import yaml
+ with open(config_path) as f:
+ config = yaml.safe_load(f)
+
+ category_map: dict[str, list[str]] = defaultdict(list)
+ for name, info in config.get("lists", {}).items():
+ for cat in info.get("categories", []):
+ category_map[cat].append(name)
+
+ return dict(category_map)
+
+
+def get_historical_counts() -> list[dict]:
+ """Get domain counts from git history (last 10 commits that touched lists)."""
+ history = []
+
+ # Get commits that modified .txt files
+ cmd = [
+ "git", "log", "--format=%H %aI", "--diff-filter=M",
+ "-n", "20", "--", "*.txt"
+ ]
+ result = subprocess.run(cmd, capture_output=True, text=True)
+
+ if result.returncode != 0:
+ return history
+
+ commits = result.stdout.strip().split("\n")[:10]
+
+ for line in commits:
+ if not line.strip():
+ continue
+
+ parts = line.split(" ", 1)
+ if len(parts) != 2:
+ continue
+
+ commit_hash, date_str = parts
+
+ # Count total domains at this commit (just ads.txt as proxy for speed)
+ cmd = ["git", "show", f"{commit_hash}:ads.txt"]
+ result = subprocess.run(cmd, capture_output=True, text=True)
+
+ if result.returncode == 0:
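+            # ads.txt is hosts format, so each "0.0.0.0 <domain>" line is one domain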
+ count = sum(1 for line in result.stdout.split("\n")
+ if line.strip() and line.startswith("0.0.0.0 "))
+ history.append({
+ "date": date_str[:10],
+ "commit": commit_hash[:7],
+ "ads_count": count,
+ })
+
+ return history
+
+
+def generate_stats_markdown(output_path: str = "STATS.md") -> None:
+ """Generate the statistics dashboard markdown file."""
+ print("Loading domains...")
+ domains_by_list = get_all_domains()
+
+ # Calculate totals
+ all_domains: set[str] = set()
+ for domains in domains_by_list.values():
+ all_domains.update(domains)
+
+ total_unique = len(all_domains)
+ total_entries = sum(len(d) for d in domains_by_list.values())
+
+ print(f"Analyzing {total_unique:,} unique domains...")
+
+ # TLD analysis
+ tld_counts = count_tlds(all_domains)
+ top_tlds = tld_counts.most_common(20)
+
+ # Category mapping
+ category_map = get_category_mapping()
+
+ # Historical data
+ print("Fetching historical data...")
+ history = get_historical_counts()
+
+ # Generate markdown
+ now = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
+
+ with open(output_path, "w", encoding="utf-8") as f:
+ f.write("# Block List Project Statistics\n\n")
+ f.write(f"*Last updated: {now}*\n\n")
+
+ # Summary
+ f.write("## Summary\n\n")
+ f.write("| Metric | Value |\n")
+ f.write("|--------|------:|\n")
+ f.write(f"| Total Lists | {len(domains_by_list)} |\n")
+ f.write(f"| Unique Domains | {total_unique:,} |\n")
+ f.write(f"| Total Entries (with overlap) | {total_entries:,} |\n")
+ f.write(f"| Unique TLDs | {len(tld_counts):,} |\n")
+ f.write("\n")
+
+ # Domain counts by list
+ f.write("## Domains by List\n\n")
+ f.write("| List | Domains | % of Total |\n")
+ f.write("|------|--------:|-----------:|\n")
+
+ sorted_lists = sorted(domains_by_list.items(), key=lambda x: len(x[1]), reverse=True)
+ for name, domains in sorted_lists:
+ pct = (len(domains) / total_unique * 100) if total_unique > 0 else 0
+ f.write(f"| {name} | {len(domains):,} | {pct:.1f}% |\n")
+ f.write("\n")
+
+ # TLD distribution
+ f.write("## Top TLDs Blocked\n\n")
+ f.write("| TLD | Count | % |\n")
+ f.write("|-----|------:|--:|\n")
+
+ for tld, count in top_tlds:
+ pct = (count / total_unique * 100) if total_unique > 0 else 0
+ f.write(f"| .{tld} | {count:,} | {pct:.1f}% |\n")
+ f.write("\n")
+
+ # Category breakdown
+ if category_map:
+ f.write("## Categories\n\n")
+ f.write("| Category | Lists | Total Domains |\n")
+ f.write("|----------|------:|--------------:|\n")
+
+ category_totals = []
+ for cat, lists in category_map.items():
+ cat_domains: set[str] = set()
+ for lst in lists:
+ if lst in domains_by_list:
+ cat_domains.update(domains_by_list[lst])
+ category_totals.append((cat, len(lists), len(cat_domains)))
+
+ for cat, num_lists, num_domains in sorted(category_totals, key=lambda x: x[2], reverse=True):
+ f.write(f"| {cat} | {num_lists} | {num_domains:,} |\n")
+ f.write("\n")
+
+ # Historical trends
+ if history:
+ f.write("## Recent History (ads.txt as sample)\n\n")
+ f.write("| Date | Commit | Domains |\n")
+ f.write("|------|--------|--------:|\n")
+
+ for entry in history:
+ f.write(f"| {entry['date']} | {entry['commit']} | {entry['ads_count']:,} |\n")
+ f.write("\n")
+
+ # Overlap analysis
+ f.write("## List Overlap\n\n")
+        f.write("Many domains appear in multiple lists. The table below shows how many domains appear in exactly N lists:\n\n")
+
+ # Count how many lists each domain appears in
+ domain_list_count: Counter = Counter()
+ for domains in domains_by_list.values():
+ for d in domains:
+ domain_list_count[d] += 1
+
+ overlap_dist = Counter(domain_list_count.values())
+ f.write("| Appears in N Lists | Domains |\n")
+ f.write("|-------------------:|--------:|\n")
+ for n in sorted(overlap_dist.keys()):
+ f.write(f"| {n} | {overlap_dist[n]:,} |\n")
+ f.write("\n")
+
+ # Footer
+ f.write("---\n\n")
+ f.write("*Generated by `scripts/generate-stats.py`*\n")
+
+ print(f"Statistics written to {output_path}")
+
+
+def generate_stats_json(output_path: str = "stats.json") -> None:
+ """Generate statistics as JSON for programmatic access."""
+ domains_by_list = get_all_domains()
+
+ all_domains: set[str] = set()
+ for domains in domains_by_list.values():
+ all_domains.update(domains)
+
+ tld_counts = count_tlds(all_domains)
+
+ stats = {
+ "generated": datetime.now(timezone.utc).isoformat(),
+ "total_unique_domains": len(all_domains),
+ "lists": {
+ name: len(domains) for name, domains in domains_by_list.items()
+ },
+ "top_tlds": dict(tld_counts.most_common(50)),
+ }
+
+ with open(output_path, "w", encoding="utf-8") as f:
+ json.dump(stats, f, indent=2)
+
+ print(f"JSON stats written to {output_path}")
+
+
+def main():
+ import argparse
+
+ parser = argparse.ArgumentParser(description="Generate statistics dashboard")
+ parser.add_argument("--output", default="STATS.md", help="Output markdown file")
+ parser.add_argument("--json", help="Also output JSON stats to this file")
+
+ args = parser.parse_args()
+
+ generate_stats_markdown(args.output)
+
+ if args.json:
+ generate_stats_json(args.json)
+
+
+if __name__ == "__main__":
+ main()