--- /dev/null
+name: Dead Domain Cleanup
+
+on:
+ workflow_dispatch:
+ inputs:
+ sample_size:
+ description: 'Domains to sample per list'
+ required: false
+ default: '500'
+ create_issue:
+ description: 'Create issue with results'
+ required: false
+ type: boolean
+ default: true
+ schedule:
+ # Monthly on the 1st at 3 AM UTC
+ - cron: '0 3 1 * *'
+
+permissions:
+ contents: read
+ issues: write
+
+jobs:
+ check-dead-domains:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v4
+
+ - name: Set up Python
+ uses: actions/setup-python@v5
+ with:
+ python-version: '3.13'
+ cache: 'pip'
+
+ - name: Install dependencies
+ run: |
+ python -m pip install --upgrade pip
+ pip install -e .
+
+ - name: Check for dead domains
+ id: check
+ run: |
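+          # inputs are empty on scheduled runs, so fall back to the default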
+ SAMPLE_SIZE="${{ github.event.inputs.sample_size || '500' }}"
+
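+          # NB: piping through tee masks the script's exit code (the default
+          # shell here sets -e but not pipefail), so a >50% dead rate does not
+          # fail this step; the issue created below is the signal instead.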
+ python scripts/check-dead-domains.py \
+ --all \
+            --sample "$SAMPLE_SIZE" \
+ --output dead-domains.txt \
+ --verbose \
+ 2>&1 | tee check-output.txt
+
+ # Extract summary for issue
+ DEAD_COUNT=$(grep -oP 'Total dead domains: \K[\d,]+' check-output.txt || echo "0")
+ CHECKED_COUNT=$(grep -oP 'Total domains checked: \K[\d,]+' check-output.txt || echo "0")
+
+ echo "dead_count=$DEAD_COUNT" >> $GITHUB_OUTPUT
+ echo "checked_count=$CHECKED_COUNT" >> $GITHUB_OUTPUT
+
+ - name: Upload results
+ uses: actions/upload-artifact@v4
+ with:
+ name: dead-domain-report
+ path: |
+ dead-domains.txt
+ check-output.txt
+ retention-days: 90
+
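+      # On scheduled runs inputs are empty, so create_issue ('') != 'false'
+      # and an issue is created whenever dead domains were found.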
+ - name: Create issue with results
+ if: ${{ github.event.inputs.create_issue != 'false' && steps.check.outputs.dead_count != '0' }}
+ uses: actions/github-script@v7
+ with:
+ script: |
+ const fs = require('fs');
+
+ const deadDomains = fs.readFileSync('dead-domains.txt', 'utf8');
+ const output = fs.readFileSync('check-output.txt', 'utf8');
+
+ // Extract summary section
+ const summaryMatch = output.match(/SUMMARY[\s\S]*$/);
+ const summary = summaryMatch ? summaryMatch[0] : 'See attached artifacts.';
+
+ const deadCount = '${{ steps.check.outputs.dead_count }}';
+ const checkedCount = '${{ steps.check.outputs.checked_count }}';
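+            // the expressions above were expanded by Actions before this
+            // script ran, so deadCount/checkedCount are plain string literals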
+
+ const body = `## Dead Domain Report
+
+ **Date:** ${new Date().toISOString().split('T')[0]}
+ **Domains Checked:** ${checkedCount}
+ **Dead Domains Found:** ${deadCount}
+
+ ### Summary
+
+ \`\`\`
+ ${summary}
+ \`\`\`
+
+ ### Dead Domains Sample
+
+ <details>
+<summary>Click to expand (first 100 lines)</summary>
+
+ \`\`\`
+ ${deadDomains.split('\n').slice(0, 100).join('\n')}
+ \`\`\`
+
+ </details>
+
+ ### Action Items
+
+ - [ ] Review dead domains for potential removal
+ - [ ] Check if domains are intentionally parked/blocked
+ - [ ] Update lists as needed
+
+ ---
+ *This issue was automatically generated by the Dead Domain Cleanup workflow.*
+ `;
+
+ await github.rest.issues.create({
+ owner: context.repo.owner,
+ repo: context.repo.repo,
+ title: `[Maintenance] Dead Domain Report - ${new Date().toISOString().split('T')[0]}`,
+ body: body,
+ labels: ['maintenance', 'automated']
+ });
--- /dev/null
+name: Release
+
+on:
+ workflow_dispatch:
+ inputs:
+ version:
+ description: 'Version tag (e.g., v2.1.0)'
+ required: true
+ type: string
+ prerelease:
+ description: 'Mark as pre-release'
+ required: false
+ type: boolean
+ default: false
+ schedule:
+ # Weekly release every Monday at 6 AM UTC
+ - cron: '0 6 * * 1'
+
+permissions:
+ contents: write
+
+jobs:
+ release:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v4
+ with:
+ fetch-depth: 0 # Full history for changelog
+
+ - name: Set up Python
+ uses: actions/setup-python@v5
+ with:
+ python-version: '3.13'
+ cache: 'pip'
+
+ - name: Install dependencies
+ run: |
+ python -m pip install --upgrade pip
+ pip install -e .
+
+ - name: Determine version
+ id: version
+ run: |
+ if [ -n "${{ github.event.inputs.version }}" ]; then
+ VERSION="${{ github.event.inputs.version }}"
+ else
+ # Auto-generate version for scheduled releases
+ VERSION="v$(date +%Y.%m.%d)"
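+            # NB: reruns on the same day produce the same date-based tag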
+ fi
+ echo "version=$VERSION" >> $GITHUB_OUTPUT
+ echo "Version: $VERSION"
+
+ - name: Get previous release tag
+ id: previous
+ run: |
+ PREV_TAG=$(git describe --tags --abbrev=0 2>/dev/null || echo "")
+ echo "tag=$PREV_TAG" >> $GITHUB_OUTPUT
+ echo "Previous tag: $PREV_TAG"
+
+ - name: Generate changelog
+ id: changelog
+ run: |
+ python scripts/generate-changelog.py \
+ --since "${{ steps.previous.outputs.tag }}" \
+ --output changelog.md
+
+ echo "## Changelog" >> $GITHUB_STEP_SUMMARY
+ cat changelog.md >> $GITHUB_STEP_SUMMARY
+
+ - name: Build all lists
+ run: python build.py --validate
+
+ - name: Generate statistics
+ run: |
+ python build.py stats > release-stats.txt
+ echo "" >> changelog.md
+ echo "## Statistics" >> changelog.md
+ cat release-stats.txt >> changelog.md
+
+ - name: Create release archive
+ run: |
+ # Create archives for easy download
+ tar -czvf blocklists-all.tar.gz *.txt adguard/ alt-version/ dnsmasq-version/
+ zip -r blocklists-all.zip *.txt adguard/ alt-version/ dnsmasq-version/
+
+ - name: Create GitHub Release
+ uses: softprops/action-gh-release@v1
+ with:
+ tag_name: ${{ steps.version.outputs.version }}
+ name: Block List Project ${{ steps.version.outputs.version }}
+ body_path: changelog.md
+ prerelease: ${{ github.event.inputs.prerelease || false }}
+ files: |
+ blocklists-all.tar.gz
+ blocklists-all.zip
+ release-stats.txt
+ env:
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
--- /dev/null
+++ b/STATS.md
+# Block List Project Statistics
+
+*Last updated: 2025-12-19 17:17 UTC*
+
+## Summary
+
+| Metric | Value |
+|--------|------:|
+| Total Lists | 25 |
+| Unique Domains | 1,709,017 |
+| Total Entries (with overlap) | 2,224,422 |
+| Unique TLDs | 921 |
+
+## Domains by List
+
+| List | Domains | % of Total |
+|------|--------:|-----------:|
+| porn | 500,282 | 29.3% |
+| malware | 435,220 | 25.5% |
+| abuse | 435,155 | 25.5% |
+| fraud | 196,082 | 11.5% |
+| phishing | 190,222 | 11.1% |
+| ads | 154,554 | 9.0% |
+| redirect | 108,684 | 6.4% |
+| basic | 76,459 | 4.5% |
+| drugs | 26,031 | 1.5% |
+| youtube | 24,280 | 1.4% |
+| crypto | 23,761 | 1.4% |
+| facebook | 22,459 | 1.3% |
+| tracking | 15,070 | 0.9% |
+| tiktok | 3,699 | 0.2% |
+| torrent | 2,624 | 0.2% |
+| gambling | 2,500 | 0.1% |
+| piracy | 2,153 | 0.1% |
+| ransomware | 1,904 | 0.1% |
+| scam | 1,274 | 0.1% |
+| twitter | 1,193 | 0.1% |
+| adobe | 399 | 0.0% |
+| whatsapp | 226 | 0.0% |
+| vaping | 108 | 0.0% |
+| smart-tv | 78 | 0.0% |
+| fortnite | 5 | 0.0% |
+
+## Top TLDs Blocked
+
+| TLD | Count | % |
+|-----|------:|--:|
+| .com | 897,810 | 52.5% |
+| .net | 119,867 | 7.0% |
+| .stream | 80,114 | 4.7% |
+| .tk | 58,200 | 3.4% |
+| .info | 50,864 | 3.0% |
+| .org | 35,310 | 2.1% |
+| .ru | 32,227 | 1.9% |
+| .nl | 25,409 | 1.5% |
+| .de | 22,011 | 1.3% |
+| .pl | 16,111 | 0.9% |
+| .icu | 14,404 | 0.8% |
+| .br | 13,625 | 0.8% |
+| .cc | 13,612 | 0.8% |
+| .uk | 13,289 | 0.8% |
+| .top | 12,820 | 0.8% |
+| .us | 11,347 | 0.7% |
+| .win | 10,907 | 0.6% |
+| .fr | 10,467 | 0.6% |
+| .xyz | 9,722 | 0.6% |
+| .in | 8,983 | 0.5% |
+
+## Recent History (ads.txt as sample)
+
+| Date | Commit | Domains |
+|------|--------|--------:|
+| 2025-12-19 | 5945121 | 154,554 |
+| 2025-05-18 | 5fb69c9 | 154,554 |
+| 2025-05-18 | d2e66c5 | 154,554 |
+| 2025-05-18 | c208ed4 | 154,554 |
+| 2025-05-18 | f8e46fd | 154,554 |
+| 2025-05-18 | d6715e1 | 154,554 |
+| 2025-05-18 | c7cdeef | 154,554 |
+| 2025-05-18 | 3a43854 | 154,554 |
+| 2024-11-05 | c6c26be | 154,554 |
+| 2024-11-05 | 4fbe4d2 | 154,554 |
+
+## List Overlap
+
+Many domains appear in multiple lists. The table below shows how many domains appear in exactly N lists:
+
+| Appears in N Lists | Domains |
+|-------------------:|--------:|
+| 1 | 1,249,256 |
+| 2 | 417,760 |
+| 3 | 29,028 |
+| 4 | 12,323 |
+| 5 | 630 |
+| 6 | 20 |
+
+---
+
+*Generated by `scripts/generate-stats.py`*
--- /dev/null
+++ b/scripts/check-dead-domains.py
+#!/usr/bin/env python3
+"""Check for dead domains that no longer resolve.
+
+This script checks if blocked domains still have DNS records.
+Domains that don't resolve are candidates for removal.
+
+Usage:
+ python scripts/check-dead-domains.py --list ads --sample 1000
+ python scripts/check-dead-domains.py --all --sample 500 --output dead-domains.txt
+"""
+
+import argparse
+import random
+import socket
+import sys
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from datetime import datetime
+from pathlib import Path
+from typing import NamedTuple
+
+# Import from the repo root; add it to sys.path when the script is run directly
+try:
+ from src.normalize import parse_file_to_set
+except ImportError:
+ sys.path.insert(0, str(Path(__file__).parent.parent))
+ from src.normalize import parse_file_to_set
+
+
+class DomainCheckResult(NamedTuple):
+ """Result of checking a domain."""
+ domain: str
+ resolves: bool
+ error: str | None = None
+
+
+def check_domain_resolves(domain: str, timeout: float = 2.0) -> DomainCheckResult:
+ """Check if a domain has any DNS records.
+
+ Args:
+ domain: Domain name to check
+ timeout: Socket timeout in seconds
+
+ Returns:
+ DomainCheckResult with resolution status
+ """
+    socket.setdefaulttimeout(timeout)
+
+    try:
+        # Try to resolve the domain (A record). Note that gethostbyname() goes
+        # through the OS resolver, which applies its own timeout; the value set
+        # above only bounds socket objects, so `timeout` is best-effort here.
+        socket.gethostbyname(domain)
+        return DomainCheckResult(domain=domain, resolves=True)
+    except socket.gaierror as e:
+        # Name resolution failed. This also covers transient errors such as
+        # EAI_AGAIN, so a domain reported dead here may deserve a re-check.
+        return DomainCheckResult(domain=domain, resolves=False, error=str(e))
+ except socket.timeout:
+ return DomainCheckResult(domain=domain, resolves=False, error="timeout")
+ except Exception as e:
+ return DomainCheckResult(domain=domain, resolves=False, error=str(e))
+
+
+def check_domains_parallel(
+ domains: list[str],
+ max_workers: int = 50,
+ timeout: float = 2.0,
+ progress_callback=None,
+) -> list[DomainCheckResult]:
+ """Check multiple domains in parallel.
+
+ Args:
+ domains: List of domains to check
+ max_workers: Number of parallel threads
+ timeout: Socket timeout per domain
+ progress_callback: Optional callback for progress updates
+
+ Returns:
+ List of DomainCheckResult objects
+ """
+ results: list[DomainCheckResult] = []
+ checked = 0
+ total = len(domains)
+
+ with ThreadPoolExecutor(max_workers=max_workers) as executor:
+ futures = {
+ executor.submit(check_domain_resolves, domain, timeout): domain
+ for domain in domains
+ }
+
+ for future in as_completed(futures):
+ result = future.result()
+ results.append(result)
+ checked += 1
+
+ if progress_callback and checked % 100 == 0:
+ progress_callback(checked, total)
+
+ return results
+
+
+def load_domains_from_list(list_name: str) -> set[str]:
+ """Load domains from a blocklist file."""
+ root = Path(".")
+
+ # Try root .txt file
+ txt_path = root / f"{list_name}.txt"
+ if txt_path.exists():
+ return parse_file_to_set(txt_path)
+
+ return set()
+
+
+def get_all_list_names() -> list[str]:
+ """Get names of all blocklists."""
+ root = Path(".")
+ names = []
+
+    for txt_file in root.glob("*.txt"):
+        # Skip the aggregated everything.txt and dotfiles. README.md and
+        # LICENSE can never match the *.txt glob, so no explicit check needed.
+        if txt_file.name == "everything.txt" or txt_file.name.startswith("."):
+            continue
+        names.append(txt_file.stem)
+
+ return sorted(names)
+
+
+def main():
+ parser = argparse.ArgumentParser(
+        description="Find blocked domains that no longer resolve"
+ )
+ parser.add_argument(
+ "--list", "-l",
+ help="Specific list to check (e.g., 'ads', 'malware')"
+ )
+ parser.add_argument(
+ "--all", "-a",
+ action="store_true",
+ help="Check all lists"
+ )
+ parser.add_argument(
+ "--sample", "-s",
+ type=int,
+ default=100,
+ help="Number of domains to sample per list (default: 100)"
+ )
+ parser.add_argument(
+ "--workers", "-w",
+ type=int,
+ default=50,
+ help="Number of parallel workers (default: 50)"
+ )
+ parser.add_argument(
+ "--timeout", "-t",
+ type=float,
+ default=2.0,
+ help="DNS timeout in seconds (default: 2.0)"
+ )
+ parser.add_argument(
+ "--output", "-o",
+ help="Output file for dead domains"
+ )
+ parser.add_argument(
+ "--verbose", "-v",
+ action="store_true",
+ help="Show detailed output"
+ )
+
+ args = parser.parse_args()
+
+ if not args.list and not args.all:
+ parser.error("Either --list or --all is required")
+
+ # Determine which lists to check
+ if args.all:
+ list_names = get_all_list_names()
+ else:
+ list_names = [args.list]
+
+ all_dead_domains: dict[str, list[str]] = {}
+ total_checked = 0
+ total_dead = 0
+
+ for list_name in list_names:
+ print(f"\n{'='*50}")
+ print(f"Checking list: {list_name}")
+ print(f"{'='*50}")
+
+ domains = load_domains_from_list(list_name)
+ if not domains:
+ print(f" No domains found for {list_name}")
+ continue
+
+ print(f" Total domains: {len(domains):,}")
+
+ # Sample domains
+ sample_size = min(args.sample, len(domains))
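+        # Sort to a list first: random.sample() rejects sets on Python 3.11+,
+        # and sorting keeps the sampling population deterministic.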
+ sampled = random.sample(sorted(domains), sample_size)
+ print(f" Sampling: {sample_size:,} domains")
+
+ # Check domains
+ def progress(checked, total):
+ print(f" Progress: {checked}/{total}", end="\r")
+
+ results = check_domains_parallel(
+ sampled,
+ max_workers=args.workers,
+ timeout=args.timeout,
+ progress_callback=progress if args.verbose else None,
+ )
+
+ # Analyze results
+ dead = [r for r in results if not r.resolves]
+ alive = [r for r in results if r.resolves]
+
+ total_checked += len(results)
+ total_dead += len(dead)
+
+ print("\n Results:")
+ print(f" Resolving: {len(alive):,} ({len(alive)/len(results)*100:.1f}%)")
+ print(f" Dead: {len(dead):,} ({len(dead)/len(results)*100:.1f}%)")
+
+ if dead:
+ all_dead_domains[list_name] = [r.domain for r in dead]
+
+ if args.verbose:
+ print("\n Dead domains (sample):")
+ for r in dead[:10]:
+ print(f" - {r.domain}: {r.error}")
+ if len(dead) > 10:
+ print(f" ... and {len(dead) - 10} more")
+
+ # Summary
+ print(f"\n{'='*50}")
+ print("SUMMARY")
+ print(f"{'='*50}")
+    print(f"Total domains checked: {total_checked:,}")
+    if total_checked:
+        print(f"Total dead domains: {total_dead:,} ({total_dead/total_checked*100:.1f}%)")
+    else:
+        print("Total dead domains: 0 (no domains were checked)")
+
+    # The counts come from a sample, not from scanning the full lists
+    if args.sample < 10000:
+        print("\nNote: Based on sampling. Actual dead domain count may vary.")
+
+ # Output dead domains
+ if args.output and all_dead_domains:
+ with open(args.output, "w") as f:
+ f.write("# Dead domains found by check-dead-domains.py\n")
+            f.write(f"# Checked on: {datetime.now().isoformat()}\n")
+ f.write(f"# Sample size per list: {args.sample}\n\n")
+
+ for list_name, dead in sorted(all_dead_domains.items()):
+ f.write(f"\n# List: {list_name} ({len(dead)} dead)\n")
+ for domain in sorted(dead):
+ f.write(f"{domain}\n")
+
+ print(f"\nDead domains written to: {args.output}")
+
+ # Exit with error if significant dead domains found
+ dead_rate = total_dead / total_checked if total_checked > 0 else 0
+ if dead_rate > 0.5:
+ print("\nWarning: Over 50% of sampled domains are dead!")
+ sys.exit(1)
+
+
+if __name__ == "__main__":
+ main()
--- /dev/null
+++ b/scripts/generate-changelog.py
+#!/usr/bin/env python3
+"""Generate changelog showing domains added/removed since last release."""
+
+import argparse
+import subprocess
+from pathlib import Path
+
+
+def get_changed_files(since_tag: str | None) -> list[str]:
+ """Get list of .txt files changed since the given tag."""
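+    # NOTE: defined for ad-hoc use; generate_changelog() below does not call it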
+ if since_tag:
+ cmd = ["git", "diff", "--name-only", since_tag, "HEAD", "--", "*.txt"]
+ else:
+ # No previous tag, consider all files as new
+ cmd = ["git", "ls-files", "*.txt"]
+
+ result = subprocess.run(cmd, capture_output=True, text=True)
+ return [f for f in result.stdout.strip().split("\n") if f and f.endswith(".txt")]
+
+
+def _parse_domains(text: str) -> set[str]:
+    """Extract domains from blocklist text (hosts or domain-only format)."""
+    domains = set()
+    for line in text.split("\n"):
+        line = line.strip()
+        if not line or line.startswith("#") or line.startswith("!"):
+            continue
+
+        # Handle hosts format
+        if line.startswith("0.0.0.0 ") or line.startswith("127.0.0.1 "):
+            parts = line.split()
+            if len(parts) >= 2:
+                domains.add(parts[1].lower())
+        # Handle domain-only format
+        elif "." in line and not line.startswith("server="):
+            domains.add(line.lower())
+
+    return domains
+
+
+def get_file_at_revision(filepath: str, revision: str) -> set[str]:
+    """Get domains from a file at a specific git revision."""
+    cmd = ["git", "show", f"{revision}:{filepath}"]
+    result = subprocess.run(cmd, capture_output=True, text=True)
+
+    if result.returncode != 0:
+        return set()
+
+    return _parse_domains(result.stdout)
+
+
+def get_current_domains(filepath: str) -> set[str]:
+    """Get domains from a file in the working directory."""
+    path = Path(filepath)
+    if not path.exists():
+        return set()
+
+    return _parse_domains(path.read_text(encoding="utf-8"))
+
+
+def generate_changelog(since_tag: str | None, output_path: str) -> None:
+ """Generate a changelog showing domains added/removed."""
+    # Only look at root .txt files (the canonical source); skip the aggregated
+    # everything.txt so its entries aren't double-counted in the totals
+    root_txt_files = [
+        f for f in Path(".").glob("*.txt")
+        if f.name != "everything.txt" and not f.name.startswith(".")
+    ]
+
+ total_added = 0
+ total_removed = 0
+ changes_by_list: dict[str, dict] = {}
+
+ for txt_file in sorted(root_txt_files):
+ list_name = txt_file.stem
+ filepath = str(txt_file)
+
+ current = get_current_domains(filepath)
+
+ if since_tag:
+ previous = get_file_at_revision(filepath, since_tag)
+ else:
+ previous = set()
+
+ added = current - previous
+ removed = previous - current
+
+ if added or removed:
+ changes_by_list[list_name] = {
+ "added": len(added),
+ "removed": len(removed),
+ "total": len(current),
+ "added_examples": sorted(added)[:5],
+ "removed_examples": sorted(removed)[:5],
+ }
+ total_added += len(added)
+ total_removed += len(removed)
+
+ # Write changelog
+ with open(output_path, "w", encoding="utf-8") as f:
+ if since_tag:
+ f.write(f"Changes since {since_tag}\n\n")
+ else:
+ f.write("Initial release\n\n")
+
+ f.write(f"**Summary:** +{total_added:,} added, -{total_removed:,} removed\n\n")
+
+ if changes_by_list:
+ f.write("### Changes by List\n\n")
+ f.write("| List | Added | Removed | Total |\n")
+ f.write("|------|------:|--------:|------:|\n")
+
+ for name, data in sorted(changes_by_list.items()):
+ f.write(f"| {name} | +{data['added']:,} | -{data['removed']:,} | {data['total']:,} |\n")
+
+ f.write("\n")
+
+ # Show example domains for significant changes
+ f.write("### Notable Changes\n\n")
+ for name, data in sorted(changes_by_list.items(), key=lambda x: x[1]["added"], reverse=True)[:5]:
+ if data["added"] > 0:
+ examples = ", ".join(f"`{d}`" for d in data["added_examples"][:3])
+ f.write(f"**{name}**: +{data['added']:,} domains (e.g., {examples})\n\n")
+ else:
+ f.write("No changes to blocklists.\n")
+
+ print(f"Changelog written to {output_path}")
+ print(f"Total: +{total_added:,} added, -{total_removed:,} removed")
+
+
+def main():
+ parser = argparse.ArgumentParser(description="Generate release changelog")
+ parser.add_argument("--since", help="Previous git tag to compare against")
+ parser.add_argument("--output", default="changelog.md", help="Output file path")
+
+ args = parser.parse_args()
+
+ generate_changelog(args.since, args.output)
+
+
+if __name__ == "__main__":
+ main()
--- /dev/null
+++ b/scripts/generate-stats.py
+#!/usr/bin/env python3
+"""Generate statistics dashboard for the blocklist project.
+
+Creates a markdown stats page with:
+- Domain count per list and total
+- TLD distribution
+- Category breakdown
+- Historical trends (if git history available)
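+
+Usage:
+    python scripts/generate-stats.py --output STATS.md --json stats.json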
+"""
+
+import json
+import subprocess
+from collections import Counter, defaultdict
+from datetime import datetime, timezone
+from pathlib import Path
+
+# Import from the repo root; add it to sys.path when the script is run directly
+try:
+ from src.normalize import parse_file_to_set
+except ImportError:
+ import sys
+ sys.path.insert(0, str(Path(__file__).parent.parent))
+ from src.normalize import parse_file_to_set
+
+
+def get_all_domains() -> dict[str, set[str]]:
+ """Load all domains from root .txt files."""
+ domains_by_list: dict[str, set[str]] = {}
+
+ root = Path(".")
+    for txt_file in sorted(root.glob("*.txt")):
+        # Skip the aggregated everything.txt and dotfiles. README.md and
+        # LICENSE can never match the *.txt glob, so no explicit check needed.
+        if txt_file.name == "everything.txt" or txt_file.name.startswith("."):
+            continue
+
+ list_name = txt_file.stem
+ domains = parse_file_to_set(txt_file)
+ if domains:
+ domains_by_list[list_name] = domains
+
+ return domains_by_list
+
+
+def extract_tld(domain: str) -> str:
+    """Return the last label of a domain (naive TLD: 'a.co.uk' -> 'uk').
+
+    A public-suffix-aware version would need a library such as tldextract.
+    """
+    parts = domain.rsplit(".", 1)
+    return parts[-1] if len(parts) > 1 else domain
+
+
+def extract_sld(domain: str) -> str:
+ """Extract the second-level domain (e.g., 'example' from 'sub.example.com')."""
+ parts = domain.split(".")
+ if len(parts) >= 2:
+ return parts[-2]
+ return domain
+
+
+def count_tlds(all_domains: set[str]) -> Counter:
+ """Count TLD distribution across all domains."""
+ return Counter(extract_tld(d) for d in all_domains)
+
+
+def get_category_mapping() -> dict[str, list[str]]:
+ """Map categories to list names based on config."""
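+    # Assumed shape of config/lists.yml, inferred from the keys read below
+    # (list and category names are illustrative):
+    #
+    #   lists:
+    #     ads:
+    #       categories: [advertising]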
+ config_path = Path("config/lists.yml")
+ if not config_path.exists():
+ return {}
+
+ import yaml
+ with open(config_path) as f:
+ config = yaml.safe_load(f)
+
+ category_map: dict[str, list[str]] = defaultdict(list)
+ for name, info in config.get("lists", {}).items():
+ for cat in info.get("categories", []):
+ category_map[cat].append(name)
+
+ return dict(category_map)
+
+
+def get_historical_counts() -> list[dict]:
+ """Get domain counts from git history (last 10 commits that touched lists)."""
+ history = []
+
+ # Get commits that modified .txt files
+ cmd = [
+ "git", "log", "--format=%H %aI", "--diff-filter=M",
+ "-n", "20", "--", "*.txt"
+ ]
+ result = subprocess.run(cmd, capture_output=True, text=True)
+
+ if result.returncode != 0:
+ return history
+
+ commits = result.stdout.strip().split("\n")[:10]
+
+ for line in commits:
+ if not line.strip():
+ continue
+
+ parts = line.split(" ", 1)
+ if len(parts) != 2:
+ continue
+
+ commit_hash, date_str = parts
+
+ # Count total domains at this commit (just ads.txt as proxy for speed)
+ cmd = ["git", "show", f"{commit_hash}:ads.txt"]
+ result = subprocess.run(cmd, capture_output=True, text=True)
+
+ if result.returncode == 0:
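+            # ads.txt is hosts format, so each "0.0.0.0 <domain>" line is one domain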
+ count = sum(1 for line in result.stdout.split("\n")
+ if line.strip() and line.startswith("0.0.0.0 "))
+ history.append({
+ "date": date_str[:10],
+ "commit": commit_hash[:7],
+ "ads_count": count,
+ })
+
+ return history
+
+
+def generate_stats_markdown(output_path: str = "STATS.md") -> None:
+ """Generate the statistics dashboard markdown file."""
+ print("Loading domains...")
+ domains_by_list = get_all_domains()
+
+ # Calculate totals
+ all_domains: set[str] = set()
+ for domains in domains_by_list.values():
+ all_domains.update(domains)
+
+ total_unique = len(all_domains)
+ total_entries = sum(len(d) for d in domains_by_list.values())
+
+ print(f"Analyzing {total_unique:,} unique domains...")
+
+ # TLD analysis
+ tld_counts = count_tlds(all_domains)
+ top_tlds = tld_counts.most_common(20)
+
+ # Category mapping
+ category_map = get_category_mapping()
+
+ # Historical data
+ print("Fetching historical data...")
+ history = get_historical_counts()
+
+ # Generate markdown
+ now = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
+
+ with open(output_path, "w", encoding="utf-8") as f:
+ f.write("# Block List Project Statistics\n\n")
+ f.write(f"*Last updated: {now}*\n\n")
+
+ # Summary
+ f.write("## Summary\n\n")
+ f.write("| Metric | Value |\n")
+ f.write("|--------|------:|\n")
+ f.write(f"| Total Lists | {len(domains_by_list)} |\n")
+ f.write(f"| Unique Domains | {total_unique:,} |\n")
+ f.write(f"| Total Entries (with overlap) | {total_entries:,} |\n")
+ f.write(f"| Unique TLDs | {len(tld_counts):,} |\n")
+ f.write("\n")
+
+ # Domain counts by list
+ f.write("## Domains by List\n\n")
+ f.write("| List | Domains | % of Total |\n")
+ f.write("|------|--------:|-----------:|\n")
+
+ sorted_lists = sorted(domains_by_list.items(), key=lambda x: len(x[1]), reverse=True)
+ for name, domains in sorted_lists:
+ pct = (len(domains) / total_unique * 100) if total_unique > 0 else 0
+ f.write(f"| {name} | {len(domains):,} | {pct:.1f}% |\n")
+ f.write("\n")
+
+ # TLD distribution
+ f.write("## Top TLDs Blocked\n\n")
+ f.write("| TLD | Count | % |\n")
+ f.write("|-----|------:|--:|\n")
+
+ for tld, count in top_tlds:
+ pct = (count / total_unique * 100) if total_unique > 0 else 0
+ f.write(f"| .{tld} | {count:,} | {pct:.1f}% |\n")
+ f.write("\n")
+
+ # Category breakdown
+ if category_map:
+ f.write("## Categories\n\n")
+ f.write("| Category | Lists | Total Domains |\n")
+ f.write("|----------|------:|--------------:|\n")
+
+ category_totals = []
+ for cat, lists in category_map.items():
+ cat_domains: set[str] = set()
+ for lst in lists:
+ if lst in domains_by_list:
+ cat_domains.update(domains_by_list[lst])
+ category_totals.append((cat, len(lists), len(cat_domains)))
+
+ for cat, num_lists, num_domains in sorted(category_totals, key=lambda x: x[2], reverse=True):
+ f.write(f"| {cat} | {num_lists} | {num_domains:,} |\n")
+ f.write("\n")
+
+ # Historical trends
+ if history:
+ f.write("## Recent History (ads.txt as sample)\n\n")
+ f.write("| Date | Commit | Domains |\n")
+ f.write("|------|--------|--------:|\n")
+
+ for entry in history:
+ f.write(f"| {entry['date']} | {entry['commit']} | {entry['ads_count']:,} |\n")
+ f.write("\n")
+
+ # Overlap analysis
+ f.write("## List Overlap\n\n")
+        f.write("Many domains appear in multiple lists. The table below shows how many domains appear in exactly N lists:\n\n")
+
+ # Count how many lists each domain appears in
+ domain_list_count: Counter = Counter()
+ for domains in domains_by_list.values():
+ for d in domains:
+ domain_list_count[d] += 1
+
+ overlap_dist = Counter(domain_list_count.values())
+ f.write("| Appears in N Lists | Domains |\n")
+ f.write("|-------------------:|--------:|\n")
+ for n in sorted(overlap_dist.keys()):
+ f.write(f"| {n} | {overlap_dist[n]:,} |\n")
+ f.write("\n")
+
+ # Footer
+ f.write("---\n\n")
+ f.write("*Generated by `scripts/generate-stats.py`*\n")
+
+ print(f"Statistics written to {output_path}")
+
+
+def generate_stats_json(output_path: str = "stats.json") -> None:
+ """Generate statistics as JSON for programmatic access."""
+ domains_by_list = get_all_domains()
+
+ all_domains: set[str] = set()
+ for domains in domains_by_list.values():
+ all_domains.update(domains)
+
+ tld_counts = count_tlds(all_domains)
+
+ stats = {
+ "generated": datetime.now(timezone.utc).isoformat(),
+ "total_unique_domains": len(all_domains),
+ "lists": {
+ name: len(domains) for name, domains in domains_by_list.items()
+ },
+ "top_tlds": dict(tld_counts.most_common(50)),
+ }
+
+ with open(output_path, "w", encoding="utf-8") as f:
+ json.dump(stats, f, indent=2)
+
+ print(f"JSON stats written to {output_path}")
+
+
+def main():
+ import argparse
+
+ parser = argparse.ArgumentParser(description="Generate statistics dashboard")
+ parser.add_argument("--output", default="STATS.md", help="Output markdown file")
+ parser.add_argument("--json", help="Also output JSON stats to this file")
+
+ args = parser.parse_args()
+
+ generate_stats_markdown(args.output)
+
+ if args.json:
+ generate_stats_json(args.json)
+
+
+if __name__ == "__main__":
+ main()