From: Garrett Post Date: Fri, 19 Dec 2025 17:31:00 +0000 (-0600) Subject: Add release automation, stats dashboard, and dead domain cleanup X-Git-Tag: v2025.12.22~1 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=1ed7fc2fd37ae76860ce8b82acc2e09d46d5ea25;p=thirdparty%2Fblocklistproject%2Flists.git Add release automation, stats dashboard, and dead domain cleanup Features: - release.yml: Weekly releases with changelogs showing domains added/removed - dead-domains.yml: Monthly dead domain checks with auto-issue creation - generate-stats.py: Statistics dashboard (TLD distribution, categories, overlap) - generate-changelog.py: Diff-based changelog generation - check-dead-domains.py: Parallel DNS resolution checking Stats: 1.7M unique domains across 25 lists, 921 unique TLDs --- diff --git a/.github/workflows/dead-domains.yml b/.github/workflows/dead-domains.yml new file mode 100644 index 0000000..eb33623 --- /dev/null +++ b/.github/workflows/dead-domains.yml @@ -0,0 +1,124 @@ +name: Dead Domain Cleanup + +on: + workflow_dispatch: + inputs: + sample_size: + description: 'Domains to sample per list' + required: false + default: '500' + create_issue: + description: 'Create issue with results' + required: false + type: boolean + default: true + schedule: + # Monthly on the 1st at 3 AM UTC + - cron: '0 3 1 * *' + +permissions: + contents: read + issues: write + +jobs: + check-dead-domains: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.13' + cache: 'pip' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -e . + + - name: Check for dead domains + id: check + run: | + SAMPLE_SIZE="${{ github.event.inputs.sample_size || '500' }}" + + python scripts/check-dead-domains.py \ + --all \ + --sample $SAMPLE_SIZE \ + --output dead-domains.txt \ + --verbose \ + 2>&1 | tee check-output.txt + + # Extract summary for issue + DEAD_COUNT=$(grep -oP 'Total dead domains: \K[\d,]+' check-output.txt || echo "0") + CHECKED_COUNT=$(grep -oP 'Total domains checked: \K[\d,]+' check-output.txt || echo "0") + + echo "dead_count=$DEAD_COUNT" >> $GITHUB_OUTPUT + echo "checked_count=$CHECKED_COUNT" >> $GITHUB_OUTPUT + + - name: Upload results + uses: actions/upload-artifact@v4 + with: + name: dead-domain-report + path: | + dead-domains.txt + check-output.txt + retention-days: 90 + + - name: Create issue with results + if: ${{ github.event.inputs.create_issue != 'false' && steps.check.outputs.dead_count != '0' }} + uses: actions/github-script@v7 + with: + script: | + const fs = require('fs'); + + const deadDomains = fs.readFileSync('dead-domains.txt', 'utf8'); + const output = fs.readFileSync('check-output.txt', 'utf8'); + + // Extract summary section + const summaryMatch = output.match(/SUMMARY[\s\S]*$/); + const summary = summaryMatch ? summaryMatch[0] : 'See attached artifacts.'; + + const deadCount = '${{ steps.check.outputs.dead_count }}'; + const checkedCount = '${{ steps.check.outputs.checked_count }}'; + + const body = `## Dead Domain Report + + **Date:** ${new Date().toISOString().split('T')[0]} + **Domains Checked:** ${checkedCount} + **Dead Domains Found:** ${deadCount} + + ### Summary + + \`\`\` + ${summary} + \`\`\` + + ### Dead Domains Sample + +
+            <details>
+            <summary>Click to expand (first 100 domains)</summary>
+
+            \`\`\`
+            ${deadDomains.split('\n').slice(0, 100).join('\n')}
+            \`\`\`
+
+            </details>
+
+            ### Action Items
+
+            - [ ] Review dead domains for potential removal
+            - [ ] Check if domains are intentionally parked/blocked
+            - [ ] Update lists as needed
+
+            ---
+            *This issue was automatically generated by the Dead Domain Cleanup workflow.*
+            `;
+
+            await github.rest.issues.create({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              title: `[Maintenance] Dead Domain Report - ${new Date().toISOString().split('T')[0]}`,
+              body: body,
+              labels: ['maintenance', 'automated']
+            });
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
new file mode 100644
index 0000000..defee91
--- /dev/null
+++ b/.github/workflows/release.yml
@@ -0,0 +1,98 @@
+name: Release
+
+on:
+  workflow_dispatch:
+    inputs:
+      version:
+        description: 'Version tag (e.g., v2.1.0)'
+        required: true
+        type: string
+      prerelease:
+        description: 'Mark as pre-release'
+        required: false
+        type: boolean
+        default: false
+  schedule:
+    # Weekly release every Monday at 6 AM UTC
+    - cron: '0 6 * * 1'
+
+permissions:
+  contents: write
+
+jobs:
+  release:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0  # Full history for changelog
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.13'
+          cache: 'pip'
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -e .
+
+      - name: Determine version
+        id: version
+        run: |
+          if [ -n "${{ github.event.inputs.version }}" ]; then
+            VERSION="${{ github.event.inputs.version }}"
+          else
+            # Auto-generate version for scheduled releases
+            VERSION="v$(date +%Y.%m.%d)"
+          fi
+          echo "version=$VERSION" >> $GITHUB_OUTPUT
+          echo "Version: $VERSION"
+
+      - name: Get previous release tag
+        id: previous
+        run: |
+          PREV_TAG=$(git describe --tags --abbrev=0 2>/dev/null || echo "")
+          echo "tag=$PREV_TAG" >> $GITHUB_OUTPUT
+          echo "Previous tag: $PREV_TAG"
+
+      - name: Generate changelog
+        id: changelog
+        run: |
+          python scripts/generate-changelog.py \
+            --since "${{ steps.previous.outputs.tag }}" \
+            --output changelog.md
+
+          echo "## Changelog" >> $GITHUB_STEP_SUMMARY
+          cat changelog.md >> $GITHUB_STEP_SUMMARY
+
+      - name: Build all lists
+        run: python build.py --validate
+
+      - name: Generate statistics
+        run: |
+          python build.py stats > release-stats.txt
+          echo "" >> changelog.md
+          echo "## Statistics" >> changelog.md
+          cat release-stats.txt >> changelog.md
+
+      - name: Create release archive
+        run: |
+          # Create archives for easy download
+          tar -czvf blocklists-all.tar.gz *.txt adguard/ alt-version/ dnsmasq-version/
+          zip -r blocklists-all.zip *.txt adguard/ alt-version/ dnsmasq-version/
+
+      - name: Create GitHub Release
+        uses: softprops/action-gh-release@v1
+        with:
+          tag_name: ${{ steps.version.outputs.version }}
+          name: Block List Project ${{ steps.version.outputs.version }}
+          body_path: changelog.md
+          prerelease: ${{ github.event.inputs.prerelease || false }}
+          files: |
+            blocklists-all.tar.gz
+            blocklists-all.zip
+            release-stats.txt
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/STATS.md b/STATS.md
new file mode 100644
index 0000000..b9d0963
--- /dev/null
+++ b/STATS.md
@@ -0,0 +1,99 @@
+# Block List Project Statistics
+
+*Last updated: 2025-12-19 17:17 UTC*
+
+## Summary
+
+| Metric | Value |
+|--------|------:|
+| Total Lists | 25 |
+| Unique Domains | 1,709,017 |
+| Total Entries (with overlap) | 2,224,422 |
+| Unique TLDs | 921 |
+
+## Domains by List
+
+| List | Domains | % of Total |
+|------|--------:|-----------:|
+| porn | 500,282 | 29.3% |
+| malware | 435,220 | 25.5% |
+| abuse | 435,155 | 25.5% |
+| fraud | 196,082 | 11.5% |
+| phishing | 190,222 | 11.1% |
+| ads | 154,554 | 9.0% |
+| redirect | 108,684 | 6.4% |
+| basic | 76,459 | 4.5% |
+| drugs | 26,031 | 1.5% |
+| youtube | 24,280 | 1.4% |
+| crypto | 23,761 | 1.4% |
+| facebook | 22,459 | 1.3% |
+| tracking | 15,070 | 0.9% |
+| tiktok | 3,699 | 0.2% |
+| torrent | 2,624 | 0.2% |
+| gambling | 2,500 | 0.1% |
+| piracy | 2,153 | 0.1% |
+| ransomware | 1,904 | 0.1% |
+| scam | 1,274 | 0.1% |
+| twitter | 1,193 | 0.1% |
+| adobe | 399 | 0.0% |
+| whatsapp | 226 | 0.0% |
+| vaping | 108 | 0.0% |
+| smart-tv | 78 | 0.0% |
+| fortnite | 5 | 0.0% |
+
+## Top TLDs Blocked
+
+| TLD | Count | % |
+|-----|------:|--:|
+| .com | 897,810 | 52.5% |
+| .net | 119,867 | 7.0% |
+| .stream | 80,114 | 4.7% |
+| .tk | 58,200 | 3.4% |
+| .info | 50,864 | 3.0% |
+| .org | 35,310 | 2.1% |
+| .ru | 32,227 | 1.9% |
+| .nl | 25,409 | 1.5% |
+| .de | 22,011 | 1.3% |
+| .pl | 16,111 | 0.9% |
+| .icu | 14,404 | 0.8% |
+| .br | 13,625 | 0.8% |
+| .cc | 13,612 | 0.8% |
+| .uk | 13,289 | 0.8% |
+| .top | 12,820 | 0.8% |
+| .us | 11,347 | 0.7% |
+| .win | 10,907 | 0.6% |
+| .fr | 10,467 | 0.6% |
+| .xyz | 9,722 | 0.6% |
+| .in | 8,983 | 0.5% |
+
+## Recent History (ads.txt as sample)
+
+| Date | Commit | Domains |
+|------|--------|--------:|
+| 2025-12-19 | 5945121 | 154,554 |
+| 2025-05-18 | 5fb69c9 | 154,554 |
+| 2025-05-18 | d2e66c5 | 154,554 |
+| 2025-05-18 | c208ed4 | 154,554 |
+| 2025-05-18 | f8e46fd | 154,554 |
+| 2025-05-18 | d6715e1 | 154,554 |
+| 2025-05-18 | c7cdeef | 154,554 |
+| 2025-05-18 | 3a43854 | 154,554 |
+| 2024-11-05 | c6c26be | 154,554 |
+| 2024-11-05 | 4fbe4d2 | 154,554 |
+
+## List Overlap
+
+Many domains appear in multiple lists. Here are the most common overlaps:
+
+| Appears in N Lists | Domains |
+|-------------------:|--------:|
+| 1 | 1,249,256 |
+| 2 | 417,760 |
+| 3 | 29,028 |
+| 4 | 12,323 |
+| 5 | 630 |
+| 6 | 20 |
+
+---
+
+*Generated by `scripts/generate-stats.py`*
diff --git a/scripts/check-dead-domains.py b/scripts/check-dead-domains.py
new file mode 100644
index 0000000..2de4453
--- /dev/null
+++ b/scripts/check-dead-domains.py
@@ -0,0 +1,261 @@
+#!/usr/bin/env python3
+"""Check for dead domains that no longer resolve.
+
+This script checks if blocked domains still have DNS records.
+Domains that don't resolve are candidates for removal.
+
+Usage:
+    python scripts/check-dead-domains.py --list ads --sample 1000
+    python scripts/check-dead-domains.py --all --sample 500 --output dead-domains.txt
+"""
+
+import argparse
+import random
+import socket
+import sys
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from pathlib import Path
+from typing import NamedTuple
+
+# Try relative imports first
+try:
+    from src.normalize import parse_file_to_set
+except ImportError:
+    sys.path.insert(0, str(Path(__file__).parent.parent))
+    from src.normalize import parse_file_to_set
+
+
+class DomainCheckResult(NamedTuple):
+    """Result of checking a domain."""
+    domain: str
+    resolves: bool
+    error: str | None = None
+
+
+def check_domain_resolves(domain: str, timeout: float = 2.0) -> DomainCheckResult:
+    """Check if a domain has any DNS records.
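+
+    Note: socket.gethostbyname() performs an IPv4 (A record) lookup only, so
+    a domain that publishes only AAAA or MX records can be misreported as
+    dead here.
+
+    Illustrative example (the reserved .invalid TLD never resolves):
+
+        >>> check_domain_resolves("name.invalid").resolves
+        False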
+
+    Args:
+        domain: Domain name to check
+        timeout: Socket timeout in seconds
+
+    Returns:
+        DomainCheckResult with resolution status
+    """
+    socket.setdefaulttimeout(timeout)
+
+    try:
+        # Try to resolve the domain (A record)
+        socket.gethostbyname(domain)
+        return DomainCheckResult(domain=domain, resolves=True)
+    except socket.gaierror as e:
+        # Name resolution failed
+        return DomainCheckResult(domain=domain, resolves=False, error=str(e))
+    except socket.timeout:
+        return DomainCheckResult(domain=domain, resolves=False, error="timeout")
+    except Exception as e:
+        return DomainCheckResult(domain=domain, resolves=False, error=str(e))
+
+
+def check_domains_parallel(
+    domains: list[str],
+    max_workers: int = 50,
+    timeout: float = 2.0,
+    progress_callback=None,
+) -> list[DomainCheckResult]:
+    """Check multiple domains in parallel.
+
+    Args:
+        domains: List of domains to check
+        max_workers: Number of parallel threads
+        timeout: Socket timeout per domain
+        progress_callback: Optional callback for progress updates
+
+    Returns:
+        List of DomainCheckResult objects
+    """
+    results: list[DomainCheckResult] = []
+    checked = 0
+    total = len(domains)
+
+    with ThreadPoolExecutor(max_workers=max_workers) as executor:
+        futures = {
+            executor.submit(check_domain_resolves, domain, timeout): domain
+            for domain in domains
+        }
+
+        for future in as_completed(futures):
+            result = future.result()
+            results.append(result)
+            checked += 1
+
+            if progress_callback and checked % 100 == 0:
+                progress_callback(checked, total)
+
+    return results
+
+
+def load_domains_from_list(list_name: str) -> set[str]:
+    """Load domains from a blocklist file."""
+    root = Path(".")
+
+    # Try root .txt file
+    txt_path = root / f"{list_name}.txt"
+    if txt_path.exists():
+        return parse_file_to_set(txt_path)
+
+    return set()
+
+
+def get_all_list_names() -> list[str]:
+    """Get names of all blocklists."""
+    root = Path(".")
+    names = []
+
+    for txt_file in root.glob("*.txt"):
+        if txt_file.name not in ["README.md", "LICENSE", "everything.txt"]:
+            if not txt_file.name.startswith("."):
+                names.append(txt_file.stem)
+
+    return sorted(names)
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Check for dead domains that no longer resolve"
+    )
+    parser.add_argument(
+        "--list", "-l",
+        help="Specific list to check (e.g., 'ads', 'malware')"
+    )
+    parser.add_argument(
+        "--all", "-a",
+        action="store_true",
+        help="Check all lists"
+    )
+    parser.add_argument(
+        "--sample", "-s",
+        type=int,
+        default=100,
+        help="Number of domains to sample per list (default: 100)"
+    )
+    parser.add_argument(
+        "--workers", "-w",
+        type=int,
+        default=50,
+        help="Number of parallel workers (default: 50)"
+    )
+    parser.add_argument(
+        "--timeout", "-t",
+        type=float,
+        default=2.0,
+        help="DNS timeout in seconds (default: 2.0)"
+    )
+    parser.add_argument(
+        "--output", "-o",
+        help="Output file for dead domains"
+    )
+    parser.add_argument(
+        "--verbose", "-v",
+        action="store_true",
+        help="Show detailed output"
+    )
+
+    args = parser.parse_args()
+
+    if not args.list and not args.all:
+        parser.error("Either --list or --all is required")
+
+    # Determine which lists to check
+    if args.all:
+        list_names = get_all_list_names()
+    else:
+        list_names = [args.list]
+
+    all_dead_domains: dict[str, list[str]] = {}
+    total_checked = 0
+    total_dead = 0
+
+    for list_name in list_names:
+        print(f"\n{'='*50}")
+        print(f"Checking list: {list_name}")
+        print(f"{'='*50}")
+
+        domains = load_domains_from_list(list_name)
+        if not domains:
+            print(f"  No domains found for {list_name}")
+            continue
+
+        print(f"  Total domains: {len(domains):,}")
+
+        # Sample domains
+        sample_size = min(args.sample, len(domains))
+        sampled = random.sample(sorted(domains), sample_size)
+        print(f"  Sampling: {sample_size:,} domains")
+
+        # Check domains
+        def progress(checked, total):
+            print(f"  Progress: {checked}/{total}", end="\r")
+
+        results = check_domains_parallel(
+            sampled,
+            max_workers=args.workers,
+            timeout=args.timeout,
+            progress_callback=progress if args.verbose else None,
+        )
+
+        # Analyze results
+        dead = [r for r in results if not r.resolves]
+        alive = [r for r in results if r.resolves]
+
+        total_checked += len(results)
+        total_dead += len(dead)
+
+        print("\n  Results:")
+        print(f"    Resolving: {len(alive):,} ({len(alive)/len(results)*100:.1f}%)")
+        print(f"    Dead: {len(dead):,} ({len(dead)/len(results)*100:.1f}%)")
+
+        if dead:
+            all_dead_domains[list_name] = [r.domain for r in dead]
+
+            if args.verbose:
+                print("\n  Dead domains (sample):")
+                for r in dead[:10]:
+                    print(f"    - {r.domain}: {r.error}")
+                if len(dead) > 10:
+                    print(f"    ... and {len(dead) - 10} more")
+
+    # Summary
+    print(f"\n{'='*50}")
+    print("SUMMARY")
+    print(f"{'='*50}")
+    print(f"Total domains checked: {total_checked:,}")
+    # Guard against a zero division when no list yielded any domains
+    dead_pct = (total_dead / total_checked * 100) if total_checked else 0.0
+    print(f"Total dead domains: {total_dead:,} ({dead_pct:.1f}%)")
+
+    # Estimate total dead in full lists
+    if args.sample < 10000:
+        print("\nNote: Based on sampling. Actual dead domain count may vary.")
+
+    # Output dead domains
+    if args.output and all_dead_domains:
+        with open(args.output, "w") as f:
+            f.write("# Dead domains found by check-dead-domains.py\n")
+            f.write(f"# Checked on: {__import__('datetime').datetime.now().isoformat()}\n")
+            f.write(f"# Sample size per list: {args.sample}\n\n")
+
+            for list_name, dead in sorted(all_dead_domains.items()):
+                f.write(f"\n# List: {list_name} ({len(dead)} dead)\n")
+                for domain in sorted(dead):
+                    f.write(f"{domain}\n")
+
+        print(f"\nDead domains written to: {args.output}")
+
+    # Exit with error if significant dead domains found
+    dead_rate = total_dead / total_checked if total_checked > 0 else 0
+    if dead_rate > 0.5:
+        print("\nWarning: Over 50% of sampled domains are dead!")
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/generate-changelog.py b/scripts/generate-changelog.py
new file mode 100644
index 0000000..fbef50f
--- /dev/null
+++ b/scripts/generate-changelog.py
@@ -0,0 +1,149 @@
+#!/usr/bin/env python3
+"""Generate changelog showing domains added/removed since last release."""
+
+import argparse
+import subprocess
+from pathlib import Path
+
+
+def get_changed_files(since_tag: str | None) -> list[str]:
+    """Get list of .txt files changed since the given tag."""
+    if since_tag:
+        cmd = ["git", "diff", "--name-only", since_tag, "HEAD", "--", "*.txt"]
+    else:
+        # No previous tag, consider all files as new
+        cmd = ["git", "ls-files", "*.txt"]
+
+    result = subprocess.run(cmd, capture_output=True, text=True)
+    return [f for f in result.stdout.strip().split("\n") if f and f.endswith(".txt")]
+
+
+def get_file_at_revision(filepath: str, revision: str) -> set[str]:
+    """Get domains from a file at a specific git revision."""
+    cmd = ["git", "show", f"{revision}:{filepath}"]
+    result = subprocess.run(cmd, capture_output=True, text=True)
+
+    if result.returncode != 0:
+        return set()
+
+    domains = set()
+    for line in result.stdout.split("\n"):
+        line = line.strip()
+        if not line or line.startswith("#") or line.startswith("!"):
+            continue
+
+        # Handle hosts format
+        if line.startswith("0.0.0.0 ") or line.startswith("127.0.0.1 "):
+            parts = line.split()
+            if len(parts) >= 2:
+                domains.add(parts[1].lower())
+        # Handle domain-only format
+        elif "." in line and not line.startswith("server="):
+            domains.add(line.lower())
+
+    return domains
+
+
+def get_current_domains(filepath: str) -> set[str]:
+    """Get domains from a file in the working directory."""
+    path = Path(filepath)
+    if not path.exists():
+        return set()
+
+    domains = set()
+    for line in path.read_text(encoding="utf-8").split("\n"):
+        line = line.strip()
+        if not line or line.startswith("#") or line.startswith("!"):
+            continue
+
+        if line.startswith("0.0.0.0 ") or line.startswith("127.0.0.1 "):
+            parts = line.split()
+            if len(parts) >= 2:
+                domains.add(parts[1].lower())
+        elif "." in line and not line.startswith("server="):
+            domains.add(line.lower())
+
+    return domains
+
+
+def generate_changelog(since_tag: str | None, output_path: str) -> None:
+    """Generate a changelog showing domains added/removed."""
+    # Only look at root .txt files (the canonical source)
+    root_txt_files = [
+        f for f in Path(".").glob("*.txt")
+        if f.name not in ["README.md", "LICENSE"] and not f.name.startswith(".")
+    ]
+
+    total_added = 0
+    total_removed = 0
+    changes_by_list: dict[str, dict] = {}
+
+    for txt_file in sorted(root_txt_files):
+        list_name = txt_file.stem
+        filepath = str(txt_file)
+
+        current = get_current_domains(filepath)
+
+        if since_tag:
+            previous = get_file_at_revision(filepath, since_tag)
+        else:
+            previous = set()
+
+        added = current - previous
+        removed = previous - current
+
+        if added or removed:
+            changes_by_list[list_name] = {
+                "added": len(added),
+                "removed": len(removed),
+                "total": len(current),
+                "added_examples": sorted(added)[:5],
+                "removed_examples": sorted(removed)[:5],
+            }
+            total_added += len(added)
+            total_removed += len(removed)
+
+    # Write changelog
+    with open(output_path, "w", encoding="utf-8") as f:
+        if since_tag:
+            f.write(f"Changes since {since_tag}\n\n")
+        else:
+            f.write("Initial release\n\n")
+
+        f.write(f"**Summary:** +{total_added:,} added, -{total_removed:,} removed\n\n")
+
+        if changes_by_list:
+            f.write("### Changes by List\n\n")
+            f.write("| List | Added | Removed | Total |\n")
+            f.write("|------|------:|--------:|------:|\n")
+
+            for name, data in sorted(changes_by_list.items()):
+                f.write(f"| {name} | +{data['added']:,} | -{data['removed']:,} | {data['total']:,} |\n")
+
+            f.write("\n")
+
+            # Show example domains for significant changes
+            f.write("### Notable Changes\n\n")
+            for name, data in sorted(changes_by_list.items(), key=lambda x: x[1]["added"], reverse=True)[:5]:
+                if data["added"] > 0:
+                    examples = ", ".join(f"`{d}`" for d in data["added_examples"][:3])
+                    f.write(f"**{name}**: +{data['added']:,} domains (e.g., {examples})\n\n")
+        else:
+            f.write("No changes to blocklists.\n")
+
+    print(f"Changelog written to {output_path}")
+    print(f"Total: +{total_added:,} added, -{total_removed:,} removed")
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Generate release changelog")
+    parser.add_argument("--since", help="Previous git tag to compare against")
+    parser.add_argument("--output", default="changelog.md", help="Output file path")
+
+    args = parser.parse_args()
+
+    generate_changelog(args.since, args.output)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/generate-stats.py b/scripts/generate-stats.py
new file mode 100644
index 0000000..b9d7eff
--- /dev/null
+++ b/scripts/generate-stats.py
@@ -0,0 +1,281 @@
+#!/usr/bin/env python3
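+# Note: extract_tld() below keeps only the last dot-separated label, so
+# multi-label public suffixes collapse (e.g. "shop.example.co.uk" counts
+# toward ".uk", not ".co.uk"); the STATS.md TLD table reflects this. A rough
+# suffix-aware alternative (a heuristic sketch, not used by this patch):
+#
+#     def extract_tld_cc(domain: str) -> str:
+#         parts = domain.split(".")
+#         # treat "xx.yy" as the suffix when the final label is 2 letters
+#         if len(parts) >= 3 and len(parts[-1]) == 2:
+#             return ".".join(parts[-2:])
+#         return parts[-1]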
+"""Generate statistics dashboard for the blocklist project. + +Creates a markdown stats page with: +- Domain count per list and total +- TLD distribution +- Category breakdown +- Historical trends (if git history available) +""" + +import json +import subprocess +from collections import Counter, defaultdict +from datetime import datetime, timezone +from pathlib import Path + +# Try relative imports first (when run as module), fall back to direct +try: + from src.normalize import parse_file_to_set +except ImportError: + import sys + sys.path.insert(0, str(Path(__file__).parent.parent)) + from src.normalize import parse_file_to_set + + +def get_all_domains() -> dict[str, set[str]]: + """Load all domains from root .txt files.""" + domains_by_list: dict[str, set[str]] = {} + + root = Path(".") + for txt_file in sorted(root.glob("*.txt")): + if txt_file.name in ["README.md", "LICENSE", "everything.txt"]: + continue + if txt_file.name.startswith("."): + continue + + list_name = txt_file.stem + domains = parse_file_to_set(txt_file) + if domains: + domains_by_list[list_name] = domains + + return domains_by_list + + +def extract_tld(domain: str) -> str: + """Extract the TLD from a domain.""" + parts = domain.rsplit(".", 1) + return parts[-1] if len(parts) > 1 else domain + + +def extract_sld(domain: str) -> str: + """Extract the second-level domain (e.g., 'example' from 'sub.example.com').""" + parts = domain.split(".") + if len(parts) >= 2: + return parts[-2] + return domain + + +def count_tlds(all_domains: set[str]) -> Counter: + """Count TLD distribution across all domains.""" + return Counter(extract_tld(d) for d in all_domains) + + +def get_category_mapping() -> dict[str, list[str]]: + """Map categories to list names based on config.""" + config_path = Path("config/lists.yml") + if not config_path.exists(): + return {} + + import yaml + with open(config_path) as f: + config = yaml.safe_load(f) + + category_map: dict[str, list[str]] = defaultdict(list) + for name, info in config.get("lists", {}).items(): + for cat in info.get("categories", []): + category_map[cat].append(name) + + return dict(category_map) + + +def get_historical_counts() -> list[dict]: + """Get domain counts from git history (last 10 commits that touched lists).""" + history = [] + + # Get commits that modified .txt files + cmd = [ + "git", "log", "--format=%H %aI", "--diff-filter=M", + "-n", "20", "--", "*.txt" + ] + result = subprocess.run(cmd, capture_output=True, text=True) + + if result.returncode != 0: + return history + + commits = result.stdout.strip().split("\n")[:10] + + for line in commits: + if not line.strip(): + continue + + parts = line.split(" ", 1) + if len(parts) != 2: + continue + + commit_hash, date_str = parts + + # Count total domains at this commit (just ads.txt as proxy for speed) + cmd = ["git", "show", f"{commit_hash}:ads.txt"] + result = subprocess.run(cmd, capture_output=True, text=True) + + if result.returncode == 0: + count = sum(1 for line in result.stdout.split("\n") + if line.strip() and line.startswith("0.0.0.0 ")) + history.append({ + "date": date_str[:10], + "commit": commit_hash[:7], + "ads_count": count, + }) + + return history + + +def generate_stats_markdown(output_path: str = "STATS.md") -> None: + """Generate the statistics dashboard markdown file.""" + print("Loading domains...") + domains_by_list = get_all_domains() + + # Calculate totals + all_domains: set[str] = set() + for domains in domains_by_list.values(): + all_domains.update(domains) + + total_unique = len(all_domains) + 
total_entries = sum(len(d) for d in domains_by_list.values()) + + print(f"Analyzing {total_unique:,} unique domains...") + + # TLD analysis + tld_counts = count_tlds(all_domains) + top_tlds = tld_counts.most_common(20) + + # Category mapping + category_map = get_category_mapping() + + # Historical data + print("Fetching historical data...") + history = get_historical_counts() + + # Generate markdown + now = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC") + + with open(output_path, "w", encoding="utf-8") as f: + f.write("# Block List Project Statistics\n\n") + f.write(f"*Last updated: {now}*\n\n") + + # Summary + f.write("## Summary\n\n") + f.write("| Metric | Value |\n") + f.write("|--------|------:|\n") + f.write(f"| Total Lists | {len(domains_by_list)} |\n") + f.write(f"| Unique Domains | {total_unique:,} |\n") + f.write(f"| Total Entries (with overlap) | {total_entries:,} |\n") + f.write(f"| Unique TLDs | {len(tld_counts):,} |\n") + f.write("\n") + + # Domain counts by list + f.write("## Domains by List\n\n") + f.write("| List | Domains | % of Total |\n") + f.write("|------|--------:|-----------:|\n") + + sorted_lists = sorted(domains_by_list.items(), key=lambda x: len(x[1]), reverse=True) + for name, domains in sorted_lists: + pct = (len(domains) / total_unique * 100) if total_unique > 0 else 0 + f.write(f"| {name} | {len(domains):,} | {pct:.1f}% |\n") + f.write("\n") + + # TLD distribution + f.write("## Top TLDs Blocked\n\n") + f.write("| TLD | Count | % |\n") + f.write("|-----|------:|--:|\n") + + for tld, count in top_tlds: + pct = (count / total_unique * 100) if total_unique > 0 else 0 + f.write(f"| .{tld} | {count:,} | {pct:.1f}% |\n") + f.write("\n") + + # Category breakdown + if category_map: + f.write("## Categories\n\n") + f.write("| Category | Lists | Total Domains |\n") + f.write("|----------|------:|--------------:|\n") + + category_totals = [] + for cat, lists in category_map.items(): + cat_domains: set[str] = set() + for lst in lists: + if lst in domains_by_list: + cat_domains.update(domains_by_list[lst]) + category_totals.append((cat, len(lists), len(cat_domains))) + + for cat, num_lists, num_domains in sorted(category_totals, key=lambda x: x[2], reverse=True): + f.write(f"| {cat} | {num_lists} | {num_domains:,} |\n") + f.write("\n") + + # Historical trends + if history: + f.write("## Recent History (ads.txt as sample)\n\n") + f.write("| Date | Commit | Domains |\n") + f.write("|------|--------|--------:|\n") + + for entry in history: + f.write(f"| {entry['date']} | {entry['commit']} | {entry['ads_count']:,} |\n") + f.write("\n") + + # Overlap analysis + f.write("## List Overlap\n\n") + f.write("Many domains appear in multiple lists. 
Here are the most common overlaps:\n\n") + + # Count how many lists each domain appears in + domain_list_count: Counter = Counter() + for domains in domains_by_list.values(): + for d in domains: + domain_list_count[d] += 1 + + overlap_dist = Counter(domain_list_count.values()) + f.write("| Appears in N Lists | Domains |\n") + f.write("|-------------------:|--------:|\n") + for n in sorted(overlap_dist.keys()): + f.write(f"| {n} | {overlap_dist[n]:,} |\n") + f.write("\n") + + # Footer + f.write("---\n\n") + f.write("*Generated by `scripts/generate-stats.py`*\n") + + print(f"Statistics written to {output_path}") + + +def generate_stats_json(output_path: str = "stats.json") -> None: + """Generate statistics as JSON for programmatic access.""" + domains_by_list = get_all_domains() + + all_domains: set[str] = set() + for domains in domains_by_list.values(): + all_domains.update(domains) + + tld_counts = count_tlds(all_domains) + + stats = { + "generated": datetime.now(timezone.utc).isoformat(), + "total_unique_domains": len(all_domains), + "lists": { + name: len(domains) for name, domains in domains_by_list.items() + }, + "top_tlds": dict(tld_counts.most_common(50)), + } + + with open(output_path, "w", encoding="utf-8") as f: + json.dump(stats, f, indent=2) + + print(f"JSON stats written to {output_path}") + + +def main(): + import argparse + + parser = argparse.ArgumentParser(description="Generate statistics dashboard") + parser.add_argument("--output", default="STATS.md", help="Output markdown file") + parser.add_argument("--json", help="Also output JSON stats to this file") + + args = parser.parse_args() + + generate_stats_markdown(args.output) + + if args.json: + generate_stats_json(args.json) + + +if __name__ == "__main__": + main()
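+
+# A minimal consumer sketch for the optional stats.json output. The keys match
+# what generate_stats_json() writes above; the file path is an assumption:
+#
+#     import json
+#     with open("stats.json") as fh:
+#         stats = json.load(fh)
+#     print(stats["total_unique_domains"])
+#     largest = sorted(stats["lists"].items(), key=lambda kv: kv[1], reverse=True)
+#     print(largest[:5])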