--- /dev/null
+#!/usr/bin/env python3
+"""
+File deduplication script that replaces identical files with symlinks.
+Uses SHA256 hashing to identify duplicate files.
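+
+Example usage (assuming the script is saved as dedupe.py):
+
+    python dedupe.py /path/to/directory --dry-run   # preview changes
+    python dedupe.py /path/to/directory             # replace duplicates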
+"""
+
+import hashlib
+import os
+from collections import defaultdict
+from pathlib import Path
+
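+# Third-party dependencies: click and humanize (pip install click humanize)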
+import click
+import humanize
+
+
+def calculate_sha256(filepath: Path) -> str | None:
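+    """
+    Compute the SHA256 hex digest of a file.
+    Returns None if the file cannot be read.
+    """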
+ sha256_hash = hashlib.sha256()
+ try:
+ with filepath.open("rb") as f:
+ # Read file in chunks to handle large files efficiently
+ while chunk := f.read(65536): # 64KB chunks
+ sha256_hash.update(chunk)
+ return sha256_hash.hexdigest()
+ except OSError as e:
+ click.echo(f"Error reading {filepath}: {e}", err=True)
+ return None
+
+
+def find_duplicate_files(directory: Path) -> dict[str, list[Path]]:
+    """
+    Recursively scan directory and group regular files by their SHA256 hash.
+    Symlinks are skipped, as are files that cannot be read.
+    Returns a dictionary mapping hash -> list of file paths, keeping only
+    hashes shared by more than one file.
+    """
+ hash_to_files: dict[str, list[Path]] = defaultdict(list)
+
+ for filepath in directory.rglob("*"):
+ # Skip symlinks
+ if filepath.is_symlink():
+ continue
+
+ # Skip if not a regular file
+ if not filepath.is_file():
+ continue
+
+ file_hash = calculate_sha256(filepath)
+ if file_hash:
+ hash_to_files[file_hash].append(filepath)
+
+ # Filter to only return hashes with duplicates
+ return {h: files for h, files in hash_to_files.items() if len(files) > 1}
+
+
+def replace_with_symlinks(
+ duplicate_groups: dict[str, list[Path]],
+ *,
+ dry_run: bool = False,
+) -> tuple[int, int]:
+    """
+    Replace duplicate files with symlinks to the first occurrence in each group.
+    Returns (number_of_files_replaced, space_saved_in_bytes); in dry-run mode
+    these report what would be replaced and saved.
+    """
+ total_duplicates = 0
+ space_saved = 0
+
+ for file_hash, file_list in duplicate_groups.items():
+ # Keep the first file as the original, replace others with symlinks
+ original_file = file_list[0]
+ duplicates = file_list[1:]
+
+ click.echo(f"Found {len(duplicates)} duplicate(s) of: {original_file}")
+
+ for duplicate in duplicates:
+ try:
+ # Get file size before deletion
+ file_size = duplicate.stat().st_size
+
+ if dry_run:
+ click.echo(f" [DRY RUN] Would replace: {duplicate}")
+ else:
+ # Remove the duplicate file
+ duplicate.unlink()
+
+                    # Create a relative symlink so links keep working if the
+                    # whole tree is moved; os.path.relpath handles targets in
+                    # parent and sibling directories, unlike Path.relative_to
+                    try:
+                        rel_path = os.path.relpath(original_file, start=duplicate.parent)
+                        duplicate.symlink_to(rel_path)
+                        click.echo(f"  Replaced: {duplicate} -> {rel_path}")
+                    except ValueError:
+                        # os.path.relpath raises ValueError on Windows when the
+                        # paths are on different drives; fall back to absolute
+                        duplicate.symlink_to(original_file.resolve())
+                        click.echo(f"  Replaced: {duplicate} -> {original_file}")
+
+ space_saved += file_size
+
+ total_duplicates += 1
+
+ except OSError as e:
+ click.echo(f" Error replacing {duplicate}: {e}", err=True)
+
+ return total_duplicates, space_saved
+
+
+@click.command()
+@click.argument(
+ "directory",
+ type=click.Path(
+ exists=True,
+ file_okay=False,
+ dir_okay=True,
+ readable=True,
+ path_type=Path,
+ ),
+)
+@click.option(
+ "--dry-run",
+ is_flag=True,
+ help="Show what would be done without making changes",
+)
+@click.option("--verbose", "-v", is_flag=True, help="Show verbose output")
+def deduplicate(directory: Path, *, dry_run: bool, verbose: bool) -> None:
+ """
+ Recursively search DIRECTORY for identical files and replace them with symlinks.
+
+ Uses SHA256 hashing to identify duplicate files. The first occurrence of each
+ unique file is kept, and all duplicates are replaced with symlinks pointing to it.
+ """
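+    # Resolve to an absolute path so scanned file paths (and thus symlink
+    # targets) are stable regardless of the caller's working directory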
+ directory = directory.resolve()
+
+ click.echo(f"Scanning directory: {directory}")
+ if dry_run:
+ click.echo("Running in DRY RUN mode - no changes will be made")
+
+ # Find all duplicate files
+ click.echo("Calculating file hashes...")
+ duplicate_groups = find_duplicate_files(directory)
+
+ if not duplicate_groups:
+ click.echo("No duplicate files found!")
+ return
+
+ total_files = sum(len(files) - 1 for files in duplicate_groups.values())
+ click.echo(
+ f"Found {len(duplicate_groups)} group(s) of duplicates "
+ f"({total_files} files to deduplicate)",
+ )
+
+ if verbose:
+ for file_hash, files in duplicate_groups.items():
+ click.echo(f"Hash: {file_hash}")
+ for f in files:
+ click.echo(f" - {f}")
+
+ # Replace duplicates with symlinks
+ click.echo("Processing duplicates...")
+ num_replaced, space_saved = replace_with_symlinks(duplicate_groups, dry_run=dry_run)
+
+ # Summary
+ click.echo(
+ f"{'Would replace' if dry_run else 'Replaced'} "
+ f"{num_replaced} duplicate file(s)",
+ )
+    click.echo(
+        f"{'Would save' if dry_run else 'Space saved'}: "
+        f"{humanize.naturalsize(space_saved, binary=True)}",
+    )
+
+
+if __name__ == "__main__":
+ deduplicate()