|
| 1 | +#!/usr/bin/env python3 |
| 2 | +""" |
| 3 | +Script for deleting orphaned material images from Google Cloud Storage. |
| 4 | +
|
| 5 | +This script: |
| 6 | +1. Detects YAML files that were deleted or modified (via git diff) |
| 7 | +2. Compares old and new versions to find GCS-hosted photo URLs that were removed |
| 8 | +3. In dry-run mode: shows which GCS files would be deleted |
| 9 | +4. In actual run: deletes orphaned files from GCS |
| 10 | +
|
| 11 | +Authentication (needed only for an actual run; dry-run never contacts GCS), either: |
| 12 | +- GOOGLE_APPLICATION_CREDENTIALS: path to a GCS service account JSON key, or |
| 13 | +- standard GCS authentication via gcloud |
| 14 | +""" |
| 15 | + |
| 16 | +import argparse |
| 17 | +import subprocess |
| 18 | +import sys |
| 19 | +import yaml |
| 20 | +from pathlib import Path |
| 21 | +from google.cloud import storage |
| 22 | + |
| 23 | + |
| 24 | +class MaterialImageDeletion: |
| 25 | + # Google Cloud Storage configuration |
| 26 | + GCS_BUCKET_NAME = "prusa3d-openprinttag-prod-3e31-material-db" |
| 27 | + PUBLIC_URL_BASE = "https://files.openprinttag.org" |
| 28 | + |
| 29 | + def __init__( |
| 30 | + self, |
| 31 | + materials_dir: str = "data/materials", |
| 32 | + dry_run: bool = True, |
| 33 | + ): |
| 34 | + self.materials_dir = Path(materials_dir) |
| 35 | + self.dry_run = dry_run |
| 36 | + self.stats = { |
| 37 | + "files_checked": 0, |
| 38 | + "urls_to_delete": 0, |
| 39 | + "deleted": 0, |
| 40 | + "delete_failed": 0, |
| 41 | + "not_found": 0, |
| 42 | + } |
| 43 | + |
| 44 | + if dry_run: |
| 45 | + self.storage_client = None |
| 46 | + self.bucket = None |
| 47 | + return |
| 48 | + |
| 49 | + # Initialize GCS client |
| 50 | + try: |
| 51 | + self.storage_client = storage.Client() |
| 52 | + self.bucket = self.storage_client.bucket(self.GCS_BUCKET_NAME) |
| 53 | + print(f"✓ Connected to GCS bucket: {self.GCS_BUCKET_NAME}") |
| 54 | + except Exception as e: |
| 55 | + print(f"ERROR: Failed to initialize Google Cloud Storage client: {e}") |
| 56 | + print( |
| 57 | + "Make sure GOOGLE_APPLICATION_CREDENTIALS is set or you're authenticated via gcloud" |
| 58 | + ) |
| 59 | + sys.exit(1) |
| 60 | + |
| 61 | + def _extract_gcs_urls(self, data: dict | None) -> set[str]: |
| 62 | + """Extract all GCS-hosted photo URLs from YAML data.""" |
| 63 | + urls: set[str] = set() |
| 64 | + if not data: |
| 65 | + return urls |
| 66 | + for photo in data.get("photos", []): |
| 67 | + if isinstance(photo, dict): |
| 68 | + url = photo.get("url", "") |
| 69 | + else: |
| 70 | + url = photo or "" |
| 71 | + if url.startswith(self.PUBLIC_URL_BASE): |
| 72 | + urls.add(url) |
| 73 | + return urls |
| 74 | + |
| 75 | + def _url_to_gcs_path(self, url: str) -> str: |
| 76 | + """Convert public URL to GCS blob path.""" |
| 77 | + return url[len(self.PUBLIC_URL_BASE):].lstrip("/") |
| 78 | + |
| 79 | + def _get_old_yaml_content(self, file_path: str, base_ref: str) -> dict | None: |
| 80 | + """Get the YAML content of a file at base_ref.""" |
| 81 | + try: |
| 82 | + result = subprocess.run( |
| 83 | + ["git", "show", f"{base_ref}:{file_path}"], |
| 84 | + capture_output=True, |
| 85 | + text=True, |
| 86 | + check=True, |
| 87 | + ) |
| 88 | + return yaml.safe_load(result.stdout) |
| 89 | + except subprocess.CalledProcessError: |
| 90 | + return None |
| 91 | + |
| 92 | + def find_orphaned_urls( |
| 93 | + self, changed_files: list[tuple[str, str]], base_ref: str |
| 94 | + ) -> list[str]: |
| 95 | + """Find GCS URLs that were removed from YAML files. |
| 96 | +
|
| 97 | + Args: |
| 98 | + changed_files: List of (status, file_path) tuples – status is 'D' or 'M'. |
| 99 | + base_ref: The base git ref to compare against. |
| 100 | +
|
| 101 | + Returns: |
| 102 | + Sorted list of orphaned GCS public URLs. |
| 103 | + """ |
| 104 | + orphaned: list[str] = [] |
| 105 | + |
| 106 | + for status, file_path in changed_files: |
| 107 | + self.stats["files_checked"] += 1 |
| 108 | + |
| 109 | + old_data = self._get_old_yaml_content(file_path, base_ref) |
| 110 | + old_urls = self._extract_gcs_urls(old_data) |
| 111 | + |
| 112 | + if not old_urls: |
| 113 | + continue |
| 114 | + |
| 115 | + if status == "D": |
| 116 | + # File was deleted – all its GCS URLs are orphaned |
| 117 | + new_urls: set[str] = set() |
| 118 | + else: |
| 119 | + # File was modified – find URLs that are no longer present |
| 120 | + try: |
| 121 | + new_data = yaml.safe_load( |
| 122 | + Path(file_path).read_text(encoding="utf-8") |
| 123 | + ) |
| 124 | + new_urls = self._extract_gcs_urls(new_data) |
| 125 | + except Exception: |
| 126 | + new_urls = set() |
| 127 | + |
| 128 | + removed = old_urls - new_urls |
| 129 | + for url in sorted(removed): |
| 130 | + print(f" 📋 Orphaned: {url}") |
| 131 | + orphaned.append(url) |
| 132 | + |
| 133 | + return sorted(orphaned) |
| 134 | + |
| 135 | + def run(self, changed_files: list[tuple[str, str]], base_ref: str): |
| 136 | + """Main execution method.""" |
| 137 | + if self.dry_run: |
| 138 | + print("DRY RUN – no files will be deleted from GCS.") |
| 139 | + print("Starting orphaned image cleanup...") |
| 140 | + print(f"Materials directory: {self.materials_dir}") |
| 141 | + print("-" * 60) |
| 142 | + |
| 143 | + orphaned_urls = self.find_orphaned_urls(changed_files, base_ref) |
| 144 | + self.stats["urls_to_delete"] = len(orphaned_urls) |
| 145 | + |
| 146 | + if not orphaned_urls: |
| 147 | + print("No orphaned images found.") |
| 148 | + self._print_summary() |
| 149 | + return |
| 150 | + |
| 151 | + print(f"\nFound {len(orphaned_urls)} orphaned image(s).") |
| 152 | + print() |
| 153 | + for url in orphaned_urls: |
| 154 | + gcs_path = self._url_to_gcs_path(url) |
| 155 | + if self.dry_run: |
| 156 | + print(f" 🗑 Would delete from GCS: {gcs_path}") |
| 157 | + else: |
| 158 | + self._delete_from_gcs(gcs_path) |
| 159 | + |
| 160 | + self._print_summary() |
| 161 | + |
| 162 | + def _delete_from_gcs(self, gcs_path: str): |
| 163 | + """Delete a blob from GCS.""" |
| 164 | + try: |
| 165 | + blob = self.bucket.blob(gcs_path) |
| 166 | + if not blob.exists(): |
| 167 | + print(f" ⚠ Not found in GCS (already deleted?): {gcs_path}") |
| 168 | + self.stats["not_found"] += 1 |
| 169 | + return |
| 170 | + blob.delete() |
| 171 | + print(f" ✓ Deleted from GCS: {gcs_path}") |
| 172 | + self.stats["deleted"] += 1 |
| 173 | + except Exception as e: |
| 174 | + print(f" ✗ Failed to delete {gcs_path}: {e}") |
| 175 | + self.stats["delete_failed"] += 1 |
| 176 | + |
| 177 | + def _print_summary(self): |
| 178 | + """Print deletion summary.""" |
| 179 | + print("\n" + "=" * 60) |
| 180 | + print("DELETION SUMMARY") |
| 181 | + print("=" * 60) |
| 182 | + print(f"Files checked: {self.stats['files_checked']}") |
| 183 | + print(f"Orphaned URLs found: {self.stats['urls_to_delete']}") |
| 184 | + if not self.dry_run: |
| 185 | + print(f"Successfully deleted: {self.stats['deleted']}") |
| 186 | + print(f"Not found in GCS: {self.stats['not_found']}") |
| 187 | + print(f"Delete failed: {self.stats['delete_failed']}") |
| 188 | + print("=" * 60) |
| 189 | + |
| 190 | + |
| 191 | +def _get_changed_yaml_files(base_ref: str, materials_dir: Path) -> list[tuple[str, str]]: |
| 192 | + """Return list of (status, path) for deleted or modified YAML files in materials_dir.""" |
| 193 | + try: |
| 194 | + result = subprocess.run( |
| 195 | + [ |
| 196 | + "git", "diff", "--name-status", "--diff-filter=DM", |
| 197 | + f"{base_ref}...HEAD", |
| 198 | + "--", str(materials_dir), |
| 199 | + ], |
| 200 | + capture_output=True, |
| 201 | + text=True, |
| 202 | + check=True, |
| 203 | + ) |
| 204 | + except subprocess.CalledProcessError as e: |
| 205 | + print(f"ERROR: git diff failed: {e.stderr.strip()}") |
| 206 | + sys.exit(1) |
| 207 | + |
| 208 | + files: list[tuple[str, str]] = [] |
| 209 | + for line in result.stdout.splitlines(): |
| 210 | + parts = line.split("\t", 1) |
| 211 | + if len(parts) == 2 and parts[1].endswith(".yaml"): |
| 212 | + files.append((parts[0].strip(), parts[1].strip())) |
| 213 | + return files |
| 214 | + |
| 215 | + |
def main():
    """Parse the command line, list changed material files and run the cleanup."""
    cli = argparse.ArgumentParser(
        description="Delete orphaned material images from GCS."
    )
    # Dry-run is on by default; pass --no-dry-run to actually delete.
    cli.add_argument(
        "--dry-run",
        default=True,
        action=argparse.BooleanOptionalAction,
        help="List files that would be deleted without making any changes (default: on).",
    )
    cli.add_argument(
        "--base-ref",
        required=True,
        metavar="REF",
        help="Git ref to diff against (e.g. origin/main or HEAD~1).",
    )
    cli.add_argument(
        "--materials-dir",
        metavar="DIR",
        default="data/materials",
        help="Path to materials directory (default: data/materials).",
    )
    options = cli.parse_args()

    cleaner = MaterialImageDeletion(
        dry_run=options.dry_run,
        materials_dir=options.materials_dir,
    )

    candidates = _get_changed_yaml_files(options.base_ref, cleaner.materials_dir)
    if candidates:
        print(f"Changed/deleted files ({len(candidates)}) against {options.base_ref}:")
        for change_kind, yaml_path in candidates:
            print(f" [{change_kind}] {yaml_path}")
        print()

        cleaner.run(candidates, options.base_ref)
    else:
        print(
            f"No deleted/modified material files found against {options.base_ref}. Nothing to do."
        )


# Run the CLI only when executed as a script, not on import.
if __name__ == "__main__":
    main()
0 commit comments