Skip to content

Commit 975c5a1

Browse files
authored
Fix migrate-image ci (#54)
* Fix migrate-image ci * delete-image script
1 parent a1bd057 commit 975c5a1

7 files changed

Lines changed: 541 additions & 7 deletions

File tree

.github/workflows/migrate-images.yml

Lines changed: 29 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@ on:
1313
jobs:
1414
migrate-images:
1515
runs-on: ubuntu-24.04
16+
permissions:
17+
contents: write
1618

1719
steps:
1820
- name: Checkout repository
@@ -38,6 +40,13 @@ jobs:
3840
--dry-run \
3941
--base-ref origin/${{ github.base_ref }}
4042
43+
- name: Dry run – show orphaned images to delete (PR)
44+
if: github.event_name == 'pull_request'
45+
run: |
46+
python scripts/delete_images.py \
47+
--dry-run \
48+
--base-ref origin/${{ github.base_ref }}
49+
4150
- name: Migrate images (push to main)
4251
if: github.event_name == 'push'
4352
env:
@@ -48,13 +57,27 @@ jobs:
4857
--no-dry-run \
4958
--base-ref HEAD~1
5059
51-
- name: Remove temporary assets and commit (push to main)
60+
- name: Delete orphaned images from GCS (push to main)
61+
if: github.event_name == 'push'
62+
env:
63+
GOOGLE_APPLICATION_CREDENTIALS: /tmp/gcs-credentials.json
64+
run: |
65+
echo '${{ secrets.GCS_CREDENTIALS_JSON }}' > /tmp/gcs-credentials.json
66+
python scripts/delete_images.py \
67+
--no-dry-run \
68+
--base-ref HEAD~1
69+
70+
- name: Commit updated YAMLs and remove temporary assets (push to main)
5271
if: github.event_name == 'push'
5372
run: |
54-
if [ -d data/tmp ]; then
55-
git config user.name "github-actions[bot]"
56-
git config user.email "github-actions[bot]@users.noreply.github.com"
57-
git rm -rf --ignore-unmatch data/tmp/
58-
git commit -m "chore: remove temporary assets after image migration [skip ci]" || echo "Nothing to commit"
73+
git config user.name "github-actions[bot]"
74+
git config user.email "github-actions[bot]@users.noreply.github.com"
75+
git rm -rf --ignore-unmatch data/tmp/
76+
git add -u data/materials/
77+
if git diff --staged --quiet; then
78+
echo "No changes to commit"
79+
else
80+
git commit -m "chore: migrate images to GCS and remove temporary assets [skip ci]"
81+
git pull --rebase
5982
git push
6083
fi

data/materials/generic/pla-army-green.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ primary_color:
1111
color_rgba: '#1f6b20ff'
1212
secondary_colors: []
1313
photos:
14-
- url: /tmp/assets/generic/pla-army-green/20260205110532.jpg
14+
- url: /tmp/assets/generic/pla-army-green/20260205111104.jpg
1515
type: unspecified
1616
properties: {}
1717
uuid: c6753d1b-4618-505a-aa5a-0676e55e9545
-2.63 MB
Binary file not shown.
-2.59 MB
Binary file not shown.
2.61 MB
Loading

scripts/delete_images.py

Lines changed: 262 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,262 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Script for deleting orphaned material images from Google Cloud Storage.
4+
5+
This script:
6+
1. Detects YAML files that were deleted or modified (via git diff)
7+
2. Compares old and new versions to find GCS-hosted photo URLs that were removed
8+
3. In dry-run mode: shows which GCS files would be deleted
9+
4. In actual run: deletes orphaned files from GCS
10+
11+
Environment variables required:
12+
- GOOGLE_APPLICATION_CREDENTIALS: Path to GCS service account JSON
13+
Or standard GCS authentication via gcloud
14+
"""
15+
16+
import argparse
17+
import subprocess
18+
import sys
19+
import yaml
20+
from pathlib import Path
21+
from google.cloud import storage
22+
23+
24+
class MaterialImageDeletion:
25+
# Google Cloud Storage configuration
26+
GCS_BUCKET_NAME = "prusa3d-openprinttag-prod-3e31-material-db"
27+
PUBLIC_URL_BASE = "https://files.openprinttag.org"
28+
29+
def __init__(
30+
self,
31+
materials_dir: str = "data/materials",
32+
dry_run: bool = True,
33+
):
34+
self.materials_dir = Path(materials_dir)
35+
self.dry_run = dry_run
36+
self.stats = {
37+
"files_checked": 0,
38+
"urls_to_delete": 0,
39+
"deleted": 0,
40+
"delete_failed": 0,
41+
"not_found": 0,
42+
}
43+
44+
if dry_run:
45+
self.storage_client = None
46+
self.bucket = None
47+
return
48+
49+
# Initialize GCS client
50+
try:
51+
self.storage_client = storage.Client()
52+
self.bucket = self.storage_client.bucket(self.GCS_BUCKET_NAME)
53+
print(f"✓ Connected to GCS bucket: {self.GCS_BUCKET_NAME}")
54+
except Exception as e:
55+
print(f"ERROR: Failed to initialize Google Cloud Storage client: {e}")
56+
print(
57+
"Make sure GOOGLE_APPLICATION_CREDENTIALS is set or you're authenticated via gcloud"
58+
)
59+
sys.exit(1)
60+
61+
def _extract_gcs_urls(self, data: dict | None) -> set[str]:
62+
"""Extract all GCS-hosted photo URLs from YAML data."""
63+
urls: set[str] = set()
64+
if not data:
65+
return urls
66+
for photo in data.get("photos", []):
67+
if isinstance(photo, dict):
68+
url = photo.get("url", "")
69+
else:
70+
url = photo or ""
71+
if url.startswith(self.PUBLIC_URL_BASE):
72+
urls.add(url)
73+
return urls
74+
75+
def _url_to_gcs_path(self, url: str) -> str:
76+
"""Convert public URL to GCS blob path."""
77+
return url[len(self.PUBLIC_URL_BASE):].lstrip("/")
78+
79+
def _get_old_yaml_content(self, file_path: str, base_ref: str) -> dict | None:
80+
"""Get the YAML content of a file at base_ref."""
81+
try:
82+
result = subprocess.run(
83+
["git", "show", f"{base_ref}:{file_path}"],
84+
capture_output=True,
85+
text=True,
86+
check=True,
87+
)
88+
return yaml.safe_load(result.stdout)
89+
except subprocess.CalledProcessError:
90+
return None
91+
92+
def find_orphaned_urls(
93+
self, changed_files: list[tuple[str, str]], base_ref: str
94+
) -> list[str]:
95+
"""Find GCS URLs that were removed from YAML files.
96+
97+
Args:
98+
changed_files: List of (status, file_path) tuples – status is 'D' or 'M'.
99+
base_ref: The base git ref to compare against.
100+
101+
Returns:
102+
Sorted list of orphaned GCS public URLs.
103+
"""
104+
orphaned: list[str] = []
105+
106+
for status, file_path in changed_files:
107+
self.stats["files_checked"] += 1
108+
109+
old_data = self._get_old_yaml_content(file_path, base_ref)
110+
old_urls = self._extract_gcs_urls(old_data)
111+
112+
if not old_urls:
113+
continue
114+
115+
if status == "D":
116+
# File was deleted – all its GCS URLs are orphaned
117+
new_urls: set[str] = set()
118+
else:
119+
# File was modified – find URLs that are no longer present
120+
try:
121+
new_data = yaml.safe_load(
122+
Path(file_path).read_text(encoding="utf-8")
123+
)
124+
new_urls = self._extract_gcs_urls(new_data)
125+
except Exception:
126+
new_urls = set()
127+
128+
removed = old_urls - new_urls
129+
for url in sorted(removed):
130+
print(f" 📋 Orphaned: {url}")
131+
orphaned.append(url)
132+
133+
return sorted(orphaned)
134+
135+
def run(self, changed_files: list[tuple[str, str]], base_ref: str):
136+
"""Main execution method."""
137+
if self.dry_run:
138+
print("DRY RUN – no files will be deleted from GCS.")
139+
print("Starting orphaned image cleanup...")
140+
print(f"Materials directory: {self.materials_dir}")
141+
print("-" * 60)
142+
143+
orphaned_urls = self.find_orphaned_urls(changed_files, base_ref)
144+
self.stats["urls_to_delete"] = len(orphaned_urls)
145+
146+
if not orphaned_urls:
147+
print("No orphaned images found.")
148+
self._print_summary()
149+
return
150+
151+
print(f"\nFound {len(orphaned_urls)} orphaned image(s).")
152+
print()
153+
for url in orphaned_urls:
154+
gcs_path = self._url_to_gcs_path(url)
155+
if self.dry_run:
156+
print(f" 🗑 Would delete from GCS: {gcs_path}")
157+
else:
158+
self._delete_from_gcs(gcs_path)
159+
160+
self._print_summary()
161+
162+
def _delete_from_gcs(self, gcs_path: str):
163+
"""Delete a blob from GCS."""
164+
try:
165+
blob = self.bucket.blob(gcs_path)
166+
if not blob.exists():
167+
print(f" ⚠ Not found in GCS (already deleted?): {gcs_path}")
168+
self.stats["not_found"] += 1
169+
return
170+
blob.delete()
171+
print(f" ✓ Deleted from GCS: {gcs_path}")
172+
self.stats["deleted"] += 1
173+
except Exception as e:
174+
print(f" ✗ Failed to delete {gcs_path}: {e}")
175+
self.stats["delete_failed"] += 1
176+
177+
def _print_summary(self):
178+
"""Print deletion summary."""
179+
print("\n" + "=" * 60)
180+
print("DELETION SUMMARY")
181+
print("=" * 60)
182+
print(f"Files checked: {self.stats['files_checked']}")
183+
print(f"Orphaned URLs found: {self.stats['urls_to_delete']}")
184+
if not self.dry_run:
185+
print(f"Successfully deleted: {self.stats['deleted']}")
186+
print(f"Not found in GCS: {self.stats['not_found']}")
187+
print(f"Delete failed: {self.stats['delete_failed']}")
188+
print("=" * 60)
189+
190+
191+
def _get_changed_yaml_files(base_ref: str, materials_dir: Path) -> list[tuple[str, str]]:
    """Return list of (status, path) for deleted or modified YAML files in materials_dir.

    Args:
        base_ref: Git ref to diff against; compared as ``base_ref...HEAD``.
        materials_dir: Directory the diff is restricted to.

    Returns:
        (status, path) tuples where status is 'D' or 'M' and path ends in ``.yaml``.

    Exits the process with status 1 if ``git diff`` fails.
    """
    try:
        result = subprocess.run(
            [
                # -z makes git emit raw, NUL-terminated paths. Without it,
                # paths with non-ASCII or special characters are quoted and
                # escaped, fail the ".yaml" suffix test and get skipped.
                "git", "diff", "--name-status", "--diff-filter=DM", "-z",
                f"{base_ref}...HEAD",
                "--", str(materials_dir),
            ],
            capture_output=True,
            text=True,
            encoding="utf-8",
            check=True,
        )
    except subprocess.CalledProcessError as e:
        print(f"ERROR: git diff failed: {e.stderr.strip()}")
        sys.exit(1)

    # With -z the output is "status\0path\0status\0path\0..."; the D/M filter
    # excludes renames and copies, so every record has exactly two fields.
    fields = result.stdout.split("\0")
    return [
        (status, path)
        for status, path in zip(fields[0::2], fields[1::2])
        if path.endswith(".yaml")
    ]
214+
215+
216+
def main():
    """Parse the command line, list changed material YAMLs and run the cleanup."""
    cli = argparse.ArgumentParser(description="Delete orphaned material images from GCS.")
    cli.add_argument(
        "--dry-run",
        default=True,
        action=argparse.BooleanOptionalAction,
        help="List files that would be deleted without making any changes (default: on).",
    )
    cli.add_argument(
        "--base-ref",
        required=True,
        metavar="REF",
        help="Git ref to diff against (e.g. origin/main or HEAD~1).",
    )
    cli.add_argument(
        "--materials-dir",
        metavar="DIR",
        default="data/materials",
        help="Path to materials directory (default: data/materials).",
    )
    options = cli.parse_args()

    # Build the worker first: in a real run this connects to GCS and exits
    # on failure before any git work is done.
    cleaner = MaterialImageDeletion(
        materials_dir=options.materials_dir,
        dry_run=options.dry_run,
    )
    candidates = _get_changed_yaml_files(options.base_ref, cleaner.materials_dir)

    if candidates:
        print(f"Changed/deleted files ({len(candidates)}) against {options.base_ref}:")
        for state, path in candidates:
            print(f" [{state}] {path}")
        print()
        cleaner.run(candidates, options.base_ref)
    else:
        print(
            f"No deleted/modified material files found against {options.base_ref}. Nothing to do."
        )
259+
260+
261+
if __name__ == "__main__":
262+
main()

0 commit comments

Comments
 (0)