Skip to content

Commit 9eeb01e

Browse files
committed
Migrate images in workflow
1 parent 15553de commit 9eeb01e

7 files changed

Lines changed: 263 additions & 46 deletions

File tree

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
# Migrates material images referenced from data/materials/.
# Pull requests get a dry run of scripts/migrate_images.py; pushes to main run
# the real migration and commit the result back to the branch.
name: Migrate Material Images

on:
  pull_request:
    branches: [main]
    paths:
      - 'data/materials/**'
  push:
    branches: [main]
    paths:
      - 'data/materials/**'

jobs:
  migrate-images:
    runs-on: ubuntu-latest
    # The last step pushes a commit, which requires write access to contents.
    permissions:
      contents: write

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          # Full history so the script can diff against the base ref.
          fetch-depth: 0
          token: ${{ secrets.GITHUB_TOKEN }}

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.12'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -e .

      - name: Dry run – show changed material files (PR)
        if: github.event_name == 'pull_request'
        run: |
          python scripts/migrate_images.py \
            --dry-run \
            --base-ref origin/${{ github.base_ref }}

      - name: Migrate images (push to main)
        if: github.event_name == 'push'
        env:
          GOOGLE_APPLICATION_CREDENTIALS: /tmp/gcs-credentials.json
          # Hand the secret over through the environment instead of expanding
          # it into the script text, so quotes inside the JSON cannot break
          # (or inject into) the shell command.
          GCS_CREDENTIALS_JSON: ${{ secrets.GCS_CREDENTIALS_JSON }}
        run: |
          printf '%s\n' "$GCS_CREDENTIALS_JSON" > "$GOOGLE_APPLICATION_CREDENTIALS"
          python scripts/migrate_images.py \
            --no-dry-run \
            --base-ref HEAD~1

      - name: Remove temporary assets and commit (push to main)
        if: github.event_name == 'push'
        run: |
          git config user.name "github-actions[bot]"
          git config user.email "github-actions[bot]@users.noreply.github.com"
          if [ -d data/tmp ]; then
            git rm -rf --ignore-unmatch data/tmp/
          fi
          # Stage the material files rewritten by the migration as well;
          # otherwise only the asset removal would be committed and the YAML
          # would keep pointing at the deleted local images.
          git add data/materials
          if git diff --cached --quiet; then
            echo "Nothing to commit"
          else
            git commit -m "chore: remove temporary assets after image migration [skip ci]"
            git push
          fi

.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,10 @@ __pycache__/
66
openprinttag/
77
db-export.json
88

9+
# Allow local image cache used by scripts/migrate_images.py
10+
!data/tmp/
11+
!data/tmp/**
12+
913
# Python build artifacts
1014
*.egg-info/
1115
*.egg
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
name: PLA Army green
2+
slug: pla-army-green
3+
brand:
4+
slug: generic
5+
class: FFF
6+
type: PLA
7+
abbreviation: ''
8+
tags: []
9+
certifications: []
10+
primary_color:
11+
color_rgba: '#1f6b20ff'
12+
secondary_colors: []
13+
photos:
14+
- url: /tmp/assets/generic/pla-army-green/20260205110532.jpg
15+
type: unspecified
16+
properties: {}
17+
uuid: c6753d1b-4618-505a-aa5a-0676e55e9545
2.63 MB
Loading
2.59 MB
Loading
Lines changed: 171 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
This script:
66
1. Scans all material YAML files in data/materials/
77
2. Extracts image URLs from the 'photos' field
8-
3. Downloads images and saves them to assets/tmp/BRAND_SLUG/MATERIAL_SLUG/IMG_NAME
8+
3. Downloads images and saves them to tmp/assets/BRAND_SLUG/MATERIAL_SLUG/IMG_NAME
99
4. Uploads images to Google Cloud Storage
1010
5. Updates YAML files with new public URLs
1111
@@ -14,12 +14,13 @@
1414
Or standard GCS authentication via gcloud
1515
"""
1616

17+
import argparse
1718
import os
19+
import subprocess
1820
import sys
1921
import yaml
2022
import requests
2123
from pathlib import Path
22-
from urllib.parse import urlparse
2324
from google.cloud import storage
2425

2526

@@ -29,10 +30,15 @@ class MaterialImageMigration:
2930
PUBLIC_URL_BASE = "https://files.openprinttag.org"
3031

3132
def __init__(
32-
self, materials_dir: str = "data/materials", output_dir: str = "assets/tmp"
33+
self,
34+
materials_dir: str = "data/materials",
35+
output_dir: str = "tmp/assets",
36+
dry_run: bool = True,
3337
):
3438
self.materials_dir = Path(materials_dir)
39+
self.data_dir = self.materials_dir.parent
3540
self.output_dir = Path(output_dir)
41+
self.dry_run = dry_run
3642
self.stats = {
3743
"total_materials": 0,
3844
"materials_with_photos": 0,
@@ -45,6 +51,12 @@ def __init__(
4551
"yaml_updated": 0,
4652
"yaml_update_failed": 0,
4753
}
54+
self.missing_files: list[str] = []
55+
56+
if dry_run:
57+
self.storage_client = None
58+
self.bucket = None
59+
return
4860

4961
# Initialize GCS client
5062
try:
@@ -58,8 +70,15 @@ def __init__(
5870
)
5971
sys.exit(1)
6072

61-
def run(self):
62-
"""Main execution method."""
73+
def run(self, files: list[Path] | None = None):
74+
"""Main execution method.
75+
76+
Args:
77+
files: Optional list of specific YAML files to process.
78+
When None, all files in materials_dir are scanned.
79+
"""
80+
if self.dry_run:
81+
print("DRY RUN – no files will be downloaded, uploaded, or modified.")
6382
print("Starting material image migration...")
6483
print(f"Materials directory: {self.materials_dir}")
6584
print(f"Output directory: {self.output_dir}")
@@ -72,16 +91,31 @@ def run(self):
7291
# Create output directory
7392
self.output_dir.mkdir(parents=True, exist_ok=True)
7493

75-
# Process all brand directories
76-
for brand_dir in sorted(self.materials_dir.iterdir()):
77-
if not brand_dir.is_dir():
78-
continue
79-
80-
brand_slug = brand_dir.name
81-
self._process_brand(brand_slug, brand_dir)
94+
if files is not None:
95+
# Process only the given files
96+
if not files:
97+
print("No material files to process.")
98+
return
99+
for material_file in sorted(files):
100+
brand_slug = material_file.parent.name
101+
self._process_material(brand_slug, material_file)
102+
else:
103+
# Process all brand directories
104+
for brand_dir in sorted(self.materials_dir.iterdir()):
105+
if not brand_dir.is_dir():
106+
continue
107+
brand_slug = brand_dir.name
108+
self._process_brand(brand_slug, brand_dir)
82109

83110
self._print_summary()
84111

112+
if self.dry_run and self.missing_files:
113+
print("\nMISSING FILES – the following sources were not found:")
114+
for path in self.missing_files:
115+
print(f" ✗ {path}")
116+
print(f"\n{len(self.missing_files)} missing file(s). Fix the issues above before running the actual migration.")
117+
sys.exit(1)
118+
85119
def _process_brand(self, brand_slug: str, brand_dir: Path):
86120
"""Process all materials for a given brand."""
87121
print(f"\nProcessing brand: {brand_slug}")
@@ -155,7 +189,7 @@ def _process_material(self, brand_slug: str, material_file: Path):
155189
urls_changed = True
156190

157191
# Write back updated YAML if any URLs changed
158-
if urls_changed:
192+
if urls_changed and not self.dry_run:
159193
self._update_yaml_file(material_file, data)
160194

161195
except Exception as e:
@@ -174,38 +208,73 @@ def _process_image(
174208
self.stats["total_photos"] += 1
175209

176210
try:
177-
# Extract filename from URL
178-
parsed_url = urlparse(url)
179-
filename = os.path.basename(parsed_url.path)
211+
# Detect whether this is a local file path or a remote URL
212+
is_local = not url.startswith(("http://", "https://"))
180213

214+
# Extract filename from path/URL
215+
filename = os.path.basename(url)
181216
if not filename:
182217
filename = f"image_{index}.jpg"
183218

184-
output_path = output_dir / filename
219+
output_path = self.data_dir / url.lstrip("/") if is_local else output_dir / filename
185220

186221
# Check if already uploaded to new location
187222
new_url = f"{self.PUBLIC_URL_BASE}/{brand_slug}/{material_slug}/{filename}"
188223
if url == new_url:
189224
print(f" ✓ Already migrated: {filename}")
190225
return url
191226

192-
# Download if not exists locally
193-
if not output_path.exists():
194-
print(f" ⬇ Downloading: {filename}")
195-
response = requests.get(url, timeout=30, stream=True)
196-
response.raise_for_status()
197-
198-
# Save to file and count bytes
199-
total_bytes = 0
200-
with open(output_path, "wb") as f:
201-
for chunk in response.iter_content(chunk_size=8192):
202-
f.write(chunk)
203-
total_bytes += len(chunk)
204-
205-
print(f" ✓ Downloaded: {filename} ({total_bytes} bytes)")
206-
self.stats["downloaded"] += 1
207-
else:
227+
if self.dry_run:
228+
if is_local:
229+
if output_path.exists():
230+
print(f" 📁 Would upload (local): {output_path}{new_url}")
231+
self.stats["skipped"] += 1
232+
else:
233+
print(f" ✗ Local file not found: {output_path}")
234+
self.stats["failed"] += 1
235+
self.missing_files.append(str(output_path))
236+
else:
237+
# HEAD request to verify remote file exists
238+
try:
239+
response = requests.head(url, timeout=10, allow_redirects=True)
240+
if response.ok:
241+
print(f" ✓ Exists (remote): {filename} → would upload to {new_url}")
242+
self.stats["skipped"] += 1
243+
else:
244+
print(f" ✗ Remote file not found (HTTP {response.status_code}): {url}")
245+
self.stats["failed"] += 1
246+
self.missing_files.append(url)
247+
except requests.exceptions.RequestException as e:
248+
print(f" ✗ Cannot reach remote file: {url}: {e}")
249+
self.stats["failed"] += 1
250+
self.missing_files.append(url)
251+
return None
252+
253+
if is_local:
254+
# Local path – skip download, upload directly
255+
if not output_path.exists():
256+
print(f" ✗ Local file not found: {output_path}")
257+
self.stats["failed"] += 1
258+
return None
208259
self.stats["skipped"] += 1
260+
else:
261+
# Download if not exists locally
262+
if not output_path.exists():
263+
print(f" ⬇ Downloading: {filename}")
264+
response = requests.get(url, timeout=30, stream=True)
265+
response.raise_for_status()
266+
267+
# Save to file and count bytes
268+
total_bytes = 0
269+
with open(output_path, "wb") as f:
270+
for chunk in response.iter_content(chunk_size=8192):
271+
f.write(chunk)
272+
total_bytes += len(chunk)
273+
274+
print(f" ✓ Downloaded: {filename} ({total_bytes} bytes)")
275+
self.stats["downloaded"] += 1
276+
else:
277+
self.stats["skipped"] += 1
209278

210279
# Upload to Google Cloud Storage
211280
gcs_path = f"{brand_slug}/{material_slug}/{filename}"
@@ -273,10 +342,77 @@ def _print_summary(self):
273342
print("=" * 60)
274343

275344

345+
def _get_changed_files(base_ref: str, materials_dir: Path) -> list[Path]:
    """Return the YAML files under materials_dir that changed since base_ref.

    Runs ``git diff --name-only --diff-filter=ACMR <base_ref>...HEAD`` limited
    to ``materials_dir``, so only added, copied, modified, or renamed files are
    reported; deleted files are excluded by the filter.

    Args:
        base_ref: Git ref to diff against (e.g. ``origin/main`` or ``HEAD~1``).
        materials_dir: Directory passed to git as the pathspec.

    Returns:
        The changed paths ending in ``.yaml``, exactly as printed by git and in
        git's output order. Empty when nothing matched.

    Raises:
        SystemExit: With status 1 when git cannot be started or the diff fails.
    """
    command = [
        "git",
        "diff",
        "--name-only",
        # NUL-separated output: without -z git C-quotes paths that contain
        # non-ASCII or special characters, and such quoted names would fail
        # the ".yaml" suffix check below and be silently skipped.
        "-z",
        "--diff-filter=ACMR",
        f"{base_ref}...HEAD",
        "--",
        str(materials_dir),
    ]
    try:
        result = subprocess.run(
            command,
            capture_output=True,
            text=True,
            check=True,
        )
    except FileNotFoundError:
        # git is not installed / not on PATH: report it like any other failure
        # instead of surfacing a raw traceback.
        print("ERROR: git diff failed: git executable not found")
        sys.exit(1)
    except subprocess.CalledProcessError as e:
        print(f"ERROR: git diff failed: {(e.stderr or '').strip()}")
        sys.exit(1)

    # The output ends with a NUL, so split() yields a trailing empty string;
    # the suffix check drops it together with non-YAML files.
    return [
        Path(name)
        for name in result.stdout.split("\0")
        if name.endswith(".yaml")
    ]
365+
366+
276367
def main():
    """Command-line entry point: parse options, select files, run the migration.

    File selection order: when --base-ref is given, the YAML files changed
    against that ref are used (and the function returns early if there are
    none); otherwise an explicit --files list is used; otherwise ``None`` is
    passed to ``run`` so that the whole materials directory is scanned.
    """
    # Declared as data so the options read as a table; order is the help order.
    option_specs = (
        (
            "--dry-run",
            {
                "action": argparse.BooleanOptionalAction,
                "default": True,
                "help": "List files that would be uploaded without making any changes (default: on).",
            },
        ),
        (
            "--base-ref",
            {
                "metavar": "REF",
                "help": "Git ref to diff against (e.g. origin/main or HEAD~1). "
                "Only changed YAML files in data/materials/ will be processed.",
            },
        ),
        (
            "--files",
            {
                "nargs": "+",
                "metavar": "FILE",
                "help": "Explicit list of YAML files to process (used when --base-ref is not set).",
            },
        ),
        (
            "--materials-dir",
            {
                "default": "data/materials",
                "metavar": "DIR",
                "help": "Path to materials directory (default: data/materials).",
            },
        ),
    )

    cli = argparse.ArgumentParser(description="Migrate material images to GCS.")
    for flag, settings in option_specs:
        cli.add_argument(flag, **settings)
    options = cli.parse_args()

    migration = MaterialImageMigration(
        materials_dir=options.materials_dir,
        dry_run=options.dry_run,
    )

    if options.base_ref:
        selected = _get_changed_files(options.base_ref, migration.materials_dir)
        if not selected:
            print(f"No changed material files found against {options.base_ref}. Nothing to do.")
            return
        print(f"Changed files ({len(selected)}) against {options.base_ref}:")
        for changed in selected:
            print(f" {changed}")
        print()
    elif options.files:
        selected = list(map(Path, options.files))
    else:
        # No restriction requested: let run() scan every brand directory.
        selected = None

    migration.run(files=selected)
280416

281417

282418
if __name__ == "__main__":

0 commit comments

Comments
 (0)