Skip to content

Commit 7161650

Browse files
committed
Migrate images in workflow
1 parent 4b11ae0 commit 7161650

6 files changed

Lines changed: 252 additions & 46 deletions

File tree

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
# Validates (pull requests) and performs (pushes to main) the migration of
# material images referenced from data/materials/** to Google Cloud Storage.
name: Migrate Material Images

on:
  pull_request:
    branches: [main]
    paths:
      - 'data/materials/**'
  push:
    branches: [main]
    paths:
      - 'data/materials/**'

jobs:
  migrate-images:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          # Full history is required so the script can diff against the base ref.
          fetch-depth: 0

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.12'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -e .

      - name: Dry run – show changed material files (PR)
        if: github.event_name == 'pull_request'
        env:
          # Pass context values through the environment instead of splicing
          # them into the script text, so they are never parsed as shell code.
          BASE_REF: ${{ github.base_ref }}
        run: |
          python scripts/migrate_images.py \
            --dry-run \
            --base-ref "origin/${BASE_REF}"

      - name: Migrate images (push to main)
        if: github.event_name == 'push'
        env:
          GOOGLE_APPLICATION_CREDENTIALS: /tmp/gcs-credentials.json
          # The secret is handed over via the environment: interpolating it
          # into `echo '...'` breaks (and can execute arbitrary text) as soon
          # as the JSON contains a single quote.
          GCS_CREDENTIALS_JSON: ${{ secrets.GCS_CREDENTIALS_JSON }}
          # Commit that main pointed at before this push. Diffing against it
          # covers every commit of a multi-commit push; HEAD~1 would only
          # cover the last one.
          BEFORE_SHA: ${{ github.event.before }}
        run: |
          # Create the credentials file readable by the current user only.
          ( umask 077; printf '%s' "$GCS_CREDENTIALS_JSON" > "$GOOGLE_APPLICATION_CREDENTIALS" )
          python scripts/migrate_images.py \
            --no-dry-run \
            --base-ref "$BEFORE_SHA"
          # NOTE(review): the script rewrites photo URLs in the material YAML
          # files, but nothing here commits those changes back – confirm
          # whether that is intended.

.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,10 @@ __pycache__/
66
openprinttag/
77
db-export.json
88

9+
# Allow local image cache used by scripts/migrate_images.py
10+
!data/tmp/
11+
!data/tmp/**
12+
913
# Python build artifacts
1014
*.egg-info/
1115
*.egg
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
name: PLA Army green
2+
slug: pla-army-green
3+
brand:
4+
slug: generic
5+
class: FFF
6+
type: PLA
7+
abbreviation: ''
8+
tags: []
9+
certifications: []
10+
primary_color:
11+
color_rgba: '#1f6b20ff'
12+
secondary_colors: []
13+
photos:
14+
- url: /tmp/assets/generic/pla-army-green/20260205110532.jpg
15+
type: unspecified
16+
- url: http://example.com/generic/pla-army-green/20260205110342.jpg
17+
type: unspecified
18+
properties: {}
19+
uuid: c6753d1b-4618-505a-aa5a-0676e55e9545
2.59 MB
Loading
Lines changed: 170 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
This script:
66
1. Scans all material YAML files in data/materials/
77
2. Extracts image URLs from the 'photos' field
8-
3. Downloads images and saves them to assets/tmp/BRAND_SLUG/MATERIAL_SLUG/IMG_NAME
8+
3. Downloads images and saves them to tmp/assets/BRAND_SLUG/MATERIAL_SLUG/IMG_NAME
99
4. Uploads images to Google Cloud Storage
1010
5. Updates YAML files with new public URLs
1111
@@ -14,12 +14,13 @@
1414
Or standard GCS authentication via gcloud
1515
"""
1616

17+
import argparse
1718
import os
19+
import subprocess
1820
import sys
1921
import yaml
2022
import requests
2123
from pathlib import Path
22-
from urllib.parse import urlparse
2324
from google.cloud import storage
2425

2526

@@ -29,10 +30,14 @@ class MaterialImageMigration:
2930
PUBLIC_URL_BASE = "https://files.openprinttag.org"
3031

3132
def __init__(
32-
self, materials_dir: str = "data/materials", output_dir: str = "assets/tmp"
33+
self,
34+
materials_dir: str = "data/materials",
35+
output_dir: str = "tmp/assets",
36+
dry_run: bool = True,
3337
):
3438
self.materials_dir = Path(materials_dir)
3539
self.output_dir = Path(output_dir)
40+
self.dry_run = dry_run
3641
self.stats = {
3742
"total_materials": 0,
3843
"materials_with_photos": 0,
@@ -45,6 +50,12 @@ def __init__(
4550
"yaml_updated": 0,
4651
"yaml_update_failed": 0,
4752
}
53+
self.missing_files: list[str] = []
54+
55+
if dry_run:
56+
self.storage_client = None
57+
self.bucket = None
58+
return
4859

4960
# Initialize GCS client
5061
try:
@@ -58,8 +69,15 @@ def __init__(
5869
)
5970
sys.exit(1)
6071

61-
def run(self):
62-
"""Main execution method."""
72+
def run(self, files: list[Path] | None = None):
73+
"""Main execution method.
74+
75+
Args:
76+
files: Optional list of specific YAML files to process.
77+
When None, all files in materials_dir are scanned.
78+
"""
79+
if self.dry_run:
80+
print("DRY RUN – no files will be downloaded, uploaded, or modified.")
6381
print("Starting material image migration...")
6482
print(f"Materials directory: {self.materials_dir}")
6583
print(f"Output directory: {self.output_dir}")
@@ -72,16 +90,31 @@ def run(self):
7290
# Create output directory
7391
self.output_dir.mkdir(parents=True, exist_ok=True)
7492

75-
# Process all brand directories
76-
for brand_dir in sorted(self.materials_dir.iterdir()):
77-
if not brand_dir.is_dir():
78-
continue
79-
80-
brand_slug = brand_dir.name
81-
self._process_brand(brand_slug, brand_dir)
93+
if files is not None:
94+
# Process only the given files
95+
if not files:
96+
print("No material files to process.")
97+
return
98+
for material_file in sorted(files):
99+
brand_slug = material_file.parent.name
100+
self._process_material(brand_slug, material_file)
101+
else:
102+
# Process all brand directories
103+
for brand_dir in sorted(self.materials_dir.iterdir()):
104+
if not brand_dir.is_dir():
105+
continue
106+
brand_slug = brand_dir.name
107+
self._process_brand(brand_slug, brand_dir)
82108

83109
self._print_summary()
84110

111+
if self.dry_run and self.missing_files:
112+
print("\nMISSING FILES – the following sources were not found:")
113+
for path in self.missing_files:
114+
print(f" ✗ {path}")
115+
print(f"\n{len(self.missing_files)} missing file(s). Fix the issues above before running the actual migration.")
116+
sys.exit(1)
117+
85118
def _process_brand(self, brand_slug: str, brand_dir: Path):
86119
"""Process all materials for a given brand."""
87120
print(f"\nProcessing brand: {brand_slug}")
@@ -155,7 +188,7 @@ def _process_material(self, brand_slug: str, material_file: Path):
155188
urls_changed = True
156189

157190
# Write back updated YAML if any URLs changed
158-
if urls_changed:
191+
if urls_changed and not self.dry_run:
159192
self._update_yaml_file(material_file, data)
160193

161194
except Exception as e:
@@ -174,38 +207,73 @@ def _process_image(
174207
self.stats["total_photos"] += 1
175208

176209
try:
177-
# Extract filename from URL
178-
parsed_url = urlparse(url)
179-
filename = os.path.basename(parsed_url.path)
210+
# Detect whether this is a local file path or a remote URL
211+
is_local = not url.startswith(("http://", "https://"))
180212

213+
# Extract filename from path/URL
214+
filename = os.path.basename(url)
181215
if not filename:
182216
filename = f"image_{index}.jpg"
183217

184-
output_path = output_dir / filename
218+
output_path = Path(url) if is_local else output_dir / filename
185219

186220
# Check if already uploaded to new location
187221
new_url = f"{self.PUBLIC_URL_BASE}/{brand_slug}/{material_slug}/{filename}"
188222
if url == new_url:
189223
print(f" ✓ Already migrated: {filename}")
190224
return url
191225

192-
# Download if not exists locally
193-
if not output_path.exists():
194-
print(f" ⬇ Downloading: {filename}")
195-
response = requests.get(url, timeout=30, stream=True)
196-
response.raise_for_status()
197-
198-
# Save to file and count bytes
199-
total_bytes = 0
200-
with open(output_path, "wb") as f:
201-
for chunk in response.iter_content(chunk_size=8192):
202-
f.write(chunk)
203-
total_bytes += len(chunk)
204-
205-
print(f" ✓ Downloaded: {filename} ({total_bytes} bytes)")
206-
self.stats["downloaded"] += 1
207-
else:
226+
if self.dry_run:
227+
if is_local:
228+
if output_path.exists():
229+
print(f" 📁 Would upload (local): {output_path}{new_url}")
230+
self.stats["skipped"] += 1
231+
else:
232+
print(f" ✗ Local file not found: {output_path}")
233+
self.stats["failed"] += 1
234+
self.missing_files.append(str(output_path))
235+
else:
236+
# HEAD request to verify remote file exists
237+
try:
238+
response = requests.head(url, timeout=10, allow_redirects=True)
239+
if response.ok:
240+
print(f" ✓ Exists (remote): {filename} → would upload to {new_url}")
241+
self.stats["skipped"] += 1
242+
else:
243+
print(f" ✗ Remote file not found (HTTP {response.status_code}): {url}")
244+
self.stats["failed"] += 1
245+
self.missing_files.append(url)
246+
except requests.exceptions.RequestException as e:
247+
print(f" ✗ Cannot reach remote file: {url}: {e}")
248+
self.stats["failed"] += 1
249+
self.missing_files.append(url)
250+
return None
251+
252+
if is_local:
253+
# Local path – skip download, upload directly
254+
if not output_path.exists():
255+
print(f" ✗ Local file not found: {output_path}")
256+
self.stats["failed"] += 1
257+
return None
208258
self.stats["skipped"] += 1
259+
else:
260+
# Download if not exists locally
261+
if not output_path.exists():
262+
print(f" ⬇ Downloading: {filename}")
263+
response = requests.get(url, timeout=30, stream=True)
264+
response.raise_for_status()
265+
266+
# Save to file and count bytes
267+
total_bytes = 0
268+
with open(output_path, "wb") as f:
269+
for chunk in response.iter_content(chunk_size=8192):
270+
f.write(chunk)
271+
total_bytes += len(chunk)
272+
273+
print(f" ✓ Downloaded: {filename} ({total_bytes} bytes)")
274+
self.stats["downloaded"] += 1
275+
else:
276+
self.stats["skipped"] += 1
209277

210278
# Upload to Google Cloud Storage
211279
gcs_path = f"{brand_slug}/{material_slug}/{filename}"
@@ -273,10 +341,77 @@ def _print_summary(self):
273341
print("=" * 60)
274342

275343

344+
def _get_changed_files(base_ref: str, materials_dir: Path) -> list[Path]:
    """Return the YAML files under materials_dir that changed since base_ref.

    Runs ``git diff --name-only --diff-filter=ACMR <base_ref>...HEAD``
    restricted to ``materials_dir``, so only added, copied, modified and
    renamed files are reported; deleted files are left out.

    Args:
        base_ref: Git ref to diff against (e.g. ``origin/main`` or ``HEAD~1``).
        materials_dir: Directory the diff is limited to.

    Returns:
        The changed paths whose name ends in ``.yaml``, exactly as git
        reports them and in git's output order.

    Exits the process with status 1 when git cannot be run or reports an
    error.
    """
    try:
        result = subprocess.run(
            [
                "git",
                "diff",
                "--name-only",
                # NUL-terminated output: without -z git C-quotes "unusual"
                # path names (non-ASCII, quotes, backslashes, ...), and such
                # entries would never pass the ".yaml" suffix check below.
                "-z",
                "--diff-filter=ACMR",
                f"{base_ref}...HEAD",
                "--",
                str(materials_dir),
            ],
            capture_output=True,
            text=True,
            check=True,
        )
    except (OSError, subprocess.CalledProcessError) as e:
        # OSError covers a missing or non-executable git binary; it carries no
        # stderr, so fall back to the exception text in that case.
        detail = getattr(e, "stderr", None) or str(e)
        print(f"ERROR: git diff failed: {detail.strip()}")
        sys.exit(1)

    return [
        Path(name)
        for name in result.stdout.split("\0")
        if name.endswith(".yaml")
    ]
364+
365+
276366
def main():
    """Parse the command line and run the image migration.

    File selection, in order of precedence:
      * ``--base-ref``: only the material YAML files changed against that ref;
        when there are none, a notice is printed and nothing is run.
      * ``--files``: exactly the listed files.
      * neither: the migration scans the whole materials directory.
    """
    cli = argparse.ArgumentParser(description="Migrate material images to GCS.")
    cli.add_argument(
        "--dry-run",
        default=True,
        action=argparse.BooleanOptionalAction,
        help="List files that would be uploaded without making any changes (default: on).",
    )
    cli.add_argument(
        "--base-ref",
        metavar="REF",
        help=(
            "Git ref to diff against (e.g. origin/main or HEAD~1). "
            "Only changed YAML files in data/materials/ will be processed."
        ),
    )
    cli.add_argument(
        "--files",
        metavar="FILE",
        nargs="+",
        help="Explicit list of YAML files to process (used when --base-ref is not set).",
    )
    cli.add_argument(
        "--materials-dir",
        metavar="DIR",
        default="data/materials",
        help="Path to materials directory (default: data/materials).",
    )
    options = cli.parse_args()

    # The migration object is built before any file selection happens.
    migration = MaterialImageMigration(
        dry_run=options.dry_run,
        materials_dir=options.materials_dir,
    )

    if not options.base_ref:
        # No ref given: use the explicit list if present, otherwise pass None
        # so that every material file is scanned.
        selected = [Path(name) for name in options.files] if options.files else None
        migration.run(files=selected)
        return

    changed = _get_changed_files(options.base_ref, migration.materials_dir)
    if not changed:
        print(f"No changed material files found against {options.base_ref}. Nothing to do.")
        return

    print(f"Changed files ({len(changed)}) against {options.base_ref}:")
    for changed_path in changed:
        print(f" {changed_path}")
    print()

    migration.run(files=changed)
280415

281416

282417
if __name__ == "__main__":

0 commit comments

Comments
 (0)