55This script:
661. Scans all material YAML files in data/materials/
772. Extracts image URLs from the 'photos' field
8- 3. Downloads images and saves them to assets/ tmp/BRAND_SLUG/MATERIAL_SLUG/IMG_NAME
8+ 3. Downloads images and saves them to tmp/assets/BRAND_SLUG/MATERIAL_SLUG/IMG_NAME
994. Uploads images to Google Cloud Storage
10105. Updates YAML files with new public URLs
1111
1414Or standard GCS authentication via gcloud
1515"""
1616
17+ import argparse
1718import os
19+ import subprocess
1820import sys
1921import yaml
2022import requests
2123from pathlib import Path
22- from urllib .parse import urlparse
2324from google .cloud import storage
2425
2526
@@ -29,10 +30,14 @@ class MaterialImageMigration:
2930 PUBLIC_URL_BASE = "https://files.openprinttag.org"
3031
3132 def __init__ (
32- self , materials_dir : str = "data/materials" , output_dir : str = "assets/tmp"
33+ self ,
34+ materials_dir : str = "data/materials" ,
35+ output_dir : str = "tmp/assets" ,
36+ dry_run : bool = True ,
3337 ):
3438 self .materials_dir = Path (materials_dir )
3539 self .output_dir = Path (output_dir )
40+ self .dry_run = dry_run
3641 self .stats = {
3742 "total_materials" : 0 ,
3843 "materials_with_photos" : 0 ,
@@ -45,6 +50,12 @@ def __init__(
4550 "yaml_updated" : 0 ,
4651 "yaml_update_failed" : 0 ,
4752 }
53+ self .missing_files : list [str ] = []
54+
55+ if dry_run :
56+ self .storage_client = None
57+ self .bucket = None
58+ return
4859
4960 # Initialize GCS client
5061 try :
@@ -58,8 +69,15 @@ def __init__(
5869 )
5970 sys .exit (1 )
6071
61- def run (self ):
62- """Main execution method."""
72+ def run (self , files : list [Path ] | None = None ):
73+ """Main execution method.
74+
75+ Args:
76+ files: Optional list of specific YAML files to process.
77+ When None, all files in materials_dir are scanned.
78+ """
79+ if self .dry_run :
80+ print ("DRY RUN – no files will be downloaded, uploaded, or modified." )
6381 print ("Starting material image migration..." )
6482 print (f"Materials directory: { self .materials_dir } " )
6583 print (f"Output directory: { self .output_dir } " )
@@ -72,16 +90,31 @@ def run(self):
7290 # Create output directory
7391 self .output_dir .mkdir (parents = True , exist_ok = True )
7492
75- # Process all brand directories
76- for brand_dir in sorted (self .materials_dir .iterdir ()):
77- if not brand_dir .is_dir ():
78- continue
79-
80- brand_slug = brand_dir .name
81- self ._process_brand (brand_slug , brand_dir )
93+ if files is not None :
94+ # Process only the given files
95+ if not files :
96+ print ("No material files to process." )
97+ return
98+ for material_file in sorted (files ):
99+ brand_slug = material_file .parent .name
100+ self ._process_material (brand_slug , material_file )
101+ else :
102+ # Process all brand directories
103+ for brand_dir in sorted (self .materials_dir .iterdir ()):
104+ if not brand_dir .is_dir ():
105+ continue
106+ brand_slug = brand_dir .name
107+ self ._process_brand (brand_slug , brand_dir )
82108
83109 self ._print_summary ()
84110
111+ if self .dry_run and self .missing_files :
112+ print ("\n MISSING FILES – the following sources were not found:" )
113+ for path in self .missing_files :
114+ print (f" ✗ { path } " )
115+ print (f"\n { len (self .missing_files )} missing file(s). Fix the issues above before running the actual migration." )
116+ sys .exit (1 )
117+
85118 def _process_brand (self , brand_slug : str , brand_dir : Path ):
86119 """Process all materials for a given brand."""
87120 print (f"\n Processing brand: { brand_slug } " )
@@ -155,7 +188,7 @@ def _process_material(self, brand_slug: str, material_file: Path):
155188 urls_changed = True
156189
157190 # Write back updated YAML if any URLs changed
158- if urls_changed :
191+ if urls_changed and not self . dry_run :
159192 self ._update_yaml_file (material_file , data )
160193
161194 except Exception as e :
@@ -174,38 +207,73 @@ def _process_image(
174207 self .stats ["total_photos" ] += 1
175208
176209 try :
177- # Extract filename from URL
178- parsed_url = urlparse (url )
179- filename = os .path .basename (parsed_url .path )
210+ # Detect whether this is a local file path or a remote URL
211+ is_local = not url .startswith (("http://" , "https://" ))
180212
213+ # Extract filename from path/URL
214+ filename = os .path .basename (url )
181215 if not filename :
182216 filename = f"image_{ index } .jpg"
183217
184- output_path = output_dir / filename
218+ output_path = Path ( url ) if is_local else output_dir / filename
185219
186220 # Check if already uploaded to new location
187221 new_url = f"{ self .PUBLIC_URL_BASE } /{ brand_slug } /{ material_slug } /{ filename } "
188222 if url == new_url :
189223 print (f" ✓ Already migrated: { filename } " )
190224 return url
191225
192- # Download if not exists locally
193- if not output_path .exists ():
194- print (f" ⬇ Downloading: { filename } " )
195- response = requests .get (url , timeout = 30 , stream = True )
196- response .raise_for_status ()
197-
198- # Save to file and count bytes
199- total_bytes = 0
200- with open (output_path , "wb" ) as f :
201- for chunk in response .iter_content (chunk_size = 8192 ):
202- f .write (chunk )
203- total_bytes += len (chunk )
204-
205- print (f" ✓ Downloaded: { filename } ({ total_bytes } bytes)" )
206- self .stats ["downloaded" ] += 1
207- else :
226+ if self .dry_run :
227+ if is_local :
228+ if output_path .exists ():
229+ print (f" 📁 Would upload (local): { output_path } → { new_url } " )
230+ self .stats ["skipped" ] += 1
231+ else :
232+ print (f" ✗ Local file not found: { output_path } " )
233+ self .stats ["failed" ] += 1
234+ self .missing_files .append (str (output_path ))
235+ else :
236+ # HEAD request to verify remote file exists
237+ try :
238+ response = requests .head (url , timeout = 10 , allow_redirects = True )
239+ if response .ok :
240+ print (f" ✓ Exists (remote): { filename } → would upload to { new_url } " )
241+ self .stats ["skipped" ] += 1
242+ else :
243+ print (f" ✗ Remote file not found (HTTP { response .status_code } ): { url } " )
244+ self .stats ["failed" ] += 1
245+ self .missing_files .append (url )
246+ except requests .exceptions .RequestException as e :
247+ print (f" ✗ Cannot reach remote file: { url } : { e } " )
248+ self .stats ["failed" ] += 1
249+ self .missing_files .append (url )
250+ return None
251+
252+ if is_local :
253+ # Local path – skip download, upload directly
254+ if not output_path .exists ():
255+ print (f" ✗ Local file not found: { output_path } " )
256+ self .stats ["failed" ] += 1
257+ return None
208258 self .stats ["skipped" ] += 1
259+ else :
260+ # Download if not exists locally
261+ if not output_path .exists ():
262+ print (f" ⬇ Downloading: { filename } " )
263+ response = requests .get (url , timeout = 30 , stream = True )
264+ response .raise_for_status ()
265+
266+ # Save to file and count bytes
267+ total_bytes = 0
268+ with open (output_path , "wb" ) as f :
269+ for chunk in response .iter_content (chunk_size = 8192 ):
270+ f .write (chunk )
271+ total_bytes += len (chunk )
272+
273+ print (f" ✓ Downloaded: { filename } ({ total_bytes } bytes)" )
274+ self .stats ["downloaded" ] += 1
275+ else :
276+ self .stats ["skipped" ] += 1
209277
210278 # Upload to Google Cloud Storage
211279 gcs_path = f"{ brand_slug } /{ material_slug } /{ filename } "
@@ -273,10 +341,77 @@ def _print_summary(self):
273341 print ("=" * 60 )
274342
275343
344+ def _get_changed_files (base_ref : str , materials_dir : Path ) -> list [Path ]:
345+ """Return list of changed YAML files in materials_dir since base_ref."""
346+ try :
347+ result = subprocess .run (
348+ ["git" , "diff" , "--name-only" , "--diff-filter=ACMR" , f"{ base_ref } ...HEAD" ,
349+ "--" , str (materials_dir )],
350+ capture_output = True ,
351+ text = True ,
352+ check = True ,
353+ )
354+ except subprocess .CalledProcessError as e :
355+ print (f"ERROR: git diff failed: { e .stderr .strip ()} " )
356+ sys .exit (1 )
357+
358+ paths = [
359+ Path (line )
360+ for line in result .stdout .splitlines ()
361+ if line .endswith (".yaml" )
362+ ]
363+ return paths
364+
365+
276366def main ():
277367 """Entry point."""
278- migration = MaterialImageMigration ()
279- migration .run ()
368+ parser = argparse .ArgumentParser (description = "Migrate material images to GCS." )
369+ parser .add_argument (
370+ "--dry-run" ,
371+ action = argparse .BooleanOptionalAction ,
372+ default = True ,
373+ help = "List files that would be uploaded without making any changes (default: on)." ,
374+ )
375+ parser .add_argument (
376+ "--base-ref" ,
377+ metavar = "REF" ,
378+ help = "Git ref to diff against (e.g. origin/main or HEAD~1). "
379+ "Only changed YAML files in data/materials/ will be processed." ,
380+ )
381+ parser .add_argument (
382+ "--files" ,
383+ nargs = "+" ,
384+ metavar = "FILE" ,
385+ help = "Explicit list of YAML files to process (used when --base-ref is not set)." ,
386+ )
387+ parser .add_argument (
388+ "--materials-dir" ,
389+ default = "data/materials" ,
390+ metavar = "DIR" ,
391+ help = "Path to materials directory (default: data/materials)." ,
392+ )
393+ args = parser .parse_args ()
394+
395+ migration = MaterialImageMigration (
396+ materials_dir = args .materials_dir ,
397+ dry_run = args .dry_run ,
398+ )
399+
400+ files : list [Path ] | None = None
401+
402+ if args .base_ref :
403+ files = _get_changed_files (args .base_ref , migration .materials_dir )
404+ if not files :
405+ print (f"No changed material files found against { args .base_ref } . Nothing to do." )
406+ return
407+ print (f"Changed files ({ len (files )} ) against { args .base_ref } :" )
408+ for f in files :
409+ print (f" { f } " )
410+ print ()
411+ elif args .files :
412+ files = [Path (f ) for f in args .files ]
413+
414+ migration .run (files = files )
280415
281416
282417if __name__ == "__main__" :
0 commit comments