# Workflow: validate material image references on pull requests (dry run) and,
# on pushes to main, upload the images to GCS, rewrite the YAML URLs and commit
# the result back to the branch.
name: Migrate Material Images

on:
  pull_request:
    branches: [main]
    paths:
      - 'data/materials/**'
  push:
    branches: [main]
    paths:
      - 'data/materials/**'

jobs:
  migrate-images:
    runs-on: ubuntu-24.04

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
          # Secrets are not exposed to pull requests from forks; fall back to
          # the default token so the dry run can still check out the code.
          token: ${{ secrets.PAT_TOKEN || github.token }}

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.12'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -e .

      - name: Dry run – show changed material files (PR)
        if: github.event_name == 'pull_request'
        run: |
          python scripts/migrate_images.py \
            --dry-run \
            --base-ref origin/${{ github.base_ref }}

      - name: Migrate images (push to main)
        if: github.event_name == 'push'
        env:
          GOOGLE_APPLICATION_CREDENTIALS: /tmp/gcs-credentials.json
          # Pass values through the environment instead of interpolating them
          # into the script text: a quote inside the secret would otherwise
          # break (or inject into) the shell command.
          GCS_CREDENTIALS_JSON: ${{ secrets.GCS_CREDENTIALS_JSON }}
          # Diff against the commit main pointed at before this push so that
          # every commit of a multi-commit push is covered, not only the last.
          BEFORE_SHA: ${{ github.event.before }}
        run: |
          umask 077
          printf '%s' "$GCS_CREDENTIALS_JSON" > "$GOOGLE_APPLICATION_CREDENTIALS"
          python scripts/migrate_images.py \
            --no-dry-run \
            --base-ref "$BEFORE_SHA"

      - name: Commit migrated URLs and remove temporary assets (push to main)
        if: github.event_name == 'push'
        run: |
          git config user.name "github-actions[bot]"
          git config user.email "github-actions[bot]@users.noreply.github.com"
          # Stage the YAML files rewritten by the migration; without this the
          # commit would delete the assets while the YAML still references them.
          git add data/materials
          if [ -d data/tmp ]; then
            git rm -rf --ignore-unmatch --quiet data/tmp/
          fi
          if git diff --cached --quiet; then
            echo "Nothing to commit"
          else
            git commit -m "chore: remove temporary assets after image migration [skip ci]"
            git push
          fi
a/data/materials/generic/pla-army-green.yaml b/data/materials/generic/pla-army-green.yaml new file mode 100644 index 0000000000..0c4410a45f --- /dev/null +++ b/data/materials/generic/pla-army-green.yaml @@ -0,0 +1,17 @@ +name: PLA Army green +slug: pla-army-green +brand: + slug: generic +class: FFF +type: PLA +abbreviation: '' +tags: [] +certifications: [] +primary_color: + color_rgba: '#1f6b20ff' +secondary_colors: [] +photos: +- url: /tmp/assets/generic/pla-army-green/20260205110532.jpg + type: unspecified +properties: {} +uuid: c6753d1b-4618-505a-aa5a-0676e55e9545 diff --git a/data/tmp/assets/generic/pla-army-green/20260205110532.jpg b/data/tmp/assets/generic/pla-army-green/20260205110532.jpg new file mode 100644 index 0000000000..372877c4d2 Binary files /dev/null and b/data/tmp/assets/generic/pla-army-green/20260205110532.jpg differ diff --git a/data/tmp/assets/generic/pla-army-green/20260205110600.jpg b/data/tmp/assets/generic/pla-army-green/20260205110600.jpg new file mode 100644 index 0000000000..c4a070dd39 Binary files /dev/null and b/data/tmp/assets/generic/pla-army-green/20260205110600.jpg differ diff --git a/pyproject.toml b/pyproject.toml index 4bb85fbe43..450ba12131 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,6 +12,8 @@ dependencies = [ "PyYAML>=6.0", "jsonschema>=4.0.0", "referencing>=0.37.0", + "requests>=2.31.0", + "google-cloud-storage>=2.10.0", ] [tool.setuptools.packages.find] diff --git a/scripts/migrate_images.py b/scripts/migrate_images.py new file mode 100755 index 0000000000..3e383785f4 --- /dev/null +++ b/scripts/migrate_images.py @@ -0,0 +1,419 @@ +#!/usr/bin/env python3 +""" +Migration script for downloading material images from YAML files. + +This script: +1. Scans all material YAML files in data/materials/ +2. Extracts image URLs from the 'photos' field +3. Downloads images and saves them to tmp/assets/BRAND_SLUG/MATERIAL_SLUG/IMG_NAME +4. Uploads images to Google Cloud Storage +5. 
Updates YAML files with new public URLs + +Environment variables required: +- GOOGLE_APPLICATION_CREDENTIALS: Path to GCS service account JSON +Or standard GCS authentication via gcloud +""" + +import argparse +import os +import subprocess +import sys +import yaml +import requests +from pathlib import Path +from google.cloud import storage + + +class MaterialImageMigration: + # Google Cloud Storage configuration + GCS_BUCKET_NAME = "prusa3d-openprinttag-prod-3e31-material-db" + PUBLIC_URL_BASE = "https://files.openprinttag.org" + + def __init__( + self, + materials_dir: str = "data/materials", + output_dir: str = "tmp/assets", + dry_run: bool = True, + ): + self.materials_dir = Path(materials_dir) + self.data_dir = self.materials_dir.parent + self.output_dir = Path(output_dir) + self.dry_run = dry_run + self.stats = { + "total_materials": 0, + "materials_with_photos": 0, + "total_photos": 0, + "downloaded": 0, + "skipped": 0, + "failed": 0, + "uploaded": 0, + "upload_failed": 0, + "yaml_updated": 0, + "yaml_update_failed": 0, + } + self.missing_files: list[str] = [] + + if dry_run: + self.storage_client = None + self.bucket = None + return + + # Initialize GCS client + try: + self.storage_client = storage.Client() + self.bucket = self.storage_client.bucket(self.GCS_BUCKET_NAME) + print(f"✓ Connected to GCS bucket: {self.GCS_BUCKET_NAME}") + except Exception as e: + print(f"ERROR: Failed to initialize Google Cloud Storage client: {e}") + print( + "Make sure GOOGLE_APPLICATION_CREDENTIALS is set or you're authenticated via gcloud" + ) + sys.exit(1) + + def run(self, files: list[Path] | None = None): + """Main execution method. + + Args: + files: Optional list of specific YAML files to process. + When None, all files in materials_dir are scanned. 
+ """ + if self.dry_run: + print("DRY RUN – no files will be downloaded, uploaded, or modified.") + print("Starting material image migration...") + print(f"Materials directory: {self.materials_dir}") + print(f"Output directory: {self.output_dir}") + print("-" * 60) + + if not self.materials_dir.exists(): + print(f"ERROR: Materials directory does not exist: {self.materials_dir}") + sys.exit(1) + + # Create output directory + self.output_dir.mkdir(parents=True, exist_ok=True) + + if files is not None: + # Process only the given files + if not files: + print("No material files to process.") + return + for material_file in sorted(files): + brand_slug = material_file.parent.name + self._process_material(brand_slug, material_file) + else: + # Process all brand directories + for brand_dir in sorted(self.materials_dir.iterdir()): + if not brand_dir.is_dir(): + continue + brand_slug = brand_dir.name + self._process_brand(brand_slug, brand_dir) + + self._print_summary() + + if self.dry_run and self.missing_files: + print("\nMISSING FILES – the following sources were not found:") + for path in self.missing_files: + print(f" ✗ {path}") + print(f"\n{len(self.missing_files)} missing file(s). 
Fix the issues above before running the actual migration.") + sys.exit(1) + + def _process_brand(self, brand_slug: str, brand_dir: Path): + """Process all materials for a given brand.""" + print(f"\nProcessing brand: {brand_slug}") + + for material_file in sorted(brand_dir.glob("*.yaml")): + self._process_material(brand_slug, material_file) + + def _process_material(self, brand_slug: str, material_file: Path): + """Process a single material YAML file.""" + self.stats["total_materials"] += 1 + + try: + with open(material_file, "r", encoding="utf-8") as f: + data = yaml.safe_load(f) + + if not data: + return + + material_slug = data.get("slug") + if not material_slug: + print(f" WARNING: No slug found in {material_file}") + return + + photos = data.get("photos", []) + if not photos: + return + + # Check if all URLs are already migrated + all_migrated = True + for photo in photos: + if isinstance(photo, dict): + url = photo.get("url", "") + else: + url = photo or "" + + if not url.startswith(self.PUBLIC_URL_BASE): + all_migrated = False + break + + if all_migrated: + print(f" ⏭ Material already migrated: {material_slug}") + return + + self.stats["materials_with_photos"] += 1 + print(f" Material: {material_slug} ({len(photos)} photo(s))") + + # Create material directory + material_output_dir = self.output_dir / brand_slug / material_slug + material_output_dir.mkdir(parents=True, exist_ok=True) + + # Track if any URLs changed + urls_changed = False + + # Download, upload, and update each photo + for idx, photo in enumerate(photos): + if isinstance(photo, dict): + old_url = photo.get("url") + else: + old_url = photo + + if old_url: + new_url = self._process_image( + old_url, brand_slug, material_slug, material_output_dir, idx + ) + if new_url and new_url != old_url: + # Update URL in data structure + if isinstance(photo, dict): + photo["url"] = new_url + else: + photos[idx] = new_url + urls_changed = True + + # Write back updated YAML if any URLs changed + if 
urls_changed and not self.dry_run: + self._update_yaml_file(material_file, data) + + except Exception as e: + print(f" ERROR processing {material_file}: {e}") + self.stats["failed"] += 1 + + def _process_image( + self, + url: str, + brand_slug: str, + material_slug: str, + output_dir: Path, + index: int, + ) -> str | None: + """Download, upload to GCS, and return new public URL.""" + self.stats["total_photos"] += 1 + + try: + # Detect whether this is a local file path or a remote URL + is_local = not url.startswith(("http://", "https://")) + + # Extract filename from path/URL + filename = os.path.basename(url) + if not filename: + filename = f"image_{index}.jpg" + + output_path = self.data_dir / url.lstrip("/") if is_local else output_dir / filename + + # Check if already uploaded to new location + new_url = f"{self.PUBLIC_URL_BASE}/{brand_slug}/{material_slug}/{filename}" + if url == new_url: + print(f" ✓ Already migrated: {filename}") + return url + + if self.dry_run: + if is_local: + if output_path.exists(): + print(f" 📁 Would upload (local): {output_path} → {new_url}") + self.stats["skipped"] += 1 + else: + print(f" ✗ Local file not found: {output_path}") + self.stats["failed"] += 1 + self.missing_files.append(str(output_path)) + else: + # HEAD request to verify remote file exists + try: + response = requests.head(url, timeout=10, allow_redirects=True) + if response.ok: + print(f" ✓ Exists (remote): {filename} → would upload to {new_url}") + self.stats["skipped"] += 1 + else: + print(f" ✗ Remote file not found (HTTP {response.status_code}): {url}") + self.stats["failed"] += 1 + self.missing_files.append(url) + except requests.exceptions.RequestException as e: + print(f" ✗ Cannot reach remote file: {url}: {e}") + self.stats["failed"] += 1 + self.missing_files.append(url) + return None + + if is_local: + # Local path – skip download, upload directly + if not output_path.exists(): + print(f" ✗ Local file not found: {output_path}") + self.stats["failed"] += 1 + 
return None + self.stats["skipped"] += 1 + else: + # Download if not exists locally + if not output_path.exists(): + print(f" ⬇ Downloading: {filename}") + response = requests.get(url, timeout=30, stream=True) + response.raise_for_status() + + # Save to file and count bytes + total_bytes = 0 + with open(output_path, "wb") as f: + for chunk in response.iter_content(chunk_size=8192): + f.write(chunk) + total_bytes += len(chunk) + + print(f" ✓ Downloaded: {filename} ({total_bytes} bytes)") + self.stats["downloaded"] += 1 + else: + self.stats["skipped"] += 1 + + # Upload to Google Cloud Storage + gcs_path = f"{brand_slug}/{material_slug}/{filename}" + blob = self.bucket.blob(gcs_path) + + # Check if already exists in GCS + if blob.exists(): + print(f" ⏭ Already in GCS: {gcs_path}") + return new_url + + print(f" ⬆ Uploading to GCS: {gcs_path}") + blob.upload_from_filename(str(output_path)) + + # Make blob publicly accessible + blob.make_public() + + print(f" ✓ Uploaded to GCS: {new_url}") + self.stats["uploaded"] += 1 + + return new_url + + except requests.exceptions.RequestException as e: + print(f" ✗ Failed to download {url}: {e}") + self.stats["failed"] += 1 + return None + except Exception as e: + print(f" ✗ Error processing {url}: {e}") + self.stats["upload_failed"] += 1 + return None + + def _update_yaml_file(self, yaml_file: Path, data: dict): + """Update YAML file with new data.""" + try: + with open(yaml_file, "w", encoding="utf-8") as f: + yaml.dump( + data, + f, + allow_unicode=True, + sort_keys=False, + default_flow_style=False, + ) + + print(f" ✓ Updated YAML: {yaml_file.name}") + self.stats["yaml_updated"] += 1 + + except Exception as e: + print(f" ✗ Failed to update YAML {yaml_file}: {e}") + self.stats["yaml_update_failed"] += 1 + + def _print_summary(self): + """Print migration summary.""" + print("\n" + "=" * 60) + print("MIGRATION SUMMARY") + print("=" * 60) + print(f"Total materials scanned: {self.stats['total_materials']}") + print(f"Materials with 
photos: {self.stats['materials_with_photos']}") + print(f"Total photos found: {self.stats['total_photos']}") + print(f"Successfully downloaded: {self.stats['downloaded']}") + print(f"Skipped (already local): {self.stats['skipped']}") + print(f"Download failed: {self.stats['failed']}") + print(f"Uploaded to GCS: {self.stats['uploaded']}") + print(f"Upload failed: {self.stats['upload_failed']}") + print(f"YAML files updated: {self.stats['yaml_updated']}") + print(f"YAML update failed: {self.stats['yaml_update_failed']}") + print("=" * 60) + + +def _get_changed_files(base_ref: str, materials_dir: Path) -> list[Path]: + """Return list of changed YAML files in materials_dir since base_ref.""" + try: + result = subprocess.run( + ["git", "diff", "--name-only", "--diff-filter=ACMR", f"{base_ref}...HEAD", + "--", str(materials_dir)], + capture_output=True, + text=True, + check=True, + ) + except subprocess.CalledProcessError as e: + print(f"ERROR: git diff failed: {e.stderr.strip()}") + sys.exit(1) + + paths = [ + Path(line) + for line in result.stdout.splitlines() + if line.endswith(".yaml") + ] + return paths + + +def main(): + """Entry point.""" + parser = argparse.ArgumentParser(description="Migrate material images to GCS.") + parser.add_argument( + "--dry-run", + action=argparse.BooleanOptionalAction, + default=True, + help="List files that would be uploaded without making any changes (default: on).", + ) + parser.add_argument( + "--base-ref", + metavar="REF", + help="Git ref to diff against (e.g. origin/main or HEAD~1). 
" + "Only changed YAML files in data/materials/ will be processed.", + ) + parser.add_argument( + "--files", + nargs="+", + metavar="FILE", + help="Explicit list of YAML files to process (used when --base-ref is not set).", + ) + parser.add_argument( + "--materials-dir", + default="data/materials", + metavar="DIR", + help="Path to materials directory (default: data/materials).", + ) + args = parser.parse_args() + + migration = MaterialImageMigration( + materials_dir=args.materials_dir, + dry_run=args.dry_run, + ) + + files: list[Path] | None = None + + if args.base_ref: + files = _get_changed_files(args.base_ref, migration.materials_dir) + if not files: + print(f"No changed material files found against {args.base_ref}. Nothing to do.") + return + print(f"Changed files ({len(files)}) against {args.base_ref}:") + for f in files: + print(f" {f}") + print() + elif args.files: + files = [Path(f) for f in args.files] + + migration.run(files=files) + + +if __name__ == "__main__": + main() diff --git a/tests/test_migrate.py b/tests/test_migrate.py new file mode 100644 index 0000000000..8625890c3e --- /dev/null +++ b/tests/test_migrate.py @@ -0,0 +1,225 @@ +""" +Tests for migrate.py script - idempotency and URL handling. 
class TestMigrateIdempotency(unittest.TestCase):
    """Test that migrate script is idempotent and skips already migrated materials.

    Every test runs the migration with ``dry_run=False`` against mocked GCS
    and ``requests.get`` objects, so the real upload path is exercised and
    no network access happens.
    """

    def setUp(self):
        """Create an isolated materials/output tree that is removed after the test."""
        self._tmp = tempfile.TemporaryDirectory()
        self.addCleanup(self._tmp.cleanup)
        self.temp_dir = self._tmp.name
        self.materials_dir = Path(self.temp_dir) / "materials"
        self.output_dir = Path(self.temp_dir) / "output"
        self.materials_dir.mkdir(parents=True)

    def _write_material(self, photos=None):
        """Write test-brand/test-material.yaml, optionally with ``photos``, and return its path."""
        brand_dir = self.materials_dir / "test-brand"
        brand_dir.mkdir()

        material_data = {
            "uuid": "test-uuid",
            "slug": "test-material",
            "brand": {"slug": "test-brand"},
            "name": "Test Material",
        }
        if photos is not None:
            material_data["photos"] = photos

        material_file = brand_dir / "test-material.yaml"
        with open(material_file, "w", encoding="utf-8") as f:
            yaml.dump(material_data, f)
        return material_file

    def _migrate(self, material_file):
        """Process ``material_file`` with mocked GCS/HTTP; return (migration, mock_get, mock_blob)."""
        with patch('migrate_images.storage.Client') as mock_storage, \
                patch('migrate_images.requests.get') as mock_get:
            # Setup GCS mocks: the blob does not exist yet, so an upload is expected
            mock_blob = MagicMock()
            mock_blob.exists.return_value = False
            mock_storage.return_value.bucket.return_value.blob.return_value = mock_blob

            # Setup requests mock
            mock_response = MagicMock()
            mock_response.iter_content.return_value = [b"test image data"]
            mock_get.return_value = mock_response

            # dry_run defaults to True, which would bypass the mocks above and
            # issue a real HEAD request; run the actual migration path instead.
            migration = MaterialImageMigration(
                materials_dir=str(self.materials_dir),
                output_dir=str(self.output_dir),
                dry_run=False,
            )
            migration._process_material("test-brand", material_file)

        return migration, mock_get, mock_blob

    @staticmethod
    def _photo_urls(material_file):
        """Return the photo URLs currently stored in ``material_file``."""
        with open(material_file, "r", encoding="utf-8") as f:
            data = yaml.safe_load(f)
        return [photo["url"] for photo in data.get("photos", [])]

    def test_skip_already_migrated_material(self):
        """Test that materials with new URLs are skipped."""
        material_file = self._write_material(photos=[
            {
                "url": "https://files.openprinttag.org/test-brand/test-material/image.jpg",
                "type": "unspecified"
            }
        ])

        migration, mock_get, mock_blob = self._migrate(material_file)

        # Material should be skipped (not counted in materials_with_photos)
        self.assertEqual(migration.stats["materials_with_photos"], 0)
        self.assertEqual(migration.stats["total_materials"], 1)
        mock_get.assert_not_called()
        mock_blob.upload_from_filename.assert_not_called()

    def test_process_not_migrated_material(self):
        """Test that materials with old URLs are processed."""
        material_file = self._write_material(photos=[
            {
                "url": "https://old-server.com/image.jpg",
                "type": "unspecified"
            }
        ])

        migration, mock_get, mock_blob = self._migrate(material_file)

        # Material should be processed
        self.assertEqual(migration.stats["materials_with_photos"], 1)
        self.assertEqual(migration.stats["total_materials"], 1)
        mock_get.assert_called_once()
        mock_blob.upload_from_filename.assert_called_once()
        self.assertEqual(migration.stats["uploaded"], 1)
        self.assertEqual(
            self._photo_urls(material_file),
            ["https://files.openprinttag.org/test-brand/test-material/image.jpg"],
        )

    def test_mixed_urls_in_material(self):
        """Test material with mix of old and new URLs."""
        material_file = self._write_material(photos=[
            {
                "url": "https://files.openprinttag.org/test-brand/test-material/image1.jpg",
                "type": "unspecified"
            },
            {
                "url": "https://old-server.com/image2.jpg",
                "type": "unspecified"
            }
        ])

        migration, mock_get, mock_blob = self._migrate(material_file)

        # Material should be processed (not all URLs are migrated)
        self.assertEqual(migration.stats["materials_with_photos"], 1)
        self.assertEqual(migration.stats["total_materials"], 1)
        # Only the old URL is downloaded and uploaded
        mock_get.assert_called_once()
        self.assertEqual(migration.stats["uploaded"], 1)
        self.assertEqual(
            self._photo_urls(material_file),
            [
                "https://files.openprinttag.org/test-brand/test-material/image1.jpg",
                "https://files.openprinttag.org/test-brand/test-material/image2.jpg",
            ],
        )

    def test_no_photos_material(self):
        """Test material without photos."""
        material_file = self._write_material()

        migration, mock_get, mock_blob = self._migrate(material_file)

        # Material should be counted but not processed
        self.assertEqual(migration.stats["materials_with_photos"], 0)
        self.assertEqual(migration.stats["total_materials"], 1)
        mock_get.assert_not_called()
        mock_blob.upload_from_filename.assert_not_called()