55This script:
661. Scans all material YAML files in data/materials/
772. Extracts image URLs from the 'photos' field
8- 3. Downloads images and saves them to assets/ tmp/BRAND_SLUG/MATERIAL_SLUG/IMG_NAME
8+ 3. Downloads images and saves them to tmp/assets/BRAND_SLUG/MATERIAL_SLUG/IMG_NAME
994. Uploads images to Google Cloud Storage
10105. Updates YAML files with new public URLs
1111
1414Or standard GCS authentication via gcloud
1515"""
1616
17+ import argparse
1718import os
19+ import subprocess
1820import sys
1921import yaml
2022import requests
2123from pathlib import Path
22- from urllib .parse import urlparse
2324from google .cloud import storage
2425
2526
@@ -29,10 +30,14 @@ class MaterialImageMigration:
2930 PUBLIC_URL_BASE = "https://files.openprinttag.org"
3031
3132 def __init__ (
32- self , materials_dir : str = "data/materials" , output_dir : str = "assets/tmp"
33+ self ,
34+ materials_dir : str = "data/materials" ,
35+ output_dir : str = "tmp/assets" ,
36+ dry_run : bool = True ,
3337 ):
3438 self .materials_dir = Path (materials_dir )
3539 self .output_dir = Path (output_dir )
40+ self .dry_run = dry_run
3641 self .stats = {
3742 "total_materials" : 0 ,
3843 "materials_with_photos" : 0 ,
@@ -45,6 +50,12 @@ def __init__(
4550 "yaml_updated" : 0 ,
4651 "yaml_update_failed" : 0 ,
4752 }
53+ self .missing_files : list [str ] = []
54+
55+ if dry_run :
56+ self .storage_client = None
57+ self .bucket = None
58+ return
4859
4960 # Initialize GCS client
5061 try :
@@ -58,8 +69,15 @@ def __init__(
5869 )
5970 sys .exit (1 )
6071
61- def run (self ):
62- """Main execution method."""
72+ def run (self , files : list [Path ] | None = None ):
73+ """Main execution method.
74+
75+ Args:
76+ files: Optional list of specific YAML files to process.
77+ When None, all files in materials_dir are scanned.
78+ """
79+ if self .dry_run :
80+ print ("DRY RUN – no files will be downloaded, uploaded, or modified." )
6381 print ("Starting material image migration..." )
6482 print (f"Materials directory: { self .materials_dir } " )
6583 print (f"Output directory: { self .output_dir } " )
@@ -72,16 +90,31 @@ def run(self):
7290 # Create output directory
7391 self .output_dir .mkdir (parents = True , exist_ok = True )
7492
75- # Process all brand directories
76- for brand_dir in sorted (self .materials_dir .iterdir ()):
77- if not brand_dir .is_dir ():
78- continue
79-
80- brand_slug = brand_dir .name
81- self ._process_brand (brand_slug , brand_dir )
93+ if files is not None :
94+ # Process only the given files
95+ if not files :
96+ print ("No material files to process." )
97+ return
98+ for material_file in sorted (files ):
99+ brand_slug = material_file .parent .name
100+ self ._process_material (brand_slug , material_file )
101+ else :
102+ # Process all brand directories
103+ for brand_dir in sorted (self .materials_dir .iterdir ()):
104+ if not brand_dir .is_dir ():
105+ continue
106+ brand_slug = brand_dir .name
107+ self ._process_brand (brand_slug , brand_dir )
82108
83109 self ._print_summary ()
84110
111+ if self .dry_run and self .missing_files :
112+ print ("\n MISSING FILES – the following sources were not found:" )
113+ for path in self .missing_files :
114+ print (f" ✗ { path } " )
115+ print (f"\n { len (self .missing_files )} missing file(s). Fix the issues above before running the actual migration." )
116+ sys .exit (1 )
117+
85118 def _process_brand (self , brand_slug : str , brand_dir : Path ):
86119 """Process all materials for a given brand."""
87120 print (f"\n Processing brand: { brand_slug } " )
@@ -155,7 +188,7 @@ def _process_material(self, brand_slug: str, material_file: Path):
155188 urls_changed = True
156189
157190 # Write back updated YAML if any URLs changed
158- if urls_changed :
191+ if urls_changed and not self . dry_run :
159192 self ._update_yaml_file (material_file , data )
160193
161194 except Exception as e :
@@ -174,38 +207,73 @@ def _process_image(
174207 self .stats ["total_photos" ] += 1
175208
176209 try :
177- # Extract filename from URL
178- parsed_url = urlparse (url )
179- filename = os .path .basename (parsed_url .path )
210+ # Detect whether this is a local file path or a remote URL
211+ is_local = not url .startswith (("http://" , "https://" ))
180212
213+ # Extract filename from path/URL
214+ filename = os .path .basename (url )
181215 if not filename :
182216 filename = f"image_{ index } .jpg"
183217
184- output_path = output_dir / filename
218+ output_path = Path ( url ) if is_local else output_dir / filename
185219
186220 # Check if already uploaded to new location
187221 new_url = f"{ self .PUBLIC_URL_BASE } /{ brand_slug } /{ material_slug } /{ filename } "
188222 if url == new_url :
189223 print (f" ✓ Already migrated: { filename } " )
190224 return url
191225
192- # Download if not exists locally
193- if not output_path .exists ():
194- print (f" ⬇ Downloading: { filename } " )
195- response = requests .get (url , timeout = 30 , stream = True )
196- response .raise_for_status ()
197-
198- # Save to file and count bytes
199- total_bytes = 0
200- with open (output_path , "wb" ) as f :
201- for chunk in response .iter_content (chunk_size = 8192 ):
202- f .write (chunk )
203- total_bytes += len (chunk )
204-
205- print (f" ✓ Downloaded: { filename } ({ total_bytes } bytes)" )
206- self .stats ["downloaded" ] += 1
207- else :
226+ if self .dry_run :
227+ if is_local :
228+ if output_path .exists ():
229+ print (f" 📁 Would upload (local): { output_path } → { new_url } " )
230+ self .stats ["skipped" ] += 1
231+ else :
232+ print (f" ✗ Local file not found: { output_path } " )
233+ self .stats ["failed" ] += 1
234+ self .missing_files .append (str (output_path ))
235+ else :
236+ # HEAD request to verify remote file exists
237+ try :
238+ response = requests .head (url , timeout = 10 , allow_redirects = True )
239+ if response .ok :
240+ print (f" ✓ Exists (remote): { filename } → would upload to { new_url } " )
241+ self .stats ["skipped" ] += 1
242+ else :
243+ print (f" ✗ Remote file not found (HTTP { response .status_code } ): { url } " )
244+ self .stats ["failed" ] += 1
245+ self .missing_files .append (url )
246+ except requests .exceptions .RequestException as e :
247+ print (f" ✗ Cannot reach remote file: { url } : { e } " )
248+ self .stats ["failed" ] += 1
249+ self .missing_files .append (url )
250+ return None
251+
252+ if is_local :
253+ # Local path – skip download, upload directly
254+ if not output_path .exists ():
255+ print (f" ✗ Local file not found: { output_path } " )
256+ self .stats ["failed" ] += 1
257+ return None
208258 self .stats ["skipped" ] += 1
259+ else :
260+ # Download if not exists locally
261+ if not output_path .exists ():
262+ print (f" ⬇ Downloading: { filename } " )
263+ response = requests .get (url , timeout = 30 , stream = True )
264+ response .raise_for_status ()
265+
266+ # Save to file and count bytes
267+ total_bytes = 0
268+ with open (output_path , "wb" ) as f :
269+ for chunk in response .iter_content (chunk_size = 8192 ):
270+ f .write (chunk )
271+ total_bytes += len (chunk )
272+
273+ print (f" ✓ Downloaded: { filename } ({ total_bytes } bytes)" )
274+ self .stats ["downloaded" ] += 1
275+ else :
276+ self .stats ["skipped" ] += 1
209277
210278 # Upload to Google Cloud Storage
211279 gcs_path = f"{ brand_slug } /{ material_slug } /{ filename } "
@@ -273,10 +341,77 @@ def _print_summary(self):
273341 print ("=" * 60 )
274342
275343
344+ def _get_changed_files (base_ref : str , materials_dir : Path ) -> list [Path ]:
345+ """Return list of changed YAML files in materials_dir since base_ref."""
346+ try :
347+ result = subprocess .run (
348+ ["git" , "diff" , "--name-only" , "--diff-filter=ACMR" , f"{ base_ref } ...HEAD" ,
349+ "--" , str (materials_dir )],
350+ capture_output = True ,
351+ text = True ,
352+ check = True ,
353+ )
354+ except subprocess .CalledProcessError as e :
355+ print (f"ERROR: git diff failed: { e .stderr .strip ()} " )
356+ sys .exit (1 )
357+
358+ paths = [
359+ Path (line )
360+ for line in result .stdout .splitlines ()
361+ if line .endswith (".yaml" )
362+ ]
363+ return paths
364+
365+
276366def main ():
277367 """Entry point."""
278- migration = MaterialImageMigration ()
279- migration .run ()
368+ parser = argparse .ArgumentParser (description = "Migrate material images to GCS." )
369+ parser .add_argument (
370+ "--dry-run" ,
371+ action = argparse .BooleanOptionalAction ,
372+ default = True ,
373+ help = "List files that would be uploaded without making any changes (default: on)." ,
374+ )
375+ parser .add_argument (
376+ "--base-ref" ,
377+ metavar = "REF" ,
378+ help = "Git ref to diff against (e.g. origin/main or HEAD~1). "
379+ "Only changed YAML files in data/materials/ will be processed." ,
380+ )
381+ parser .add_argument (
382+ "--files" ,
383+ nargs = "+" ,
384+ metavar = "FILE" ,
385+ help = "Explicit list of YAML files to process (used when --base-ref is not set)." ,
386+ )
387+ parser .add_argument (
388+ "--materials-dir" ,
389+ default = "data/materials" ,
390+ metavar = "DIR" ,
391+ help = "Path to materials directory (default: data/materials)." ,
392+ )
393+ args = parser .parse_args ()
394+
395+ migration = MaterialImageMigration (
396+ materials_dir = args .materials_dir ,
397+ dry_run = args .dry_run ,
398+ )
399+
400+ files : list [Path ] | None = None
401+
402+ if args .base_ref :
403+ files = _get_changed_files (args .base_ref , migration .materials_dir )
404+ if not files :
405+ print (f"No changed material files found against { args .base_ref } . Nothing to do." )
406+ return
407+ print (f"Changed files ({ len (files )} ) against { args .base_ref } :" )
408+ for f in files :
409+ print (f" { f } " )
410+ print ()
411+ elif args .files :
412+ files = [Path (f ) for f in args .files ]
413+
414+ migration .run (files = files )
280415
281416
282417if __name__ == "__main__" :
0 commit comments