This script:
1. Scans all material YAML files in data/materials/
2. Extracts image URLs from the 'photos' field
3. Downloads images and saves them to tmp/assets/BRAND_SLUG/MATERIAL_SLUG/IMG_NAME
4. Uploads images to Google Cloud Storage
5. Updates YAML files with new public URLs

Or standard GCS authentication via gcloud
"""
1616
17+ import argparse
1718import os
19+ import subprocess
1820import sys
1921import yaml
2022import requests
2123from pathlib import Path
22- from urllib .parse import urlparse
2324from google .cloud import storage
2425
2526
@@ -29,10 +30,15 @@ class MaterialImageMigration:
2930 PUBLIC_URL_BASE = "https://files.openprinttag.org"
3031
3132 def __init__ (
32- self , materials_dir : str = "data/materials" , output_dir : str = "assets/tmp"
33+ self ,
34+ materials_dir : str = "data/materials" ,
35+ output_dir : str = "tmp/assets" ,
36+ dry_run : bool = True ,
3337 ):
3438 self .materials_dir = Path (materials_dir )
39+ self .data_dir = self .materials_dir .parent
3540 self .output_dir = Path (output_dir )
41+ self .dry_run = dry_run
3642 self .stats = {
3743 "total_materials" : 0 ,
3844 "materials_with_photos" : 0 ,
@@ -45,6 +51,12 @@ def __init__(
4551 "yaml_updated" : 0 ,
4652 "yaml_update_failed" : 0 ,
4753 }
54+ self .missing_files : list [str ] = []
55+
56+ if dry_run :
57+ self .storage_client = None
58+ self .bucket = None
59+ return
4860
4961 # Initialize GCS client
5062 try :
@@ -58,8 +70,15 @@ def __init__(
5870 )
5971 sys .exit (1 )
6072
61- def run (self ):
62- """Main execution method."""
73+ def run (self , files : list [Path ] | None = None ):
74+ """Main execution method.
75+
76+ Args:
77+ files: Optional list of specific YAML files to process.
78+ When None, all files in materials_dir are scanned.
79+ """
80+ if self .dry_run :
81+ print ("DRY RUN – no files will be downloaded, uploaded, or modified." )
6382 print ("Starting material image migration..." )
6483 print (f"Materials directory: { self .materials_dir } " )
6584 print (f"Output directory: { self .output_dir } " )
@@ -72,16 +91,31 @@ def run(self):
7291 # Create output directory
7392 self .output_dir .mkdir (parents = True , exist_ok = True )
7493
75- # Process all brand directories
76- for brand_dir in sorted (self .materials_dir .iterdir ()):
77- if not brand_dir .is_dir ():
78- continue
79-
80- brand_slug = brand_dir .name
81- self ._process_brand (brand_slug , brand_dir )
94+ if files is not None :
95+ # Process only the given files
96+ if not files :
97+ print ("No material files to process." )
98+ return
99+ for material_file in sorted (files ):
100+ brand_slug = material_file .parent .name
101+ self ._process_material (brand_slug , material_file )
102+ else :
103+ # Process all brand directories
104+ for brand_dir in sorted (self .materials_dir .iterdir ()):
105+ if not brand_dir .is_dir ():
106+ continue
107+ brand_slug = brand_dir .name
108+ self ._process_brand (brand_slug , brand_dir )
82109
83110 self ._print_summary ()
84111
112+ if self .dry_run and self .missing_files :
113+ print ("\n MISSING FILES – the following sources were not found:" )
114+ for path in self .missing_files :
115+ print (f" ✗ { path } " )
116+ print (f"\n { len (self .missing_files )} missing file(s). Fix the issues above before running the actual migration." )
117+ sys .exit (1 )
118+
85119 def _process_brand (self , brand_slug : str , brand_dir : Path ):
86120 """Process all materials for a given brand."""
87121 print (f"\n Processing brand: { brand_slug } " )
@@ -155,7 +189,7 @@ def _process_material(self, brand_slug: str, material_file: Path):
155189 urls_changed = True
156190
157191 # Write back updated YAML if any URLs changed
158- if urls_changed :
192+ if urls_changed and not self . dry_run :
159193 self ._update_yaml_file (material_file , data )
160194
161195 except Exception as e :
@@ -174,38 +208,73 @@ def _process_image(
174208 self .stats ["total_photos" ] += 1
175209
176210 try :
177- # Extract filename from URL
178- parsed_url = urlparse (url )
179- filename = os .path .basename (parsed_url .path )
211+ # Detect whether this is a local file path or a remote URL
212+ is_local = not url .startswith (("http://" , "https://" ))
180213
214+ # Extract filename from path/URL
215+ filename = os .path .basename (url )
181216 if not filename :
182217 filename = f"image_{ index } .jpg"
183218
184- output_path = output_dir / filename
219+ output_path = self . data_dir / url . lstrip ( "/" ) if is_local else output_dir / filename
185220
186221 # Check if already uploaded to new location
187222 new_url = f"{ self .PUBLIC_URL_BASE } /{ brand_slug } /{ material_slug } /{ filename } "
188223 if url == new_url :
189224 print (f" ✓ Already migrated: { filename } " )
190225 return url
191226
192- # Download if not exists locally
193- if not output_path .exists ():
194- print (f" ⬇ Downloading: { filename } " )
195- response = requests .get (url , timeout = 30 , stream = True )
196- response .raise_for_status ()
197-
198- # Save to file and count bytes
199- total_bytes = 0
200- with open (output_path , "wb" ) as f :
201- for chunk in response .iter_content (chunk_size = 8192 ):
202- f .write (chunk )
203- total_bytes += len (chunk )
204-
205- print (f" ✓ Downloaded: { filename } ({ total_bytes } bytes)" )
206- self .stats ["downloaded" ] += 1
207- else :
227+ if self .dry_run :
228+ if is_local :
229+ if output_path .exists ():
230+ print (f" 📁 Would upload (local): { output_path } → { new_url } " )
231+ self .stats ["skipped" ] += 1
232+ else :
233+ print (f" ✗ Local file not found: { output_path } " )
234+ self .stats ["failed" ] += 1
235+ self .missing_files .append (str (output_path ))
236+ else :
237+ # HEAD request to verify remote file exists
238+ try :
239+ response = requests .head (url , timeout = 10 , allow_redirects = True )
240+ if response .ok :
241+ print (f" ✓ Exists (remote): { filename } → would upload to { new_url } " )
242+ self .stats ["skipped" ] += 1
243+ else :
244+ print (f" ✗ Remote file not found (HTTP { response .status_code } ): { url } " )
245+ self .stats ["failed" ] += 1
246+ self .missing_files .append (url )
247+ except requests .exceptions .RequestException as e :
248+ print (f" ✗ Cannot reach remote file: { url } : { e } " )
249+ self .stats ["failed" ] += 1
250+ self .missing_files .append (url )
251+ return None
252+
253+ if is_local :
254+ # Local path – skip download, upload directly
255+ if not output_path .exists ():
256+ print (f" ✗ Local file not found: { output_path } " )
257+ self .stats ["failed" ] += 1
258+ return None
208259 self .stats ["skipped" ] += 1
260+ else :
261+ # Download if not exists locally
262+ if not output_path .exists ():
263+ print (f" ⬇ Downloading: { filename } " )
264+ response = requests .get (url , timeout = 30 , stream = True )
265+ response .raise_for_status ()
266+
267+ # Save to file and count bytes
268+ total_bytes = 0
269+ with open (output_path , "wb" ) as f :
270+ for chunk in response .iter_content (chunk_size = 8192 ):
271+ f .write (chunk )
272+ total_bytes += len (chunk )
273+
274+ print (f" ✓ Downloaded: { filename } ({ total_bytes } bytes)" )
275+ self .stats ["downloaded" ] += 1
276+ else :
277+ self .stats ["skipped" ] += 1
209278
210279 # Upload to Google Cloud Storage
211280 gcs_path = f"{ brand_slug } /{ material_slug } /{ filename } "
@@ -273,10 +342,77 @@ def _print_summary(self):
273342 print ("=" * 60 )
274343
275344
345+ def _get_changed_files (base_ref : str , materials_dir : Path ) -> list [Path ]:
346+ """Return list of changed YAML files in materials_dir since base_ref."""
347+ try :
348+ result = subprocess .run (
349+ ["git" , "diff" , "--name-only" , "--diff-filter=ACMR" , f"{ base_ref } ...HEAD" ,
350+ "--" , str (materials_dir )],
351+ capture_output = True ,
352+ text = True ,
353+ check = True ,
354+ )
355+ except subprocess .CalledProcessError as e :
356+ print (f"ERROR: git diff failed: { e .stderr .strip ()} " )
357+ sys .exit (1 )
358+
359+ paths = [
360+ Path (line )
361+ for line in result .stdout .splitlines ()
362+ if line .endswith (".yaml" )
363+ ]
364+ return paths
365+
366+
def main():
    """Entry point: parse CLI options and launch the migration."""
    arg_parser = argparse.ArgumentParser(description="Migrate material images to GCS.")
    arg_parser.add_argument(
        "--dry-run",
        action=argparse.BooleanOptionalAction,
        default=True,
        help="List files that would be uploaded without making any changes (default: on).",
    )
    arg_parser.add_argument(
        "--base-ref",
        metavar="REF",
        help="Git ref to diff against (e.g. origin/main or HEAD~1). "
        "Only changed YAML files in data/materials/ will be processed.",
    )
    arg_parser.add_argument(
        "--files",
        nargs="+",
        metavar="FILE",
        help="Explicit list of YAML files to process (used when --base-ref is not set).",
    )
    arg_parser.add_argument(
        "--materials-dir",
        default="data/materials",
        metavar="DIR",
        help="Path to materials directory (default: data/materials).",
    )
    opts = arg_parser.parse_args()

    migration = MaterialImageMigration(
        materials_dir=opts.materials_dir,
        dry_run=opts.dry_run,
    )

    # Decide which files to process. A git-diff selection takes priority
    # over an explicit --files list; with neither, run() scans everything.
    selected: list[Path] | None = None
    if opts.base_ref:
        selected = _get_changed_files(opts.base_ref, migration.materials_dir)
        if not selected:
            print(f"No changed material files found against {opts.base_ref}. Nothing to do.")
            return
        print(f"Changed files ({len(selected)}) against {opts.base_ref}:")
        for changed in selected:
            print(f"  {changed}")
        print()
    elif opts.files:
        selected = [Path(entry) for entry in opts.files]

    migration.run(files=selected)
280416
281417
282418if __name__ == "__main__" :
0 commit comments