2020 --skip-dissolve Keep individual cell polygons (no merging by anthrome code);
2121 best if you need per-cell cartogram-style outputs; larger files.
2222 --profile Folder name under processing/geojson for outputs.
23+ --boundaries Path to Natural Earth shapefile for country lookup (optional).
24+ When provided, adds 'country' (ISO3) and 'cellId' to each feature.
2325"""
2426
2527import os
3436from rasterio .transform import from_origin
3537from rasterio .warp import reproject
3638from rasterio .crs import CRS
37- from shapely .geometry import shape , mapping
39+ from shapely .geometry import shape , mapping , Point , box
3840from shapely .geometry .polygon import orient
3941from shapely .ops import unary_union
42+ from shapely import STRtree
43+ import shapefile # pyshp
4044import warnings
4145
4246# Suppress rasterio warnings
4347warnings .filterwarnings ('ignore' , category = rasterio .errors .NotGeoreferencedWarning )
4448
49+ # Global for country lookup (loaded once)
50+ _country_index = None
51+ _country_geoms = None
52+ _country_codes = None
53+
4554def parse_args ():
4655 parser = argparse .ArgumentParser (description = 'Extract/dissolve anthrome GeoJSON from GeoTIFFs' )
4756 parser .add_argument ('--input-dir' , type = Path , default = Path ('../data/HYDE-3.5/baseline/anthromes_geotiff' ),
@@ -58,8 +67,82 @@ def parse_args():
5867 help = 'Minimum object size (in pixels) to retain during sieve (0 to skip)' )
5968 parser .add_argument ('--skip-dissolve' , action = 'store_true' ,
6069 help = 'Keep individual cell polygons (no dissolve by anthrome code)' )
70+ parser .add_argument ('--boundaries' , type = Path , default = None ,
71+ help = 'Path to Natural Earth shapefile for country lookup (adds cellId and country)' )
6172 return parser .parse_args ()
6273
74+
75+ def load_country_boundaries (shp_path ):
76+ """
77+ Load Natural Earth shapefile and build spatial index for country lookup.
78+ Returns (STRtree index, list of geometries, list of ISO3 codes).
79+ """
80+ global _country_index , _country_geoms , _country_codes
81+
82+ if _country_index is not None :
83+ return _country_index , _country_geoms , _country_codes
84+
85+ print (f" Loading country boundaries from { shp_path } ..." )
86+ sf = shapefile .Reader (str (shp_path ))
87+ fields = [f [0 ] for f in sf .fields [1 :]]
88+
89+ # Find ISO_A3 field
90+ iso_idx = fields .index ('ISO_A3' ) if 'ISO_A3' in fields else None
91+ if iso_idx is None :
92+ print (" Warning: ISO_A3 field not found, country lookup disabled" )
93+ return None , None , None
94+
95+ geoms = []
96+ codes = []
97+
98+ for rec in sf .shapeRecords ():
99+ geom = shape (rec .shape .__geo_interface__ )
100+ iso3 = rec .record [iso_idx ] if iso_idx is not None else None
101+ if geom .is_valid and iso3 :
102+ geoms .append (geom )
103+ codes .append (iso3 )
104+
105+ _country_geoms = geoms
106+ _country_codes = codes
107+ _country_index = STRtree (geoms )
108+
109+ print (f" Loaded { len (geoms )} country polygons" )
110+ return _country_index , _country_geoms , _country_codes
111+
112+
113+ def compute_cell_id (centroid , transform , num_cols ):
114+ """
115+ Compute deterministic cell ID from centroid coordinates and grid transform.
116+ cellId = row * num_cols + col
117+ """
118+ origin_x = transform .c # left edge
119+ origin_y = transform .f # top edge
120+ pixel_width = transform .a
121+ pixel_height = - transform .e # negative because y decreases downward
122+
123+ col = int ((centroid .x - origin_x ) / pixel_width )
124+ row = int ((origin_y - centroid .y ) / pixel_height )
125+
126+ return row * num_cols + col
127+
128+
129+ def lookup_country (centroid , index , geoms , codes ):
130+ """
131+ Look up country ISO3 code for a point using spatial index.
132+ Returns ISO3 code or None if not found.
133+ """
134+ if index is None :
135+ return None
136+
137+ # Query spatial index for candidates
138+ candidates = index .query (centroid )
139+
140+ for i in candidates :
141+ if geoms [i ].contains (centroid ):
142+ return codes [i ]
143+
144+ return None
145+
63146def orient_geometry (geom , sign = - 1.0 ):
64147 """
65148 Safely orient Polygon/MultiPolygon (and collections) without assuming .exterior exists.
@@ -101,7 +184,8 @@ def resample_categorical(src, target_res):
101184
102185 return destination , dst_transform
103186
104- def extract_geojson_from_tif (tif_path , output_path , target_res , simplify_tol , sieve_size , skip_dissolve ):
187+ def extract_geojson_from_tif (tif_path , output_path , target_res , simplify_tol , sieve_size , skip_dissolve ,
188+ country_index = None , country_geoms = None , country_codes = None ):
105189 """
106190 Extract GeoJSON polygons from a single GeoTIFF file with optimization.
107191
@@ -111,13 +195,19 @@ def extract_geojson_from_tif(tif_path, output_path, target_res, simplify_tol, si
111195 Args:
112196 tif_path: Path to input GeoTIFF file
113197 output_path: Path to output GeoJSON file
198+ country_index: Optional STRtree for country lookup
199+ country_geoms: Optional list of country geometries
200+ country_codes: Optional list of ISO3 codes
114201 """
115202 from collections import defaultdict
116203
117204 with rasterio .open (tif_path ) as src :
118205 # Read raster data (optionally resample first)
119206 image , transform = resample_categorical (src , target_res )
120207
208+ # Compute grid dimensions for cellId calculation
209+ num_rows , num_cols = image .shape
210+
121211 # Drop very small slivers before polygonization (optional)
122212 if sieve_size and sieve_size > 0 :
123213 image = sieve (image , size = sieve_size , connectivity = 8 )
@@ -128,27 +218,50 @@ def extract_geojson_from_tif(tif_path, output_path, target_res, simplify_tol, si
128218 features = []
129219
130220 if skip_dissolve :
131- # Keep individual cell polygons (optionally simplify each cell)
132- for geom , value in shapes (image , transform = transform ):
133- anthrome_code = int (value )
134-
135- if anthrome_code == nodata_value :
136- continue
137-
138- geom_shape = shape (geom )
139- if simplify_tol and simplify_tol > 0 :
140- geom_shape = geom_shape .simplify (simplify_tol , preserve_topology = True )
141-
142- # Normalize winding so D3 interprets polygons, not complements
143- geom_shape = orient_geometry (geom_shape , sign = - 1.0 )
144-
145- features .append ({
146- 'type' : 'Feature' ,
147- 'geometry' : mapping (geom_shape ),
148- 'properties' : {
149- 'anthrome' : anthrome_code
150- }
151- })
221+ # Create individual cell polygons by iterating through the grid
222+ # This guarantees each cell is its own polygon (no merging of same-value neighbors)
223+ origin_x = transform .c # left edge
224+ origin_y = transform .f # top edge
225+ pixel_width = transform .a
226+ pixel_height = - transform .e # positive value (y decreases downward)
227+
228+ for row in range (num_rows ):
229+ for col in range (num_cols ):
230+ anthrome_code = int (image [row , col ])
231+
232+ if anthrome_code == nodata_value :
233+ continue
234+
235+ # Compute cell bounds
236+ min_x = origin_x + col * pixel_width
237+ max_x = min_x + pixel_width
238+ max_y = origin_y - row * pixel_height
239+ min_y = max_y - pixel_height
240+
241+ # Create box polygon for this cell
242+ geom_shape = box (min_x , min_y , max_x , max_y )
243+
244+ # Normalize winding so D3 interprets polygons, not complements
245+ geom_shape = orient_geometry (geom_shape , sign = - 1.0 )
246+
247+ # Compute cellId
248+ cell_id = row * num_cols + col
249+
250+ # Build properties (shortened names for file size reduction)
251+ props = {'a' : anthrome_code , 'i' : cell_id }
252+
253+ # Add country if boundary data available
254+ if country_index is not None :
255+ centroid_x = (min_x + max_x ) / 2
256+ centroid_y = (min_y + max_y ) / 2
257+ country = lookup_country (Point (centroid_x , centroid_y ), country_index , country_geoms , country_codes )
258+ props ['c' ] = country # may be None for ocean cells
259+
260+ features .append ({
261+ 'type' : 'Feature' ,
262+ 'geometry' : mapping (geom_shape ),
263+ 'properties' : props
264+ })
152265 else :
153266 # Group geometries by anthrome value
154267 grouped = defaultdict (list )
@@ -180,7 +293,7 @@ def extract_geojson_from_tif(tif_path, output_path, target_res, simplify_tol, si
180293 'type' : 'Feature' ,
181294 'geometry' : mapping (simplified ),
182295 'properties' : {
183- 'anthrome ' : anthrome_code
296+ 'a ' : anthrome_code
184297 }
185298 })
186299
@@ -224,6 +337,19 @@ def main():
224337 print (" Simplify: off" )
225338 if args .sieve_size and args .sieve_size > 0 :
226339 print (f" Sieve: drop components smaller than { args .sieve_size } px" )
340+
341+ # Load country boundaries if provided
342+ country_index , country_geoms , country_codes = None , None , None
343+ if args .boundaries :
344+ if args .boundaries .exists ():
345+ country_index , country_geoms , country_codes = load_country_boundaries (args .boundaries )
346+ if country_index :
347+ print (f" Country lookup: enabled" )
348+ else :
349+ print (f" Country lookup: failed to load" )
350+ else :
351+ print (f" ⚠️ Boundaries file not found: { args .boundaries } " )
352+
227353 print (f" Output: { output_dir } \n " )
228354
229355 total_features = 0
@@ -241,6 +367,9 @@ def main():
241367 simplify_tol = args .simplify ,
242368 sieve_size = args .sieve_size ,
243369 skip_dissolve = args .skip_dissolve ,
370+ country_index = country_index ,
371+ country_geoms = country_geoms ,
372+ country_codes = country_codes ,
244373 )
245374 total_features += feature_count
246375
0 commit comments