55 changes: 26 additions & 29 deletions src/rasterstats/main.py
@@ -148,6 +148,8 @@ def gen_zonal_stats(
         warnings.warn("Use `band` to specify band number", DeprecationWarning)
         band = band_num
 
+    perc = {pctile: get_percentile(pctile) for pctile in stats if pctile.startswith('percentile_')}
+
     with Raster(raster, affine, nodata, band) as rast:
         features_iter = read_features(vectors, layer)
         for _, feat in enumerate(features_iter):
@@ -205,14 +207,10 @@ def gen_zonal_stats(
                 if 'count' in stats:  # special case, zero makes sense here
                     feature_stats['count'] = 0
             else:
+                valid = masked.compressed()
+                keys, counts = np.unique(valid, return_counts=True)
@groutr (Author), Nov 9, 2022:

The array of unique elements, keys, has the very useful property of being sorted. We can take advantage of that for several stats and thus eliminate two O(n) passes (min, max) over the raster data. We can also use it to avoid a sort when computing the median.
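
To illustrate the property (a standalone sketch, not code from this PR; the sample array is made up):

    import numpy as np

    masked = np.ma.masked_equal(np.array([4, 2, 9, 4, 2, 2, -1]), -1)
    valid = masked.compressed()  # drop masked pixels
    keys, counts = np.unique(valid, return_counts=True)

    # np.unique returns keys in ascending order, so min and max are just
    # the endpoints; no separate O(n) passes over the data are needed:
    vmin, vmax = keys[0], keys[-1]

    # repeating each key by its count reproduces the valid data already
    # sorted, so the median can be read off without calling np.sort:
    sortedvalid = np.repeat(keys, counts)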

@perrygeo (Owner), Jan 15, 2023:

Nice idea. I like this approach. Two comments:

  • We could eliminate the np.unique call when the stats do not require it. I guess most stats do now... but we could avoid it entirely if the requested stats were only mean, count, or std (see the sketch after this comment).

  • This impacts memory usage. Keeping the entire keys and counts arrays in memory for the remainder of the loop might be a bad tradeoff in some cases. Prior to this PR, the unique stat was the only one that called np.unique, and users had to opt in. That stat is intended for categorical rasters; using it on continuous values would not make much sense. Now we're calling np.unique on every invocation and using it to derive other stats, which is fine in theory. But for continuous rasters, or those with 16+ bits per band, there are unlikely to be any duplicates, meaning we allocate keys and counts arrays that are each roughly the length of the masked array itself (a possible 200% increase in memory footprint). This causes a performance regression for several use cases: code which never had to call np.unique now does, and allocates huge chunks of memory as a result. Multiple O(n) passes have the advantage of not requiring new allocations unless unique is specifically requested.

So while this may speed up some use cases, it may slow down others and will certainly cause additional resource usage. I'll have to do some more testing to quantify the impact and evaluate the tradeoffs.
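
For reference, a minimal sketch of the gating suggested in the first bullet (hypothetical code, not part of this PR; the NEEDS_UNIQUE grouping and the summarize name are assumptions):

    import numpy as np

    # stats that benefit from the sorted unique keys/counts (assumed set)
    NEEDS_UNIQUE = {'min', 'max', 'sum', 'median', 'majority',
                    'minority', 'unique', 'range'}

    def summarize(masked, stats):
        valid = masked.compressed()
        feature_stats = {}
        if NEEDS_UNIQUE & set(stats):
            # only pay np.unique's cost (and its allocations) when a
            # requested stat actually uses the result
            keys, counts = np.unique(valid, return_counts=True)
            if 'min' in stats:
                feature_stats['min'] = float(keys[0])
            if 'max' in stats:
                feature_stats['max'] = float(keys[-1])
        # mean, count, and std never need the unique arrays
        if 'mean' in stats:
            feature_stats['mean'] = float(valid.mean())
        if 'count' in stats:
            feature_stats['count'] = len(valid)
        if 'std' in stats:
            feature_stats['std'] = float(valid.std())
        return feature_stats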

                 if run_count:
-                    keys, counts = np.unique(masked.compressed(), return_counts=True)
-                    try:
-                        pixel_count = dict(zip([k.item() for k in keys],
-                                               [c.item() for c in counts]))
-                    except AttributeError:
-                        pixel_count = dict(zip([np.asscalar(k) for k in keys],
-                                               [np.asscalar(c) for c in counts]))
+                    pixel_count = dict(zip(keys.tolist(), counts.tolist()))
@groutr (Author), Nov 9, 2022:

This builds pixel_count in C instead of Python, and is consequently significantly faster.
Also, pixel_count should really be a list of tuples; floats do not make good keys in a hash table.
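
A small sketch of the contrast (standalone example; the arrays here are fabricated):

    import numpy as np

    keys = np.arange(1000, dtype=np.int32)
    counts = np.ones(1000, dtype=np.int64)

    # before: one Python-level .item() call per element
    slow = dict(zip([k.item() for k in keys], [c.item() for c in counts]))

    # after: ndarray.tolist() converts the whole array to native Python
    # scalars in a single C-level call, then zip consumes plain lists
    fast = dict(zip(keys.tolist(), counts.tolist()))

    assert slow == fast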

@perrygeo (Owner):

Good idea on the pixel_count calcs.

Tuples would work, though I'm hesitant to change that now due to backwards compatibility with existing code. I'd consider it, though. But in practice, categorical rasters (the only rasters it would make sense to request pixel counts on) are going to be some integer dtype. A categorical float would be exceedingly rare.

@groutr (Author):

Perhaps adding a note to the docstring that this stat is intended for categorical rasters would be enough.
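
For example, something along these lines in the docstring (hypothetical wording, not part of this PR):

    categorical : bool, optional
        Return pixel counts keyed by raster value. Intended for
        categorical (integer-typed) rasters; on continuous rasters
        nearly every pixel value is unique, so the result is large
        and rarely meaningful.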


                 if categorical:
                     feature_stats = dict(pixel_count)
@@ -222,41 +220,40 @@ def gen_zonal_stats(
                     feature_stats = {}

                 if 'min' in stats:
-                    feature_stats['min'] = float(masked.min())
+                    feature_stats['min'] = float(keys[0])
                 if 'max' in stats:
-                    feature_stats['max'] = float(masked.max())
+                    feature_stats['max'] = float(keys[-1])
                 if 'mean' in stats:
-                    feature_stats['mean'] = float(masked.mean())
+                    feature_stats['mean'] = float(valid.mean())
                 if 'count' in stats:
-                    feature_stats['count'] = int(masked.count())
+                    feature_stats['count'] = len(valid)
                 # optional
                 if 'sum' in stats:
-                    feature_stats['sum'] = float(masked.sum())
+                    feature_stats['sum'] = float((keys * counts).sum())
                 if 'std' in stats:
-                    feature_stats['std'] = float(masked.std())
+                    feature_stats['std'] = float(valid.std())
                 if 'median' in stats:
-                    feature_stats['median'] = float(np.median(masked.compressed()))
+                    # we have unique keys and counts,
+                    # which means we can avoid sorting to find the median
+                    sortedvalid = np.repeat(keys, counts)
+                    size = len(sortedvalid)
+                    if size % 2 == 1:
+                        median = sortedvalid[size//2]
+                    else:
+                        median = (sortedvalid[size//2 - 1] + sortedvalid[size//2]) / 2
+                    feature_stats['median'] = float(median)
                 if 'majority' in stats:
                     feature_stats['majority'] = float(key_assoc_val(pixel_count, max))
                 if 'minority' in stats:
                     feature_stats['minority'] = float(key_assoc_val(pixel_count, min))
                 if 'unique' in stats:
-                    feature_stats['unique'] = len(list(pixel_count.keys()))
+                    feature_stats['unique'] = len(keys)
                 if 'range' in stats:
-                    try:
-                        rmin = feature_stats['min']
-                    except KeyError:
-                        rmin = float(masked.min())
-                    try:
-                        rmax = feature_stats['max']
-                    except KeyError:
-                        rmax = float(masked.max())
-                    feature_stats['range'] = rmax - rmin
-
-                for pctile in [s for s in stats if s.startswith('percentile_')]:
-                    q = get_percentile(pctile)
-                    pctarr = masked.compressed()
-                    feature_stats[pctile] = np.percentile(pctarr, q)
+                    feature_stats['range'] = float(keys[-1]) - float(keys[0])
+
+                if perc:
+                    ptiles = np.percentile(valid, list(perc.values()))
+                    feature_stats.update(zip(perc.keys(), ptiles.tolist()))

             if 'nodata' in stats or 'nan' in stats:
                 featmasked = np.ma.MaskedArray(fsrc.array, mask=(~rv_array))
4 changes: 2 additions & 2 deletions tests/test_zonal.py
@@ -296,7 +296,7 @@ def mymean(x):

     stats = zonal_stats(polygons, raster, add_stats={'mymean': mymean})
     for i in range(len(stats)):
-        assert stats[i]['mean'] == stats[i]['mymean']
+        assert np.isclose(stats[i]['mean'], stats[i]['mymean'])


 def test_add_stats_prop():
@@ -307,7 +307,7 @@ def mymean_prop(x, prop):

     stats = zonal_stats(polygons, raster, add_stats={'mymean_prop': mymean_prop})
     for i in range(len(stats)):
-        assert stats[i]['mymean_prop'] == stats[i]['mean'] * (i+1)
+        assert np.isclose(stats[i]['mymean_prop'], stats[i]['mean'] * (i+1))


 def test_mini_raster():