Skip to content

Commit 44d98f7

Browse files
committed
some more overview filtering
1 parent 3d9dfb1 commit 44d98f7

3 files changed

Lines changed: 146 additions & 26 deletions

File tree

python/themachinethatgoesping/pingprocessing/overview/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from .overlap_filter import (
1313
filter_by_spatial_overlap,
1414
filter_by_temporal_overlap,
15+
filter_by_speed,
1516
subset_overview,
1617
)
1718

python/themachinethatgoesping/pingprocessing/overview/overlap_filter.py

Lines changed: 65 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -85,15 +85,38 @@ def subset_overview(overview: PingOverview, indices) -> PingOverview:
8585
for key, values in overview.variables.items():
8686
subset.variables[key] = [values[i] for i in indices]
8787

88-
# Rebuild file-path lookup with only the paths actually referenced
89-
# by the kept pings, and remap file_path_index to new indices.
90-
old_indices = subset.variables.get("file_path_index", [])
91-
used_old = sorted(set(old_indices))
92-
old_to_new = {old: new for new, old in enumerate(used_old)}
88+
# Rebuild primary file-path lookup with only referenced paths.
89+
old_primary = subset.variables.get("primary_file_path_index", [])
90+
used_primary = sorted(set(old_primary))
91+
primary_old_to_new = {old: new for new, old in enumerate(used_primary)}
9392

94-
subset._file_paths = [overview._file_paths[i] for i in used_old]
95-
subset._file_path_map = {fp: i for i, fp in enumerate(subset._file_paths)}
96-
subset.variables["file_path_index"] = [old_to_new[i] for i in old_indices]
93+
subset._primary_file_paths = [
94+
overview._primary_file_paths[i] for i in used_primary
95+
]
96+
subset._primary_file_path_map = {
97+
fp: i for i, fp in enumerate(subset._primary_file_paths)
98+
}
99+
subset.variables["primary_file_path_index"] = [
100+
primary_old_to_new[i] for i in old_primary
101+
]
102+
103+
# Rebuild all-file-paths lookup with only referenced paths.
104+
used_all = set()
105+
for idx_list in subset.variables.get("file_path_indices", []):
106+
used_all.update(idx_list)
107+
used_all = sorted(used_all)
108+
all_old_to_new = {old: new for new, old in enumerate(used_all)}
109+
110+
subset._all_file_paths = [
111+
overview._all_file_paths[i] for i in used_all
112+
]
113+
subset._all_file_path_map = {
114+
fp: i for i, fp in enumerate(subset._all_file_paths)
115+
}
116+
subset.variables["file_path_indices"] = [
117+
[all_old_to_new[i] for i in idx_list]
118+
for idx_list in subset.variables.get("file_path_indices", [])
119+
]
97120

98121
subset.original_indices = indices
99122
return subset
@@ -305,3 +328,37 @@ def filter_by_temporal_overlap(
305328
subset_overview(ov, np.where(mask)[0])
306329
for ov, mask in zip(overviews, masks)
307330
]
331+
332+
333+
def filter_by_speed(
    overview: PingOverview,
    min_knots: float = None,
    max_knots: float = None,
) -> PingOverview:
    """
    Filter a PingOverview by the per-ping speed it reports.

    The speeds are taken from ``overview.get_speed_in_knots()`` and a ping
    is kept only when its speed lies within [*min_knots*, *max_knots*].
    A bound left as ``None`` is not applied; with both bounds ``None``
    every ping is kept.

    Parameters
    ----------
    overview : PingOverview
        The overview to filter.
    min_knots : float, optional
        Minimum speed in knots (inclusive). Pings below this are removed.
    max_knots : float, optional
        Maximum speed in knots (inclusive). Pings above this are removed.

    Returns
    -------
    PingOverview
        Result of ``subset_overview`` for the indices of the kept pings.

    Notes
    -----
    A NaN speed compares false against any bound, so such pings are
    dropped as soon as at least one bound is given.
    """
    # Coerce to an ndarray so the element-wise comparisons below also work
    # when the speeds arrive as a plain Python sequence.
    speed = np.asarray(overview.get_speed_in_knots())

    keep = np.ones(len(speed), dtype=bool)
    if min_knots is not None:
        keep &= speed >= min_knots
    if max_knots is not None:
        keep &= speed <= max_knots

    return subset_overview(overview, np.where(keep)[0])

python/themachinethatgoesping/pingprocessing/overview/pingoverview.py

Lines changed: 80 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -59,8 +59,10 @@ def __init__(self, ping_list: List = None, progress: bool = False) -> None:
5959
"""
6060
self.variables = defaultdict(list)
6161
self.stats = defaultdict(dict)
62-
self._file_paths = [] # unique file paths (small list)
63-
self._file_path_map = {} # path → index (for fast lookup during add)
62+
self._primary_file_paths = [] # unique primary file paths
63+
self._primary_file_path_map = {} # primary path → index
64+
self._all_file_paths = [] # unique file paths (primary + secondary)
65+
self._all_file_path_map = {} # path → index
6466

6567
if ping_list is not None:
6668
self.add_ping_list(ping_list, progress)
@@ -170,11 +172,24 @@ def add_ping(self, ping) -> None:
170172
self.variables["latitude"].append(geolocation.latitude)
171173
self.variables["longitude"].append(geolocation.longitude)
172174

173-
file_path = ping.file_data.get_primary_file_path()
174-
if file_path not in self._file_path_map:
175-
self._file_path_map[file_path] = len(self._file_paths)
176-
self._file_paths.append(file_path)
177-
self.variables["file_path_index"].append(self._file_path_map[file_path])
175+
# Primary file path (one per ping)
176+
primary_path = ping.file_data.get_primary_file_path()
177+
if primary_path not in self._primary_file_path_map:
178+
self._primary_file_path_map[primary_path] = len(self._primary_file_paths)
179+
self._primary_file_paths.append(primary_path)
180+
self.variables["primary_file_path_index"].append(
181+
self._primary_file_path_map[primary_path]
182+
)
183+
184+
# All file paths (primary + secondary, e.g. water column + bottom)
185+
all_paths = ping.file_data.get_file_paths()
186+
path_indices = []
187+
for fp in all_paths:
188+
if fp not in self._all_file_path_map:
189+
self._all_file_path_map[fp] = len(self._all_file_paths)
190+
self._all_file_paths.append(fp)
191+
path_indices.append(self._all_file_path_map[fp])
192+
self.variables["file_path_indices"].append(path_indices)
178193

179194
stats = defaultdict(dict)
180195

@@ -281,20 +296,31 @@ def get_median(self, key: str) -> float:
281296

282297
return self.stats[key]["median"]
283298

299+
def get_primary_file_paths(self) -> List[str]:
    """
    Return the list of unique primary file paths.

    Returns
    -------
    List[str]
        Unique primary file paths referenced by pings in this overview.
        This is the internal ``_primary_file_paths`` list itself, not a
        copy, so callers should treat it as read-only.
    """
    return self._primary_file_paths
309+
284310
def get_file_paths(self) -> List[str]:
    """
    Return all unique file paths (primary and secondary).

    Returns
    -------
    List[str]
        All unique file paths referenced by pings in this overview.
        This is the internal ``_all_file_paths`` list itself, not a copy,
        so callers should treat it as read-only.
    """
    return self._all_file_paths
294320

295-
def get_file_path(self, ping_index: int) -> str:
321+
def get_primary_file_path(self, ping_index: int) -> str:
296322
"""
297-
Return the file path for a specific ping by its index.
323+
Return the primary file path for a specific ping by its index.
298324
299325
Parameters
300326
----------
@@ -304,22 +330,58 @@ def get_file_path(self, ping_index: int) -> str:
304330
Returns
305331
-------
306332
str
307-
The file path of the ping.
333+
The primary file path of the ping.
334+
"""
335+
return self._primary_file_paths[
336+
self.variables["primary_file_path_index"][ping_index]
337+
]
338+
339+
def get_file_paths_for_ping(self, ping_index: int) -> List[str]:
    """
    Return all file paths for a specific ping.

    Parameters
    ----------
    ping_index : int
        Index of the ping in this overview.

    Returns
    -------
    List[str]
        All file paths associated with the ping, in the order their
        indices are stored in ``variables["file_path_indices"]``.
    """
    # Each ping stores a list of indices into the shared path table.
    path_indices = self.variables["file_path_indices"][ping_index]
    all_paths = self._all_file_paths
    return [all_paths[idx] for idx in path_indices]
357+
358+
def get_pings_per_primary_file_path(self) -> dict:
    """
    Return a mapping from primary file path to list of ping indices.

    Returns
    -------
    dict
        Dictionary mapping primary_file_path → list of ping indices.
        Only paths referenced by at least one ping appear as keys; the
        ping indices under each key are in ascending order.
    """
    primary_paths = self._primary_file_paths
    pings_by_path = defaultdict(list)
    for ping_no, path_idx in enumerate(self.variables["primary_file_path_index"]):
        pings_by_path[primary_paths[path_idx]].append(ping_no)
    # Hand back a plain dict so lookups of unknown paths raise KeyError
    # instead of silently creating empty entries.
    return dict(pings_by_path)
310371

311372
def get_pings_per_file_path(self) -> dict:
    """
    Return a mapping from file path (any) to list of ping indices.

    A ping is listed under every path index stored for it in
    ``variables["file_path_indices"]``, so one ping index can appear
    under several keys.

    Returns
    -------
    dict
        Dictionary mapping file_path → list of ping indices.
    """
    all_paths = self._all_file_paths
    pings_by_path = defaultdict(list)
    for ping_no, path_indices in enumerate(self.variables["file_path_indices"]):
        for path_idx in path_indices:
            pings_by_path[all_paths[path_idx]].append(ping_no)
    # Hand back a plain dict so lookups of unknown paths raise KeyError
    # instead of silently creating empty entries.
    return dict(pings_by_path)
324386

325387
def _get_minmax_per_file(self, key: str) -> dict:
@@ -338,7 +400,7 @@ def _get_minmax_per_file(self, key: str) -> dict:
338400
"""
339401
vals = self.variables[key]
340402
result = {}
341-
for fp, indices in self.get_pings_per_file_path().items():
403+
for fp, indices in self.get_pings_per_primary_file_path().items():
342404
file_vals = [vals[i] for i in indices]
343405
result[fp] = (min(file_vals), max(file_vals))
344406
return result

0 commit comments

Comments
 (0)