From 75eecf2929a0aa20535bc6d01ed7b803965c4eea Mon Sep 17 00:00:00 2001
From: Ryan May
Date: Thu, 25 Sep 2025 14:55:13 -0600
Subject: [PATCH] Make WPC parser more fault tolerant (Fix #3921)

When parsing a part of the file fails, allow parsing to continue with the
remaining parts.
---
 src/metpy/io/text.py  | 83 +++++++++++++++++++++++--------------------
 tests/io/test_text.py | 16 +++++++++
 2 files changed, 61 insertions(+), 38 deletions(-)

diff --git a/src/metpy/io/text.py b/src/metpy/io/text.py
index a922f34cbda..2700f21e6a4 100644
--- a/src/metpy/io/text.py
+++ b/src/metpy/io/text.py
@@ -5,6 +5,7 @@
 
 import contextlib
 from datetime import datetime, timezone
+import logging
 import re
 import string
 
@@ -16,6 +17,8 @@
 
 exporter = Exporter(globals())
 
+log = logging.getLogger(__name__)
+
 
 def _decode_coords(coordinates):
     """Turn a string of coordinates from WPC coded surface bulletin into a lon/lat tuple.
@@ -107,44 +110,48 @@ def parse_wpc_surface_bulletin(bulletin, year=None):
         # A single file may have multiple sets of data that are valid at different times. Set
         # the valid_time string that will correspond to all the following lines parsed, until
         # the next valid_time is found.
-        if parts[0] in ('VALID', 'SURFACE PROG VALID'):
-            dtstr = parts[-1]
-            valid_time = valid_time.replace(year=year or valid_time.year, month=int(dtstr[:2]),
-                                            day=int(dtstr[2:4]), hour=int(dtstr[4:6]),
-                                            minute=0, second=0, microsecond=0)
-        else:
-            feature, *info = parts
-            if feature in {'HIGHS', 'LOWS'}:
-                # For each pressure center, add its data as a new row
-                # While ideally these occur in pairs, some bulletins have had multiple
-                # locations for a single center strength value. So instead walk one at a time
-                # and keep track of the most recent strength.
-                strength = np.nan
-                for item in info:
-                    if len(item) <= 4 and item[0] in {'8', '9', '1'}:
-                        strength = int(item)
+        try:
+            if parts[0] in ('VALID', 'SURFACE PROG VALID'):
+                dtstr = parts[-1]
+                valid_time = valid_time.replace(year=year or valid_time.year,
+                                                month=int(dtstr[:2]), day=int(dtstr[2:4]),
+                                                hour=int(dtstr[4:6]), minute=0, second=0,
+                                                microsecond=0)
+            else:
+                feature, *info = parts
+                if feature in {'HIGHS', 'LOWS'}:
+                    # For each pressure center, add its data as a new row
+                    # While ideally these occur in pairs, some bulletins have had multiple
+                    # locations for a single center strength value. So instead walk one at a
+                    # time and keep track of the most recent strength.
+                    strength = np.nan
+                    for item in info:
+                        if len(item) <= 4 and item[0] in {'8', '9', '1'}:
+                            strength = int(item)
+                        else:
+                            parsed_text.append((valid_time, feature.rstrip('S'), strength,
+                                                Point(_decode_coords(item))))
+                elif feature in {'WARM', 'COLD', 'STNRY', 'OCFNT', 'TROF'}:
+                    # Some bulletins include 'WK', 'MDT', or 'STG' to indicate the front's
+                    # strength. If present, separate it from the rest of the info, which gives
+                    # the position of the front.
+                    if info[0][0] in string.ascii_letters:
+                        strength, *boundary = info
                     else:
-                        parsed_text.append((valid_time, feature.rstrip('S'), strength,
-                                            Point(_decode_coords(item))))
-            elif feature in {'WARM', 'COLD', 'STNRY', 'OCFNT', 'TROF'}:
-                # Some bulletins include 'WK', 'MDT', or 'STG' to indicate the front's
-                # strength. If present, separate it from the rest of the info, which gives the
-                # position of the front.
-                if info[0][0] in string.ascii_letters:
-                    strength, *boundary = info
-                else:
-                    strength, boundary = np.nan, info
-
-                # Create a list of Points and create Line from points, if possible
-                boundary = [Point(_decode_coords(point)) for point in boundary]
-                boundary = LineString(boundary) if len(boundary) > 1 else boundary[0]
-
-                # Add new row in the data for each front
-                parsed_text.append((valid_time, feature, strength, boundary))
-            # Look for a year at the end of the line (from the product header)
-            elif (year is None and len(info) >= 2 and re.match(r'\d{4}', info[-1])
-                  and re.match(r'\d{2}', info[-2])):
-                with contextlib.suppress(ValueError):
-                    year = int(info[-1])
+                        strength, boundary = np.nan, info
+
+                    # Create a list of Points and create Line from points, if possible
+                    boundary = [Point(_decode_coords(point)) for point in boundary]
+                    boundary = LineString(boundary) if len(boundary) > 1 else boundary[0]
+
+                    # Add new row in the data for each front
+                    parsed_text.append((valid_time, feature, strength, boundary))
+                # Look for a year at the end of the line (from the product header)
+                elif (year is None and len(info) >= 2 and re.match(r'\d{4}', info[-1])
+                      and re.match(r'\d{2}', info[-2])):
+                    with contextlib.suppress(ValueError):
+                        year = int(info[-1])
+        except ValueError:
+            log.warning('Could not parse: %s', ' '.join(parts))
 
     return pd.DataFrame(parsed_text, columns=['valid', 'feature', 'strength', 'geometry'])
diff --git a/tests/io/test_text.py b/tests/io/test_text.py
index 14b6ee5c03f..75e82737efe 100644
--- a/tests/io/test_text.py
+++ b/tests/io/test_text.py
@@ -98,3 +98,19 @@ def test_negative_lat():
     """)
     df = parse_wpc_surface_bulletin(sample)
     assert df.geometry[0] == sgeom.Point([-51, -3])
+
+
+@needs_module('shapely')
+def test_bad_line_continue(caplog):
+    """Test decoding of a file with some bad characters."""
+    from io import BytesIO
+
+    sample = BytesIO(b"""VALID 062818Z
+HIGHS 1022 3961069 1020 3851069 1026 3750773 1022 4430845 1019 5520728
+LOWS 1016 4510934 1002 3441145 1003 4271229 1002 4471230 1009 4631181
+TROF 2971023 2831018 2691008 I2531003
+TROF 2911100 2681082 2511055 2431024
+    """)
+    df = parse_wpc_surface_bulletin(sample)
+    assert len(df) == 11
+    assert 'Could not parse' in caplog.text