From 041af2d092b0a79587146d002d823e3fea156a91 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Sat, 25 Oct 2025 00:18:03 +0100 Subject: [PATCH 01/43] Initial tests. --- lib/iris/fileformats/cf.py | 2 +- .../integration/netcdf/test_chararrays.py | 112 ++++++++++++++++++ 2 files changed, 113 insertions(+), 1 deletion(-) create mode 100644 lib/iris/tests/integration/netcdf/test_chararrays.py diff --git a/lib/iris/fileformats/cf.py b/lib/iris/fileformats/cf.py index 2b6568c315..b65ab70792 100644 --- a/lib/iris/fileformats/cf.py +++ b/lib/iris/fileformats/cf.py @@ -802,7 +802,7 @@ def cf_label_data(self, cf_data_var): label_data = self[:] if ma.isMaskedArray(label_data): - label_data = label_data.filled() + label_data = label_data.filled(b"\0") # Determine whether we have a string-valued scalar label # i.e. a character variable that only has one dimension (the length of the string). diff --git a/lib/iris/tests/integration/netcdf/test_chararrays.py b/lib/iris/tests/integration/netcdf/test_chararrays.py new file mode 100644 index 0000000000..feb93047dd --- /dev/null +++ b/lib/iris/tests/integration/netcdf/test_chararrays.py @@ -0,0 +1,112 @@ +import netCDF4 as nc +import numpy as np +import pytest + +import iris + +NX, N_STRLEN = 3, 64 +TEST_STRINGS = ["Münster", "London", "Amsterdam"] +TEST_COORD_VALS = ["bun", "éclair", "sandwich"] + + +def convert_chararray(string_array_1d, maxlen, encoding="utf-8"): + bbytes = [text.encode(encoding) for text in string_array_1d] + pad = b"\0" * maxlen + bbytes = [(x + pad)[:maxlen] for x in bbytes] + chararray = np.array([[bb[i : i + 1] for i in range(maxlen)] for bb in bbytes]) + return chararray + + +INCLUDE_COORD = True +# INCLUDE_COORD = False + + +def make_testfile(filepath, chararray, coordarray, encoding_str=None): + with nc.Dataset(filepath, "w") as ds: + ds.createDimension("x", NX) + ds.createDimension("nstr", N_STRLEN) + vx = ds.createVariable("x", int, dimensions=("x")) + vx[:] = np.arange(NX) + if INCLUDE_COORD: + 
ds.createDimension("nstr2", N_STRLEN) + v_co = ds.createVariable( + "v_co", + "S1", + dimensions=( + "x", + "nstr2", + ), + ) + v_co[:] = coordarray + if encoding_str is not None: + v_co._Encoding = encoding_str + v = ds.createVariable( + "v", + "S1", + dimensions=( + "x", + "nstr", + ), + ) + v[:] = chararray + if encoding_str is not None: + v._Encoding = encoding_str + if INCLUDE_COORD: + v.coordinates = "v_co" + + +def show_result(filepath): + from pp_utils import ncdump + + print(f"File {filepath}") + print("NCDUMP:") + ncdump(filepath, "") + # with nc.Dataset(filepath, "r") as ds: + # v = ds.variables["v"] + # print("\n----\nNetcdf data readback (basic)") + # try: + # print(repr(v[:])) + # except UnicodeDecodeError as err: + # print(repr(err)) + # print("..raw:") + # v.set_auto_chartostring(False) + # print(repr(v[:])) + print("\nAs iris cube..") + try: + cube = iris.load_cube(filepath) + print(cube) + if iris.loading.LOAD_PROBLEMS._problems: + print(iris.loading.LOAD_PROBLEMS) + print( + "\n".join(iris.loading.LOAD_PROBLEMS._problems[0].stack_trace.format()) + ) + print("-data-") + print(repr(cube.data)) + if INCLUDE_COORD: + print("-coord data-") + try: + print(repr(cube.coord("v_co").points)) + except Exception as err2: + print(repr(err2)) + except UnicodeDecodeError as err: + print(repr(err)) + + +# tsts = (None, "ascii", "utf-8", "utf-32",) +# tsts = ("utf-8",) +# tsts = ("utf-8", "utf-32",) +# tsts = ("utf-32",) +tsts = ("utf-8", "ascii", "utf-8") + + +@pytest.mark.parametrize("encoding", tsts) +def test_encodings(encoding): + print(f"\n=========\nTesting encoding: {encoding}") + filepath = f"tmp_{str(encoding)}.nc" + do_as = encoding + if encoding != "utf-32": + do_as = "utf-8" + TEST_CHARARRAY = convert_chararray(TEST_STRINGS, N_STRLEN, encoding=do_as) + TEST_COORDARRAY = convert_chararray(TEST_COORD_VALS, N_STRLEN, encoding=do_as) + make_testfile(filepath, TEST_CHARARRAY, TEST_COORDARRAY, encoding_str=encoding) + show_result(filepath) From 
65bd9ddfbca73597a86a8059f53291f2828779b6 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Sat, 25 Oct 2025 01:22:30 +0100 Subject: [PATCH 02/43] Get 'create_cf_data_variable' to call 'create_generic_cf_array_var': Mostly working? Get 'create_cf_data_variable' to call 'create_generic_cf_array_var': Mostly working? --- .../fileformats/_nc_load_rules/helpers.py | 8 +- lib/iris/fileformats/netcdf/saver.py | 158 +++++++++--------- .../integration/netcdf/test_chararrays.py | 1 + 3 files changed, 85 insertions(+), 82 deletions(-) diff --git a/lib/iris/fileformats/_nc_load_rules/helpers.py b/lib/iris/fileformats/_nc_load_rules/helpers.py index 35c2e96924..50e282db5f 100644 --- a/lib/iris/fileformats/_nc_load_rules/helpers.py +++ b/lib/iris/fileformats/_nc_load_rules/helpers.py @@ -708,13 +708,13 @@ def build_and_add_global_attributes(engine: Engine): ), ) if problem is not None: - stack_notes = problem.stack_trace.__notes__ + stack_notes = problem.stack_trace.__notes__ # type: ignore[attr-defined] if stack_notes is None: stack_notes = [] stack_notes.append( f"Skipping disallowed global attribute '{attr_name}' (see above error)" ) - problem.stack_trace.__notes__ = stack_notes + problem.stack_trace.__notes__ = stack_notes # type: ignore[attr-defined] ################################################################################ @@ -1536,14 +1536,14 @@ def build_and_add_dimension_coordinate( ) if problem is not None: coord_var_name = str(cf_coord_var.cf_name) - stack_notes = problem.stack_trace.__notes__ + stack_notes = problem.stack_trace.__notes__ # type: ignore[attr-defined] if stack_notes is None: stack_notes = [] stack_notes.append( f"Failed to create {coord_var_name} dimension coordinate:\n" f"Gracefully creating {coord_var_name!r} auxiliary coordinate instead." 
) - problem.stack_trace.__notes__ = stack_notes + problem.stack_trace.__notes__ = stack_notes # type: ignore[attr-defined] problem.handled = True _ = _add_or_capture( diff --git a/lib/iris/fileformats/netcdf/saver.py b/lib/iris/fileformats/netcdf/saver.py index 5177749c07..bd4e87471f 100644 --- a/lib/iris/fileformats/netcdf/saver.py +++ b/lib/iris/fileformats/netcdf/saver.py @@ -759,7 +759,7 @@ def _create_cf_dimensions(self, cube, dimension_names, unlimited_dimensions=None # used for a different one pass else: - dim_name = self._get_coord_variable_name(cube, coord) + dim_name = self._get_element_variable_name(cube, coord) unlimited_dim_names.append(dim_name) for dim_name in dimension_names: @@ -990,12 +990,12 @@ def _add_aux_coords( ] # Include any relevant mesh location coordinates. - mesh: MeshXY | None = getattr(cube, "mesh") - mesh_location: str | None = getattr(cube, "location") + mesh: MeshXY | None = getattr(cube, "mesh") # type: ignore[annotation-unchecked] + mesh_location: str | None = getattr(cube, "location") # type: ignore[annotation-unchecked] if mesh and mesh_location: location_coords: MeshNodeCoords | MeshEdgeCoords | MeshFaceCoords = getattr( mesh, f"{mesh_location}_coords" - ) + ) # type: ignore[annotation-unchecked] coords_to_add.extend(list(location_coords)) return self._add_inner_related_vars( @@ -1365,7 +1365,7 @@ def record_dimension(names_list, dim_name, length, matching_coords=None): if dim_name is None: # Not already present : create a unique dimension name # from the coord. - dim_name = self._get_coord_variable_name(cube, coord) + dim_name = self._get_element_variable_name(cube, coord) # Disambiguate if it has the same name as an # existing dimension. # OR if it matches an existing file variable name. 
@@ -1541,38 +1541,14 @@ def _create_cf_bounds(self, coord, cf_var, cf_name, /, *, compression_kwargs=Non ) self._lazy_stream_data(data=bounds, cf_var=cf_var_bounds) - def _get_cube_variable_name(self, cube): - """Return a CF-netCDF variable name for the given cube. - - Parameters - ---------- - cube : :class:`iris.cube.Cube` - An instance of a cube for which a CF-netCDF variable - name is required. - - Returns - ------- - str - A CF-netCDF variable name as a string. - - """ - if cube.var_name is not None: - cf_name = cube.var_name - else: - # Convert to lower case and replace whitespace by underscores. - cf_name = "_".join(cube.name().lower().split()) - - cf_name = self.cf_valid_var_name(cf_name) - return cf_name - - def _get_coord_variable_name(self, cube_or_mesh, coord): - """Return a CF-netCDF variable name for a given coordinate-like element. + def _get_element_variable_name(self, cube_or_mesh, element): + """Return a CF-netCDF variable name for a given coordinate-like element, or cube. Parameters ---------- cube_or_mesh : :class:`iris.cube.Cube` or :class:`iris.mesh.MeshXY` The Cube or Mesh being saved to the netCDF file. - coord : :class:`iris.coords._DimensionalMetadata` + element : :class:`iris.coords._DimensionalMetadata` | :class:``iris.cube.Cube`` An instance of a coordinate (or similar), for which a CF-netCDF variable name is required. @@ -1592,17 +1568,21 @@ def _get_coord_variable_name(self, cube_or_mesh, coord): cube = None mesh = cube_or_mesh - if coord.var_name is not None: - cf_name = coord.var_name + if element.var_name is not None: + cf_name = element.var_name + elif isinstance(element, Cube): + # Make name for a Cube without a var_name. 
+ cf_name = "_".join(element.name().lower().split()) else: - name = coord.standard_name or coord.long_name + # Make name for a Coord-like element without a var_name + name = element.standard_name or element.long_name if not name or set(name).intersection(string.whitespace): # We need to invent a name, based on its associated dimensions. - if cube is not None and cube.coords(coord): - # It is a regular cube coordinate. + if cube is not None and cube.elements(element): + # It is a regular cube elementinate. # Auto-generate a name based on the dims. name = "" - for dim in cube.coord_dims(coord): + for dim in cube.coord_dims(element): name += f"dim{dim}" # Handle scalar coordinate (dims == ()). if not name: @@ -1616,8 +1596,8 @@ def _get_coord_variable_name(self, cube_or_mesh, coord): # At present, a location-coord cannot be nameless, as the # MeshXY code relies on guess_coord_axis. - assert isinstance(coord, Connectivity) - location = coord.cf_role.split("_")[0] + assert isinstance(element, Connectivity) + location = element.cf_role.split("_")[0] location_dim_attr = f"{location}_dimension" name = getattr(mesh, location_dim_attr) @@ -1693,6 +1673,8 @@ def _create_mesh(self, mesh): return cf_mesh_name def _set_cf_var_attributes(self, cf_var, element): + from iris.cube import Cube + # Deal with CF-netCDF units, and add the name+units properties. if isinstance(element, iris.coords.Coord): # Fix "degree" units if needed. @@ -1715,19 +1697,21 @@ def _set_cf_var_attributes(self, cf_var, element): if element.units.calendar: _setncattr(cf_var, "calendar", str(element.units.calendar)) - # Add any other custom coordinate attributes. - for name in sorted(element.attributes): - value = element.attributes[name] + if not isinstance(element, Cube): + # Add any other custom coordinate attributes. + # N.B. 
not Cube, which has specific handling in _create_cf_data_variable + for name in sorted(element.attributes): + value = element.attributes[name] - if name == "STASH": - # Adopting provisional Metadata Conventions for representing MO - # Scientific Data encoded in NetCDF Format. - name = "um_stash_source" - value = str(value) + if name == "STASH": + # Adopting provisional Metadata Conventions for representing MO + # Scientific Data encoded in NetCDF Format. + name = "um_stash_source" + value = str(value) - # Don't clobber existing attributes. - if not hasattr(cf_var, name): - _setncattr(cf_var, name, value) + # Don't clobber existing attributes. + if not hasattr(cf_var, name): + _setncattr(cf_var, name, value) def _create_generic_cf_array_var( self, @@ -1739,6 +1723,7 @@ def _create_generic_cf_array_var( element_dims=None, fill_value=None, compression_kwargs=None, + is_dataless=False, ): """Create theCF-netCDF variable given dimensional_metadata. @@ -1791,7 +1776,7 @@ def _create_generic_cf_array_var( # Work out the var-name to use. # N.B. the only part of this routine that may use a mesh _or_ a cube. - cf_name = self._get_coord_variable_name(cube_or_mesh, element) + cf_name = self._get_element_variable_name(cube_or_mesh, element) while cf_name in self._dataset.variables: cf_name = self._increment_name(cf_name) @@ -1804,10 +1789,13 @@ def _create_generic_cf_array_var( # Get the data values, in a way which works for any element type, as # all are subclasses of _DimensionalMetadata. # (e.g. =points if a coord, =data if an ancillary, etc) - data = element._core_values() + if isinstance(element, Cube): + data = element.core_data() + else: + data = element._core_values() # This compression contract is *not* applicable to a mesh. 
- if cube and cube.shape != data.shape: + if cube is not None and data is not None and cube.shape != data.shape: compression_kwargs = {} if np.issubdtype(data.dtype, np.str_): @@ -1837,11 +1825,13 @@ def _create_generic_cf_array_var( # Convert data from an array of strings into a character array # with an extra string-length dimension. if len(element_dims) == 1: + # Scalar variable (only has string dimension). data_first = data[0] if is_lazy_data(data_first): data_first = dask.compute(data_first) data = list("%- *s" % (string_dimension_depth, data_first)) else: + # NOTE: at present, can't do this lazily?? orig_shape = data.shape new_shape = orig_shape + (string_dimension_depth,) new_data = np.zeros(new_shape, cf_var.dtype) @@ -1850,7 +1840,7 @@ def _create_generic_cf_array_var( new_data[index_slice] = list( "%- *s" % (string_dimension_depth, data[index]) ) - data = new_data + data = new_data else: # A normal (numeric) variable. # ensure a valid datatype for the file format. @@ -1887,7 +1877,8 @@ def _create_generic_cf_array_var( ) # Add the data to the CF-netCDF variable. 
- self._lazy_stream_data(data=data, cf_var=cf_var) + if not is_dataless: + self._lazy_stream_data(data=data, cf_var=cf_var) # Add names + units self._set_cf_var_attributes(cf_var, element) @@ -2238,9 +2229,9 @@ def _create_cf_grid_mapping(self, cube, cf_var_cube): cfvar = self._name_coord_map.name(coord) if not cfvar: # not found - create and store it: - cfvar = self._get_coord_variable_name(cube, coord) + cfvar = self._get_element_variable_name(cube, coord) self._name_coord_map.append( - cfvar, self._get_coord_variable_name(cube, coord) + cfvar, self._get_element_variable_name(cube, coord) ) cfvar_names.append(cfvar) @@ -2383,32 +2374,43 @@ def set_packing_ncattrs(cfvar): if add_offset: _setncattr(cfvar, "add_offset", add_offset) - cf_name = self._get_cube_variable_name(cube) - while cf_name in self._dataset.variables: - cf_name = self._increment_name(cf_name) - + # cf_name = self._get_element_variable_name(cube_or_mesh=None, element=cube) + # while cf_name in self._dataset.variables: + # cf_name = self._increment_name(cf_name) + # + # cf_var = self._dataset.createVariable( + # cf_name, dtype, dimension_names, fill_value=fill_value, **kwargs + # ) # Create the cube CF-netCDF data variable with data payload. - cf_var = self._dataset.createVariable( - cf_name, dtype, dimension_names, fill_value=fill_value, **kwargs + cf_name = self._create_generic_cf_array_var( + cube, + dimension_names, + cube, + element_dims=dimension_names, + fill_value=fill_value, + compression_kwargs=kwargs, + is_dataless=is_dataless, ) + cf_var = self._dataset.variables[cf_name] if not is_dataless: set_packing_ncattrs(cf_var) - self._lazy_stream_data(data=data, cf_var=cf_var) - - if cube.standard_name: - _setncattr(cf_var, "standard_name", cube.standard_name) - - if cube.long_name: - _setncattr(cf_var, "long_name", cube.long_name) - - if cube.units.is_udunits(): - _setncattr(cf_var, "units", str(cube.units)) - - # Add the CF-netCDF calendar attribute. 
- if cube.units.calendar: - _setncattr(cf_var, "calendar", cube.units.calendar) + # if cube.standard_name: + # _setncattr(cf_var, "standard_name", cube.standard_name) + # + # if cube.long_name: + # _setncattr(cf_var, "long_name", cube.long_name) + # + # if cube.units.is_udunits(): + # _setncattr(cf_var, "units", str(cube.units)) + # + # # Add the CF-netCDF calendar attribute. + # if cube.units.calendar: + # _setncattr(cf_var, "calendar", cube.units.calendar) + + # Set attributes: NB this part is cube-specific (not the same for components) + # - therefore 'set_cf_var_attributes' doesn't set attributes if element is a Cube if iris.FUTURE.save_split_attrs: attr_names = cube.attributes.locals.keys() else: diff --git a/lib/iris/tests/integration/netcdf/test_chararrays.py b/lib/iris/tests/integration/netcdf/test_chararrays.py index feb93047dd..a3ce9f9128 100644 --- a/lib/iris/tests/integration/netcdf/test_chararrays.py +++ b/lib/iris/tests/integration/netcdf/test_chararrays.py @@ -101,6 +101,7 @@ def show_result(filepath): @pytest.mark.parametrize("encoding", tsts) def test_encodings(encoding): + # small change print(f"\n=========\nTesting encoding: {encoding}") filepath = f"tmp_{str(encoding)}.nc" do_as = encoding From d75a7a79831977de341a17f8a2a11d4ee276c902 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Tue, 28 Oct 2025 21:11:15 +0000 Subject: [PATCH 03/43] Reinstate decode on load, now in-Iris coded. 
--- .../fileformats/_nc_load_rules/helpers.py | 10 ++- lib/iris/fileformats/cf.py | 18 +++++- .../fileformats/netcdf/_thread_safe_nc.py | 45 +++++++++++-- lib/iris/fileformats/netcdf/loader.py | 38 ++++++++++- lib/iris/fileformats/netcdf/saver.py | 4 +- .../integration/netcdf/test_chararrays.py | 64 ++++++++++++++++++- lib/iris/util.py | 21 ++++++ 7 files changed, 184 insertions(+), 16 deletions(-) diff --git a/lib/iris/fileformats/_nc_load_rules/helpers.py b/lib/iris/fileformats/_nc_load_rules/helpers.py index 50e282db5f..fa63002f09 100644 --- a/lib/iris/fileformats/_nc_load_rules/helpers.py +++ b/lib/iris/fileformats/_nc_load_rules/helpers.py @@ -1643,9 +1643,13 @@ def _add_auxiliary_coordinate( # Determine the name of the dimension/s shared between the CF-netCDF data variable # and the coordinate being built. - common_dims = [ - dim for dim in cf_coord_var.dimensions if dim in engine.cf_var.dimensions - ] + coord_dims = cf_coord_var.dimensions + if cf._is_str_dtype(cf_coord_var): + coord_dims = coord_dims[:-1] + datavar_dims = engine.cf_var.dimensions + if cf._is_str_dtype(engine.cf_var): + datavar_dims = datavar_dims[:-1] + common_dims = [dim for dim in coord_dims if dim in datavar_dims] data_dims = None if common_dims: # Calculate the offset of each common dimension. diff --git a/lib/iris/fileformats/cf.py b/lib/iris/fileformats/cf.py index b65ab70792..5abc525109 100644 --- a/lib/iris/fileformats/cf.py +++ b/lib/iris/fileformats/cf.py @@ -790,15 +790,27 @@ def cf_label_data(self, cf_data_var): # Determine the name of the label string (or length) dimension by # finding the dimension name that doesn't exist within the data dimensions. - str_dim_name = list(set(self.dimensions) - set(cf_data_var.dimensions)) + str_dim_names = list(set(self.dimensions) - set(cf_data_var.dimensions)) + n_nondata_dims = len(str_dim_names) + + if n_nondata_dims == 0: + # *All* dims are shared with the data-variable. + # This is only ok if the data-var is *also* a string type. 
+ dim_ok = _is_str_dtype(cf_data_var) + # In this case, we must just *assume* that the last dimension is "the" + # string dimension + str_dim_name = self.dimensions[-1] + else: + # If there is exactly one non-data dim, that is the one we want + dim_ok = len(str_dim_names) == 1 + (str_dim_name,) = str_dim_names - if len(str_dim_name) != 1: + if not dim_ok: raise ValueError( "Invalid string dimensions for CF-netCDF label variable %r" % self.cf_name ) - str_dim_name = str_dim_name[0] label_data = self[:] if ma.isMaskedArray(label_data): diff --git a/lib/iris/fileformats/netcdf/_thread_safe_nc.py b/lib/iris/fileformats/netcdf/_thread_safe_nc.py index 33183ef0fa..4b3dc10620 100644 --- a/lib/iris/fileformats/netcdf/_thread_safe_nc.py +++ b/lib/iris/fileformats/netcdf/_thread_safe_nc.py @@ -311,14 +311,39 @@ def fromcdl(cls, *args, **kwargs): class NetCDFDataProxy: """A reference to the data payload of a single NetCDF file variable.""" - __slots__ = ("shape", "dtype", "path", "variable_name", "fill_value") - - def __init__(self, shape, dtype, path, variable_name, fill_value): + __slots__ = ( + "shape", + "dtype", + "path", + "variable_name", + "fill_value", + "is_bytes", + "encoding", + "string_length", + ) + + def __init__( + self, + shape, + dtype, + path, + variable_name, + fill_value, + encoding: str | None = None, + string_length: int = 0, + ): self.shape = shape self.dtype = dtype self.path = path self.variable_name = variable_name self.fill_value = fill_value + self.is_bytes = dtype.kind == "S" and dtype.itemsize == 1 + if self.is_bytes: + # We will be returning a different shape : the last dim is the byte-length + self.shape = self.shape[:-1] + self.dtype = np.dtype(f"U{string_length}") + self.encoding = encoding + self.string_length = string_length @property def ndim(self): @@ -338,10 +363,20 @@ def __getitem__(self, keys): try: variable = dataset.variables[self.variable_name] # Get the NetCDF variable data and slice. 
- var = variable[keys] + data = variable[keys] + + # If bytes, decode to strings + if self.is_bytes: + from iris.util import convert_bytesarray_to_strings + + data = convert_bytesarray_to_strings( + data, + encoding=self.encoding, + string_length=self.string_length, + ) finally: dataset.close() - return np.asanyarray(var) + return np.asanyarray(data) def __repr__(self): fmt = ( diff --git a/lib/iris/fileformats/netcdf/loader.py b/lib/iris/fileformats/netcdf/loader.py index 219f681e67..d27c3b64b8 100644 --- a/lib/iris/fileformats/netcdf/loader.py +++ b/lib/iris/fileformats/netcdf/loader.py @@ -11,6 +11,7 @@ """ +import codecs from collections.abc import Iterable, Iterator, Mapping from contextlib import contextmanager from copy import deepcopy @@ -269,10 +270,36 @@ def _get_cf_var_data(cf_var): # Normal NCVariable type: total_bytes = cf_var.size * cf_var.dtype.itemsize + default_encoding = "utf-8" + encoding = getattr(cf_var, "_Encoding", None) + if encoding is None: + # utf-8 is a reasonable "safe" default, equivalent to 'ascii' for ascii data + encoding = default_encoding + else: + try: + # Accept + normalise naming of encodings + encoding = codecs.lookup(encoding).name + # NOTE: if encoding does not suit data, errors can occur. + # For example, _Encoding = "ascii", with non-ascii content. + except LookupError: + # Replace some invalid setting with "safe"(ish) fallback. + encoding = default_encoding + + string_length = getattr(cf_var, "iris_string_length", None) + if total_bytes < _LAZYVAR_MIN_BYTES: # Don't make a lazy array, as it will cost more memory AND more time to access. result = cf_var[:] + if result.dtype.kind == "S": + from iris.util import convert_bytesarray_to_strings + + result = convert_bytesarray_to_strings( + result, + encoding=encoding, + string_length=string_length, + ) + # Special handling of masked scalar value; this will be returned as # an `np.ma.masked` instance which will lose the original dtype. 
# Workaround for this it return a 1-element masked array of the @@ -295,8 +322,17 @@ def _get_cf_var_data(cf_var): "_FillValue", _thread_safe_nc.default_fillvals[fill_dtype], ) + + # NOTE: if the data is bytes which need to be converted to strings on read, + # the data-proxy will do that (and it modifies its shape + dtype). proxy = NetCDFDataProxy( - cf_var.shape, dtype, cf_var.filename, cf_var.cf_name, fill_value + cf_var.shape, + dtype, + cf_var.filename, + cf_var.cf_name, + fill_value, + encoding=encoding, + string_length=string_length, ) # Get the chunking specified for the variable : this is either a shape, or # maybe the string "contiguous". diff --git a/lib/iris/fileformats/netcdf/saver.py b/lib/iris/fileformats/netcdf/saver.py index bd4e87471f..d885387a7f 100644 --- a/lib/iris/fileformats/netcdf/saver.py +++ b/lib/iris/fileformats/netcdf/saver.py @@ -1578,8 +1578,8 @@ def _get_element_variable_name(self, cube_or_mesh, element): name = element.standard_name or element.long_name if not name or set(name).intersection(string.whitespace): # We need to invent a name, based on its associated dimensions. - if cube is not None and cube.elements(element): - # It is a regular cube elementinate. + if cube is not None and cube.coords(element): + # It is a regular cube coordinate. # Auto-generate a name based on the dims. 
name = "" for dim in cube.coord_dims(element): diff --git a/lib/iris/tests/integration/netcdf/test_chararrays.py b/lib/iris/tests/integration/netcdf/test_chararrays.py index a3ce9f9128..8f29fcdcd5 100644 --- a/lib/iris/tests/integration/netcdf/test_chararrays.py +++ b/lib/iris/tests/integration/netcdf/test_chararrays.py @@ -4,10 +4,18 @@ import iris +iris.FUTURE.save_split_attrs = True + + NX, N_STRLEN = 3, 64 TEST_STRINGS = ["Münster", "London", "Amsterdam"] TEST_COORD_VALS = ["bun", "éclair", "sandwich"] +# VARS_COORDS_SHARE_STRING_DIM = True +VARS_COORDS_SHARE_STRING_DIM = False +if VARS_COORDS_SHARE_STRING_DIM: + TEST_COORD_VALS[-1] = "Xsandwich" # makes the max coord strlen same as data one + def convert_chararray(string_array_1d, maxlen, encoding="utf-8"): bbytes = [text.encode(encoding) for text in string_array_1d] @@ -17,9 +25,33 @@ def convert_chararray(string_array_1d, maxlen, encoding="utf-8"): return chararray +def convert_bytesarray_to_strings( + byte_array, encoding="utf-8", string_length: int | None = None +): + """Convert bytes to strings. + + N.B. for now at least, we assume the string dim is **always the last one**. 
+ """ + bytes_shape = byte_array.shape + var_shape = bytes_shape[:-1] + if string_length is None: + string_length = bytes_shape[-1] + string_dtype = f"U{string_length}" + result = np.empty(var_shape, dtype=string_dtype) + for ndindex in np.ndindex(var_shape): + element_bytes = byte_array[ndindex] + bytes = b"".join([b if b else b"\0" for b in element_bytes]) + string = bytes.decode(encoding) + result[ndindex] = string + return result + + INCLUDE_COORD = True # INCLUDE_COORD = False +INCLUDE_NUMERIC_AUXCOORD = True +# INCLUDE_NUMERIC_AUXCOORD = False + def make_testfile(filepath, chararray, coordarray, encoding_str=None): with nc.Dataset(filepath, "w") as ds: @@ -40,6 +72,13 @@ def make_testfile(filepath, chararray, coordarray, encoding_str=None): v_co[:] = coordarray if encoding_str is not None: v_co._Encoding = encoding_str + if INCLUDE_NUMERIC_AUXCOORD: + v_num = ds.createVariable( + "v_num", + float, + dimensions=("x",), + ) + v_num[:] = np.arange(NX) v = ds.createVariable( "v", "S1", @@ -52,7 +91,10 @@ def make_testfile(filepath, chararray, coordarray, encoding_str=None): if encoding_str is not None: v._Encoding = encoding_str if INCLUDE_COORD: - v.coordinates = "v_co" + coords_str = "v_co" + if INCLUDE_NUMERIC_AUXCOORD: + coords_str += " v_num" + v.coordinates = coords_str def show_result(filepath): @@ -82,8 +124,10 @@ def show_result(filepath): ) print("-data-") print(repr(cube.data)) + print("-numeric auxcoord data-") + print(repr(cube.coord("x").points)) if INCLUDE_COORD: - print("-coord data-") + print("-string auxcoord data-") try: print(repr(cube.coord("v_co").points)) except Exception as err2: @@ -111,3 +155,19 @@ def test_encodings(encoding): TEST_COORDARRAY = convert_chararray(TEST_COORD_VALS, N_STRLEN, encoding=do_as) make_testfile(filepath, TEST_CHARARRAY, TEST_COORDARRAY, encoding_str=encoding) show_result(filepath) + + +# @pytest.mark.parametrize("ndim", [1, 2]) +# def test_convert_bytes_to_strings(ndim: int): +# if ndim == 1: +# source = 
convert_strings_to_chararray(TEST_STRINGS, 16) +# elif ndim == 2: +# source = np.stack([ +# convert_strings_to_chararray(TEST_STRINGS, 16), +# convert_strings_to_chararray(TEST_COORD_VALS, 16), +# ]) +# else: +# raise ValueError(f"Unexpected param ndim={ndim}.") +# # convert the strings to bytes +# result = convert_bytesarray_to_strings(source) +# print(result) diff --git a/lib/iris/util.py b/lib/iris/util.py index 2c413d2822..193a95c8ce 100644 --- a/lib/iris/util.py +++ b/lib/iris/util.py @@ -3183,3 +3183,24 @@ def set( # Global CML settings object for use as context manager CML_SETTINGS: CMLSettings = CMLSettings() + + +def convert_bytesarray_to_strings( + byte_array, encoding="utf-8", string_length: int | None = None +): + """Convert bytes to strings. + + N.B. for now at least, we assume the string dim is **always the last one**. + """ + bytes_shape = byte_array.shape + var_shape = bytes_shape[:-1] + if string_length is None: + string_length = bytes_shape[-1] + string_dtype = f"U{string_length}" + result = np.empty(var_shape, dtype=string_dtype) + for ndindex in np.ndindex(var_shape): + element_bytes = byte_array[ndindex] + bytes = b"".join([b if b else b"\0" for b in element_bytes]) + string = bytes.decode(encoding) + result[ndindex] = string + return result From 07efc0634bc699b7d8777d6b343b15e199c02c29 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Sun, 7 Dec 2025 00:34:53 +0000 Subject: [PATCH 04/43] Revert and amend. 
--- .../fileformats/netcdf/_thread_safe_nc.py | 45 +++---------------- lib/iris/fileformats/netcdf/loader.py | 38 +--------------- lib/iris/fileformats/netcdf/saver.py | 4 +- lib/iris/util.py | 21 --------- 4 files changed, 8 insertions(+), 100 deletions(-) diff --git a/lib/iris/fileformats/netcdf/_thread_safe_nc.py b/lib/iris/fileformats/netcdf/_thread_safe_nc.py index 4b3dc10620..33183ef0fa 100644 --- a/lib/iris/fileformats/netcdf/_thread_safe_nc.py +++ b/lib/iris/fileformats/netcdf/_thread_safe_nc.py @@ -311,39 +311,14 @@ def fromcdl(cls, *args, **kwargs): class NetCDFDataProxy: """A reference to the data payload of a single NetCDF file variable.""" - __slots__ = ( - "shape", - "dtype", - "path", - "variable_name", - "fill_value", - "is_bytes", - "encoding", - "string_length", - ) - - def __init__( - self, - shape, - dtype, - path, - variable_name, - fill_value, - encoding: str | None = None, - string_length: int = 0, - ): + __slots__ = ("shape", "dtype", "path", "variable_name", "fill_value") + + def __init__(self, shape, dtype, path, variable_name, fill_value): self.shape = shape self.dtype = dtype self.path = path self.variable_name = variable_name self.fill_value = fill_value - self.is_bytes = dtype.kind == "S" and dtype.itemsize == 1 - if self.is_bytes: - # We will be returning a different shape : the last dim is the byte-length - self.shape = self.shape[:-1] - self.dtype = np.dtype(f"U{string_length}") - self.encoding = encoding - self.string_length = string_length @property def ndim(self): @@ -363,20 +338,10 @@ def __getitem__(self, keys): try: variable = dataset.variables[self.variable_name] # Get the NetCDF variable data and slice. 
- data = variable[keys] - - # If bytes, decode to strings - if self.is_bytes: - from iris.util import convert_bytesarray_to_strings - - data = convert_bytesarray_to_strings( - data, - encoding=self.encoding, - string_length=self.string_length, - ) + var = variable[keys] finally: dataset.close() - return np.asanyarray(data) + return np.asanyarray(var) def __repr__(self): fmt = ( diff --git a/lib/iris/fileformats/netcdf/loader.py b/lib/iris/fileformats/netcdf/loader.py index d27c3b64b8..219f681e67 100644 --- a/lib/iris/fileformats/netcdf/loader.py +++ b/lib/iris/fileformats/netcdf/loader.py @@ -11,7 +11,6 @@ """ -import codecs from collections.abc import Iterable, Iterator, Mapping from contextlib import contextmanager from copy import deepcopy @@ -270,36 +269,10 @@ def _get_cf_var_data(cf_var): # Normal NCVariable type: total_bytes = cf_var.size * cf_var.dtype.itemsize - default_encoding = "utf-8" - encoding = getattr(cf_var, "_Encoding", None) - if encoding is None: - # utf-8 is a reasonable "safe" default, equivalent to 'ascii' for ascii data - encoding = default_encoding - else: - try: - # Accept + normalise naming of encodings - encoding = codecs.lookup(encoding).name - # NOTE: if encoding does not suit data, errors can occur. - # For example, _Encoding = "ascii", with non-ascii content. - except LookupError: - # Replace some invalid setting with "safe"(ish) fallback. - encoding = default_encoding - - string_length = getattr(cf_var, "iris_string_length", None) - if total_bytes < _LAZYVAR_MIN_BYTES: # Don't make a lazy array, as it will cost more memory AND more time to access. result = cf_var[:] - if result.dtype.kind == "S": - from iris.util import convert_bytesarray_to_strings - - result = convert_bytesarray_to_strings( - result, - encoding=encoding, - string_length=string_length, - ) - # Special handling of masked scalar value; this will be returned as # an `np.ma.masked` instance which will lose the original dtype. 
# Workaround for this it return a 1-element masked array of the @@ -322,17 +295,8 @@ def _get_cf_var_data(cf_var): "_FillValue", _thread_safe_nc.default_fillvals[fill_dtype], ) - - # NOTE: if the data is bytes which need to be converted to strings on read, - # the data-proxy will do that (and it modifies its shape + dtype). proxy = NetCDFDataProxy( - cf_var.shape, - dtype, - cf_var.filename, - cf_var.cf_name, - fill_value, - encoding=encoding, - string_length=string_length, + cf_var.shape, dtype, cf_var.filename, cf_var.cf_name, fill_value ) # Get the chunking specified for the variable : this is either a shape, or # maybe the string "contiguous". diff --git a/lib/iris/fileformats/netcdf/saver.py b/lib/iris/fileformats/netcdf/saver.py index d885387a7f..bd4e87471f 100644 --- a/lib/iris/fileformats/netcdf/saver.py +++ b/lib/iris/fileformats/netcdf/saver.py @@ -1578,8 +1578,8 @@ def _get_element_variable_name(self, cube_or_mesh, element): name = element.standard_name or element.long_name if not name or set(name).intersection(string.whitespace): # We need to invent a name, based on its associated dimensions. - if cube is not None and cube.coords(element): - # It is a regular cube coordinate. + if cube is not None and cube.elements(element): + # It is a regular cube elementinate. # Auto-generate a name based on the dims. name = "" for dim in cube.coord_dims(element): diff --git a/lib/iris/util.py b/lib/iris/util.py index 193a95c8ce..2c413d2822 100644 --- a/lib/iris/util.py +++ b/lib/iris/util.py @@ -3183,24 +3183,3 @@ def set( # Global CML settings object for use as context manager CML_SETTINGS: CMLSettings = CMLSettings() - - -def convert_bytesarray_to_strings( - byte_array, encoding="utf-8", string_length: int | None = None -): - """Convert bytes to strings. - - N.B. for now at least, we assume the string dim is **always the last one**. 
- """ - bytes_shape = byte_array.shape - var_shape = bytes_shape[:-1] - if string_length is None: - string_length = bytes_shape[-1] - string_dtype = f"U{string_length}" - result = np.empty(var_shape, dtype=string_dtype) - for ndindex in np.ndindex(var_shape): - element_bytes = byte_array[ndindex] - bytes = b"".join([b if b else b"\0" for b in element_bytes]) - string = bytes.decode(encoding) - result[ndindex] = string - return result From 232107775c424b30233ad336d8fbfd81913a57c2 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Wed, 29 Oct 2025 12:23:07 +0000 Subject: [PATCH 05/43] Hack to preserve the existing order of attributes on saved Coords and Cubes. --- lib/iris/fileformats/netcdf/saver.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/lib/iris/fileformats/netcdf/saver.py b/lib/iris/fileformats/netcdf/saver.py index bd4e87471f..8fb0fec377 100644 --- a/lib/iris/fileformats/netcdf/saver.py +++ b/lib/iris/fileformats/netcdf/saver.py @@ -1682,12 +1682,26 @@ def _set_cf_var_attributes(self, cf_var, element): else: units_str = str(element.units) - if cf_units.as_unit(units_str).is_udunits(): - _setncattr(cf_var, "units", units_str) + # NB this bit is a nasty hack to preserve existing behaviour through a refactor: + # The attributes for Coords are created in the order units, standard_name, + # whereas for data-variables (aka Cubes) it is the other way around. + # Needed now that this routine is also called from _create_cf_data_variable. + # TODO: when we can break things, rationalise these to be the same. 
+ def add_units(): + if cf_units.as_unit(units_str).is_udunits(): + _setncattr(cf_var, "units", units_str) + + def add_stdname(): + standard_name = element.standard_name + if standard_name is not None: + _setncattr(cf_var, "standard_name", standard_name) - standard_name = element.standard_name - if standard_name is not None: - _setncattr(cf_var, "standard_name", standard_name) + if isinstance(element, Cube): + add_stdname() + add_units() + else: + add_units() + add_stdname() long_name = element.long_name if long_name is not None: From 0174e53a443167ab0ec47ae826b140d2abd57116 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Wed, 29 Oct 2025 14:54:33 +0000 Subject: [PATCH 06/43] Fix for dataless; avoid FUTURE global state change from temporary tests. --- lib/iris/fileformats/netcdf/saver.py | 30 ++++---- .../integration/netcdf/test_chararrays.py | 72 ++++++++++++++++--- 2 files changed, 75 insertions(+), 27 deletions(-) diff --git a/lib/iris/fileformats/netcdf/saver.py b/lib/iris/fileformats/netcdf/saver.py index 8fb0fec377..c2522d8867 100644 --- a/lib/iris/fileformats/netcdf/saver.py +++ b/lib/iris/fileformats/netcdf/saver.py @@ -1812,7 +1812,7 @@ def _create_generic_cf_array_var( if cube is not None and data is not None and cube.shape != data.shape: compression_kwargs = {} - if np.issubdtype(data.dtype, np.str_): + if not is_dataless and np.issubdtype(data.dtype, np.str_): # Deal with string-type variables. # Typically CF label variables, but also possibly ancil-vars ? string_dimension_depth = data.dtype.itemsize @@ -1858,8 +1858,13 @@ def _create_generic_cf_array_var( else: # A normal (numeric) variable. # ensure a valid datatype for the file format. 
- element_type = type(element).__name__ - data = self._ensure_valid_dtype(data, element_type, element) + if is_dataless: + dtype = self._DATALESS_DTYPE + fill_value = self._DATALESS_FILLVALUE + else: + element_type = type(element).__name__ + data = self._ensure_valid_dtype(data, element_type, element) + dtype = data.dtype.newbyteorder("=") # Check if this is a dim-coord. is_dimcoord = cube is not None and element in cube.dim_coords @@ -1873,7 +1878,7 @@ def _create_generic_cf_array_var( # Create the CF-netCDF variable. cf_var = self._dataset.createVariable( cf_name, - data.dtype.newbyteorder("="), + dtype, element_dims, fill_value=fill_value, **compression_kwargs, @@ -2325,19 +2330,12 @@ def _create_cf_data_variable( # be removed. # Get the values in a form which is valid for the file format. is_dataless = cube.is_dataless() - if is_dataless: - data = None - else: - data = self._ensure_valid_dtype(cube.core_data(), "cube", cube) - if is_dataless: - # The variable must have *some* dtype, and it must be maskable - dtype = self._DATALESS_DTYPE - fill_value = self._DATALESS_FILLVALUE - elif not packing: - dtype = data.dtype.newbyteorder("=") - else: - if isinstance(packing, dict): + if not is_dataless: + data = self._ensure_valid_dtype(cube.core_data(), "cube", cube) + if not packing: + dtype = data.dtype.newbyteorder("=") + elif isinstance(packing, dict): if "dtype" not in packing: msg = "The dtype attribute is required for packing." 
raise ValueError(msg) diff --git a/lib/iris/tests/integration/netcdf/test_chararrays.py b/lib/iris/tests/integration/netcdf/test_chararrays.py index 8f29fcdcd5..c8bba94671 100644 --- a/lib/iris/tests/integration/netcdf/test_chararrays.py +++ b/lib/iris/tests/integration/netcdf/test_chararrays.py @@ -3,9 +3,8 @@ import pytest import iris - -iris.FUTURE.save_split_attrs = True - +from iris.coords import AuxCoord, DimCoord +from iris.cube import Cube NX, N_STRLEN = 3, 64 TEST_STRINGS = ["Münster", "London", "Amsterdam"] @@ -17,7 +16,13 @@ TEST_COORD_VALS[-1] = "Xsandwich" # makes the max coord strlen same as data one -def convert_chararray(string_array_1d, maxlen, encoding="utf-8"): +@pytest.fixture(scope="module", autouse=True) +def enable_split_attrs(): + with iris.FUTURE.context(save_split_attrs=True): + yield + + +def convert_strings_to_chararray(string_array_1d, maxlen, encoding="utf-8"): bbytes = [text.encode(encoding) for text in string_array_1d] pad = b"\0" * maxlen bbytes = [(x + pad)[:maxlen] for x in bbytes] @@ -97,6 +102,23 @@ def make_testfile(filepath, chararray, coordarray, encoding_str=None): v.coordinates = coords_str +def make_testcube( + dataarray, + coordarray, # for now, these are always *string* arrays + encoding_str: str | None = None, +): + cube = Cube(dataarray, var_name="v") + cube.add_dim_coord(DimCoord(np.arange(NX), var_name="x"), 0) + if encoding_str is not None: + cube.attributes["_Encoding"] = encoding_str + if INCLUDE_COORD: + co_x = AuxCoord(coordarray, var_name="v_co") + if encoding_str is not None: + co_x.attributes["_Encoding"] = encoding_str + cube.add_aux_coord(co_x, 0) + return cube + + def show_result(filepath): from pp_utils import ncdump @@ -115,12 +137,13 @@ def show_result(filepath): # print(repr(v[:])) print("\nAs iris cube..") try: + iris.loading.LOAD_PROBLEMS.reset() cube = iris.load_cube(filepath) print(cube) - if iris.loading.LOAD_PROBLEMS._problems: + if iris.loading.LOAD_PROBLEMS.problems: 
print(iris.loading.LOAD_PROBLEMS) print( - "\n".join(iris.loading.LOAD_PROBLEMS._problems[0].stack_trace.format()) + "\n".join(iris.loading.LOAD_PROBLEMS.problems[0].stack_trace.format()) ) print("-data-") print(repr(cube.data)) @@ -136,27 +159,54 @@ def show_result(filepath): print(repr(err)) -# tsts = (None, "ascii", "utf-8", "utf-32",) +tsts = ( + None, + "ascii", + "utf-8", + "utf-32", +) # tsts = ("utf-8",) # tsts = ("utf-8", "utf-32",) # tsts = ("utf-32",) -tsts = ("utf-8", "ascii", "utf-8") +# tsts = ("utf-8", "ascii", "utf-8") @pytest.mark.parametrize("encoding", tsts) -def test_encodings(encoding): +def test_load_encodings(encoding): # small change print(f"\n=========\nTesting encoding: {encoding}") filepath = f"tmp_{str(encoding)}.nc" do_as = encoding if encoding != "utf-32": do_as = "utf-8" - TEST_CHARARRAY = convert_chararray(TEST_STRINGS, N_STRLEN, encoding=do_as) - TEST_COORDARRAY = convert_chararray(TEST_COORD_VALS, N_STRLEN, encoding=do_as) + TEST_CHARARRAY = convert_strings_to_chararray( + TEST_STRINGS, N_STRLEN, encoding=do_as + ) + TEST_COORDARRAY = convert_strings_to_chararray( + TEST_COORD_VALS, N_STRLEN, encoding=do_as + ) make_testfile(filepath, TEST_CHARARRAY, TEST_COORDARRAY, encoding_str=encoding) show_result(filepath) +@pytest.mark.parametrize("encoding", tsts) +def test_save_encodings(encoding): + cube = make_testcube( + dataarray=TEST_STRINGS, coordarray=TEST_COORD_VALS, encoding_str=encoding + ) + print(cube) + filepath = f"tmp_save_{str(encoding)}.nc" + if encoding == "ascii": + with pytest.raises( + UnicodeEncodeError, + match="'ascii' codec can't encode character.*not in range", + ): + iris.save(cube, filepath) + else: + iris.save(cube, filepath) + show_result(filepath) + + # @pytest.mark.parametrize("ndim", [1, 2]) # def test_convert_bytes_to_strings(ndim: int): # if ndim == 1: From 035e28b9785c99dc3ae0df7fda171a37cbc62121 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Wed, 29 Oct 2025 15:21:31 +0000 Subject: [PATCH 07/43] 
Further fix to attribute ordering. --- lib/iris/fileformats/netcdf/saver.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/lib/iris/fileformats/netcdf/saver.py b/lib/iris/fileformats/netcdf/saver.py index c2522d8867..f80cf154c3 100644 --- a/lib/iris/fileformats/netcdf/saver.py +++ b/lib/iris/fileformats/netcdf/saver.py @@ -1687,25 +1687,25 @@ def _set_cf_var_attributes(self, cf_var, element): # whereas for data-variables (aka Cubes) it is the other way around. # Needed now that this routine is also called from _create_cf_data_variable. # TODO: when we can break things, rationalise these to be the same. - def add_units(): + def add_units_attr(): if cf_units.as_unit(units_str).is_udunits(): _setncattr(cf_var, "units", units_str) - def add_stdname(): + def add_names_attrs(): standard_name = element.standard_name if standard_name is not None: _setncattr(cf_var, "standard_name", standard_name) + long_name = element.long_name + if long_name is not None: + _setncattr(cf_var, "long_name", long_name) + if isinstance(element, Cube): - add_stdname() - add_units() + add_names_attrs() + add_units_attr() else: - add_units() - add_stdname() - - long_name = element.long_name - if long_name is not None: - _setncattr(cf_var, "long_name", long_name) + add_units_attr() + add_names_attrs() # Add the CF-netCDF calendar attribute. if element.units.calendar: From 80c4776b0c5f6cbfd8163d757e16e28124a9b199 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Wed, 29 Oct 2025 18:08:35 +0000 Subject: [PATCH 08/43] Fixes for data packing. 
--- lib/iris/fileformats/netcdf/saver.py | 64 ++++++++++------------------ 1 file changed, 22 insertions(+), 42 deletions(-) diff --git a/lib/iris/fileformats/netcdf/saver.py b/lib/iris/fileformats/netcdf/saver.py index f80cf154c3..8d66557cab 100644 --- a/lib/iris/fileformats/netcdf/saver.py +++ b/lib/iris/fileformats/netcdf/saver.py @@ -1737,6 +1737,7 @@ def _create_generic_cf_array_var( element_dims=None, fill_value=None, compression_kwargs=None, + packing_controls: dict | None = None, is_dataless=False, ): """Create theCF-netCDF variable given dimensional_metadata. @@ -1864,7 +1865,10 @@ def _create_generic_cf_array_var( else: element_type = type(element).__name__ data = self._ensure_valid_dtype(data, element_type, element) - dtype = data.dtype.newbyteorder("=") + if not packing_controls: + dtype = data.dtype.newbyteorder("=") + else: + dtype = packing_controls["dtype"] # Check if this is a dim-coord. is_dimcoord = cube is not None and element in cube.dim_coords @@ -1897,6 +1901,10 @@ def _create_generic_cf_array_var( # Add the data to the CF-netCDF variable. if not is_dataless: + if packing_controls: + # We must set packing attributes (if any), before assigning values. + for key, value in packing_controls["attributes"]: + _setncattr(cf_var, key, value) self._lazy_stream_data(data=data, cf_var=cf_var) # Add names + units @@ -2331,11 +2339,10 @@ def _create_cf_data_variable( # Get the values in a form which is valid for the file format. is_dataless = cube.is_dataless() - if not is_dataless: + packing_controls = None + if packing and not is_dataless: data = self._ensure_valid_dtype(cube.core_data(), "cube", cube) - if not packing: - dtype = data.dtype.newbyteorder("=") - elif isinstance(packing, dict): + if isinstance(packing, dict): if "dtype" not in packing: msg = "The dtype attribute is required for packing." 
raise ValueError(msg) @@ -2373,26 +2380,14 @@ def _create_cf_data_variable( else: add_offset = cmin + 2 ** (n - 1) * scale_factor - def set_packing_ncattrs(cfvar): - """Set netCDF packing attributes. - - NOTE: cfvar needs to be a _thread_safe_nc._ThreadSafeWrapper subclass. + packing_controls = { + "dtype": dtype, + "attributes": [ + ("scale_factor", scale_factor), + ("add_offset", add_offset), + ], + } - """ - assert hasattr(cfvar, "THREAD_SAFE_FLAG") - if packing: - if scale_factor: - _setncattr(cfvar, "scale_factor", scale_factor) - if add_offset: - _setncattr(cfvar, "add_offset", add_offset) - - # cf_name = self._get_element_variable_name(cube_or_mesh=None, element=cube) - # while cf_name in self._dataset.variables: - # cf_name = self._increment_name(cf_name) - # - # cf_var = self._dataset.createVariable( - # cf_name, dtype, dimension_names, fill_value=fill_value, **kwargs - # ) # Create the cube CF-netCDF data variable with data payload. cf_name = self._create_generic_cf_array_var( cube, @@ -2401,28 +2396,13 @@ def set_packing_ncattrs(cfvar): element_dims=dimension_names, fill_value=fill_value, compression_kwargs=kwargs, + packing_controls=packing_controls, is_dataless=is_dataless, ) cf_var = self._dataset.variables[cf_name] - if not is_dataless: - set_packing_ncattrs(cf_var) - - # if cube.standard_name: - # _setncattr(cf_var, "standard_name", cube.standard_name) - # - # if cube.long_name: - # _setncattr(cf_var, "long_name", cube.long_name) - # - # if cube.units.is_udunits(): - # _setncattr(cf_var, "units", str(cube.units)) - # - # # Add the CF-netCDF calendar attribute. 
- # if cube.units.calendar: - # _setncattr(cf_var, "calendar", cube.units.calendar) - - # Set attributes: NB this part is cube-specific (not the same for components) - # - therefore 'set_cf_var_attributes' doesn't set attributes if element is a Cube + # Set general attrs: NB this part is cube-specific (not the same for components) + # - so 'set_cf_var_attributes' *doesn't* set these, if element is a Cube if iris.FUTURE.save_split_attrs: attr_names = cube.attributes.locals.keys() else: From d4d3ebd2ac7e6414a3f2c57912138f4c02cf1ed9 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Sun, 7 Dec 2025 00:42:34 +0000 Subject: [PATCH 09/43] Latest test-chararrays. --- .../integration/netcdf/test_chararrays.py | 61 +++++++++++-------- 1 file changed, 36 insertions(+), 25 deletions(-) diff --git a/lib/iris/tests/integration/netcdf/test_chararrays.py b/lib/iris/tests/integration/netcdf/test_chararrays.py index c8bba94671..0eb211c8b0 100644 --- a/lib/iris/tests/integration/netcdf/test_chararrays.py +++ b/lib/iris/tests/integration/netcdf/test_chararrays.py @@ -1,10 +1,19 @@ -import netCDF4 as nc +# Copyright Iris contributors +# +# This file is part of Iris and is released under the BSD license. +# See LICENSE in the root of the repository for full licensing details. +"""Integration tests for string data handling.""" + +import subprocess + import numpy as np import pytest import iris from iris.coords import AuxCoord, DimCoord from iris.cube import Cube +from iris.fileformats.netcdf import _thread_safe_nc +from iris.tests import env_bin_path NX, N_STRLEN = 3, 64 TEST_STRINGS = ["Münster", "London", "Amsterdam"] @@ -16,6 +25,7 @@ TEST_COORD_VALS[-1] = "Xsandwich" # makes the max coord strlen same as data one +# Ensure all tests run with "split attrs" turned on. 
@pytest.fixture(scope="module", autouse=True) def enable_split_attrs(): with iris.FUTURE.context(save_split_attrs=True): @@ -59,7 +69,8 @@ def convert_bytesarray_to_strings( def make_testfile(filepath, chararray, coordarray, encoding_str=None): - with nc.Dataset(filepath, "w") as ds: + ds = _thread_safe_nc.DatasetWrapper(filepath, "w") + try: ds.createDimension("x", NX) ds.createDimension("nstr", N_STRLEN) vx = ds.createVariable("x", int, dimensions=("x")) @@ -100,6 +111,8 @@ def make_testfile(filepath, chararray, coordarray, encoding_str=None): if INCLUDE_NUMERIC_AUXCOORD: coords_str += " v_num" v.coordinates = coords_str + finally: + ds.close() def make_testcube( @@ -119,12 +132,19 @@ def make_testcube( return cube -def show_result(filepath): - from pp_utils import ncdump +NCDUMP_PATHSTR = str(env_bin_path("ncdump")) + + +def ncdump(nc_path: str, *args): + """Call ncdump to print a dump of a file.""" + call_args = [NCDUMP_PATHSTR, nc_path] + list(*args) + subprocess.run(call_args, check=True) + +def show_result(filepath): print(f"File {filepath}") print("NCDUMP:") - ncdump(filepath, "") + ncdump(filepath) # with nc.Dataset(filepath, "r") as ds: # v = ds.variables["v"] # print("\n----\nNetcdf data readback (basic)") @@ -159,6 +179,13 @@ def show_result(filepath): print(repr(err)) +@pytest.fixture(scope="session") +def save_dir(tmp_path_factory): + return tmp_path_factory.mktemp("save_files") + + +# TODO: the tests don't test things properly yet, they just exercise the code and print +# things for manual debugging. 
tsts = ( None, "ascii", @@ -172,10 +199,10 @@ def show_result(filepath): @pytest.mark.parametrize("encoding", tsts) -def test_load_encodings(encoding): +def test_load_encodings(encoding, save_dir): # small change print(f"\n=========\nTesting encoding: {encoding}") - filepath = f"tmp_{str(encoding)}.nc" + filepath = save_dir / f"tmp_load_{str(encoding)}.nc" do_as = encoding if encoding != "utf-32": do_as = "utf-8" @@ -190,12 +217,12 @@ def test_load_encodings(encoding): @pytest.mark.parametrize("encoding", tsts) -def test_save_encodings(encoding): +def test_save_encodings(encoding, save_dir): cube = make_testcube( dataarray=TEST_STRINGS, coordarray=TEST_COORD_VALS, encoding_str=encoding ) print(cube) - filepath = f"tmp_save_{str(encoding)}.nc" + filepath = save_dir / f"tmp_save_{str(encoding)}.nc" if encoding == "ascii": with pytest.raises( UnicodeEncodeError, @@ -205,19 +232,3 @@ def test_save_encodings(encoding): else: iris.save(cube, filepath) show_result(filepath) - - -# @pytest.mark.parametrize("ndim", [1, 2]) -# def test_convert_bytes_to_strings(ndim: int): -# if ndim == 1: -# source = convert_strings_to_chararray(TEST_STRINGS, 16) -# elif ndim == 2: -# source = np.stack([ -# convert_strings_to_chararray(TEST_STRINGS, 16), -# convert_strings_to_chararray(TEST_COORD_VALS, 16), -# ]) -# else: -# raise ValueError(f"Unexpected param ndim={ndim}.") -# # convert the strings to bytes -# result = convert_bytesarray_to_strings(source) -# print(result) From 3f10cc1c49d207422f6ce4c2e64b2b44d9f1513c Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Sun, 7 Dec 2025 10:43:18 +0000 Subject: [PATCH 10/43] Fix search+replace error. 
--- lib/iris/fileformats/netcdf/saver.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/iris/fileformats/netcdf/saver.py b/lib/iris/fileformats/netcdf/saver.py index 8d66557cab..4766054142 100644 --- a/lib/iris/fileformats/netcdf/saver.py +++ b/lib/iris/fileformats/netcdf/saver.py @@ -1578,8 +1578,8 @@ def _get_element_variable_name(self, cube_or_mesh, element): name = element.standard_name or element.long_name if not name or set(name).intersection(string.whitespace): # We need to invent a name, based on its associated dimensions. - if cube is not None and cube.elements(element): - # It is a regular cube elementinate. + if cube is not None and cube.coords(element): + # It is a regular cube coordinate. # Auto-generate a name based on the dims. name = "" for dim in cube.coord_dims(element): From ee2fe4ccac13c0968c0ff017cb8ef5da8498f852 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Wed, 14 Jan 2026 13:18:07 +0000 Subject: [PATCH 11/43] Tiny fix in crucial place! (merge error?). --- lib/iris/fileformats/netcdf/saver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/iris/fileformats/netcdf/saver.py b/lib/iris/fileformats/netcdf/saver.py index 4766054142..3d7c1dee19 100644 --- a/lib/iris/fileformats/netcdf/saver.py +++ b/lib/iris/fileformats/netcdf/saver.py @@ -1855,7 +1855,7 @@ def _create_generic_cf_array_var( new_data[index_slice] = list( "%- *s" % (string_dimension_depth, data[index]) ) - data = new_data + data = new_data else: # A normal (numeric) variable. # ensure a valid datatype for the file format. From 744826da3a41a2e727b05cff5ede3b8186dbc62c Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Wed, 14 Jan 2026 15:33:04 +0000 Subject: [PATCH 12/43] Extra mock property prevents weird test crashes. 
--- .../helpers/test_build_and_add_auxiliary_coordinate.py | 8 +++++--- .../helpers/test_build_and_add_dimension_coordinate.py | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/lib/iris/tests/unit/fileformats/nc_load_rules/helpers/test_build_and_add_auxiliary_coordinate.py b/lib/iris/tests/unit/fileformats/nc_load_rules/helpers/test_build_and_add_auxiliary_coordinate.py index a44986ec98..5ed3413409 100644 --- a/lib/iris/tests/unit/fileformats/nc_load_rules/helpers/test_build_and_add_auxiliary_coordinate.py +++ b/lib/iris/tests/unit/fileformats/nc_load_rules/helpers/test_build_and_add_auxiliary_coordinate.py @@ -44,7 +44,9 @@ def setUp(self): self.engine = mock.Mock( cube=mock.Mock(), - cf_var=mock.Mock(dimensions=("foo", "bar"), cf_data=cf_data), + cf_var=mock.Mock( + dimensions=("foo", "bar"), cf_data=cf_data, dtype=np.int32 + ), filename="DUMMY", cube_parts=dict(coordinates=[]), ) @@ -174,7 +176,7 @@ def setUp(self): self.engine = mock.Mock( cube=mock.Mock(), - cf_var=mock.Mock(dimensions=("foo", "bar")), + cf_var=mock.Mock(dimensions=("foo", "bar"), dtype=np.int32), filename="DUMMY", cube_parts=dict(coordinates=[]), ) @@ -244,7 +246,7 @@ def setUp(self): # Create dummy pyke engine. self.engine = mock.Mock( cube=mock.Mock(), - cf_var=mock.Mock(dimensions=("foo", "bar")), + cf_var=mock.Mock(dimensions=("foo", "bar"), dtype=np.float32), filename="DUMMY", cube_parts=dict(coordinates=[]), ) diff --git a/lib/iris/tests/unit/fileformats/nc_load_rules/helpers/test_build_and_add_dimension_coordinate.py b/lib/iris/tests/unit/fileformats/nc_load_rules/helpers/test_build_and_add_dimension_coordinate.py index a871c967ab..26e25a6d95 100644 --- a/lib/iris/tests/unit/fileformats/nc_load_rules/helpers/test_build_and_add_dimension_coordinate.py +++ b/lib/iris/tests/unit/fileformats/nc_load_rules/helpers/test_build_and_add_dimension_coordinate.py @@ -50,7 +50,7 @@ def setUp(self): # Create dummy pyke engine. 
self.engine = mock.Mock( cube=mock.Mock(), - cf_var=mock.Mock(dimensions=("foo", "bar")), + cf_var=mock.Mock(dimensions=("foo", "bar"), dtype=np.int32), filename="DUMMY", cube_parts=dict(coordinates=[]), ) From a3e1217345f52f5e882b31ebdf65e76f7fe406b9 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Wed, 14 Jan 2026 18:00:24 +0000 Subject: [PATCH 13/43] Fix another mock problem. --- .../fileformats/netcdf/saver/test_Saver.py | 40 +++++++++++-------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/lib/iris/tests/unit/fileformats/netcdf/saver/test_Saver.py b/lib/iris/tests/unit/fileformats/netcdf/saver/test_Saver.py index 0905c3d2a9..0eb12d794c 100644 --- a/lib/iris/tests/unit/fileformats/netcdf/saver/test_Saver.py +++ b/lib/iris/tests/unit/fileformats/netcdf/saver/test_Saver.py @@ -261,9 +261,6 @@ def test_compression(self): ) cube.add_ancillary_variable(anc_coord, data_dims=data_dims) - patch = self.patch( - "iris.fileformats.netcdf.saver._thread_safe_nc.DatasetWrapper.createVariable" - ) compression_kwargs = { "complevel": 9, "fletcher32": True, @@ -273,10 +270,16 @@ def test_compression(self): with self.temp_filename(suffix=".nc") as nc_path: with Saver(nc_path, "NETCDF4", compute=False) as saver: + createvar_spy = self.patch( + "iris.fileformats.netcdf.saver._thread_safe_nc.DatasetWrapper.createVariable", + # Use 'wraps' to allow the patched methods to function as normal + # - the patch object just acts as a 'spy' on its calls. 
+ wraps=saver._dataset.createVariable, + ) saver.write(cube, **compression_kwargs) - self.assertEqual(5, patch.call_count) - result = self._filter_compression_calls(patch, compression_kwargs) + self.assertEqual(5, createvar_spy.call_count) + result = self._filter_compression_calls(createvar_spy, compression_kwargs) self.assertEqual(3, len(result)) self.assertEqual({cube.name(), aux_coord.name(), anc_coord.name()}, set(result)) @@ -294,9 +297,6 @@ def test_non_compression__shape(self): ) cube.add_ancillary_variable(anc_coord, data_dims=data_dims[1]) - patch = self.patch( - "iris.fileformats.netcdf.saver._thread_safe_nc.DatasetWrapper.createVariable" - ) compression_kwargs = { "complevel": 9, "fletcher32": True, @@ -306,11 +306,17 @@ def test_non_compression__shape(self): with self.temp_filename(suffix=".nc") as nc_path: with Saver(nc_path, "NETCDF4", compute=False) as saver: + createvar_spy = self.patch( + "iris.fileformats.netcdf.saver._thread_safe_nc.DatasetWrapper.createVariable", + # Use 'wraps' to allow the patched methods to function as normal + # - the patch object just acts as a 'spy' on its calls. 
+ wraps=saver._dataset.createVariable, + ) saver.write(cube, **compression_kwargs) - self.assertEqual(5, patch.call_count) + self.assertEqual(5, createvar_spy.call_count) result = self._filter_compression_calls( - patch, compression_kwargs, mismatch=True + createvar_spy, compression_kwargs, mismatch=True ) self.assertEqual(4, len(result)) # the aux coord and ancil variable are not compressed due to shape, and @@ -327,10 +333,6 @@ def test_non_compression__dtype(self): aux_coord = AuxCoord(data, var_name="non_compress_aux", units="1") cube.add_aux_coord(aux_coord, data_dims=data_dims) - patch = self.patch( - "iris.fileformats.netcdf.saver._thread_safe_nc.DatasetWrapper.createVariable" - ) - patch.return_value = mock.MagicMock(dtype=np.dtype("S1")) compression_kwargs = { "complevel": 9, "fletcher32": True, @@ -340,11 +342,17 @@ def test_non_compression__dtype(self): with self.temp_filename(suffix=".nc") as nc_path: with Saver(nc_path, "NETCDF4", compute=False) as saver: + createvar_spy = self.patch( + "iris.fileformats.netcdf.saver._thread_safe_nc.DatasetWrapper.createVariable", + # Use 'wraps' to allow the patched methods to function as normal + # - the patch object just acts as a 'spy' on its calls. + wraps=saver._dataset.createVariable, + ) saver.write(cube, **compression_kwargs) - self.assertEqual(4, patch.call_count) + self.assertEqual(4, createvar_spy.call_count) result = self._filter_compression_calls( - patch, compression_kwargs, mismatch=True + createvar_spy, compression_kwargs, mismatch=True ) self.assertEqual(3, len(result)) # the aux coord is not compressed due to its string dtype, and From 1a4f2f29de1f69f56317d204d5388ff692ef103a Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Fri, 31 Oct 2025 15:38:04 +0000 Subject: [PATCH 14/43] Initial dataset wrappers. Rename; addin parts of old investigation; add temporary notes. 
--- .../netcdf/_bytecoding_datasets.py | 182 ++++++++++++++ .../fileformats/netcdf/_thread_safe_nc.py | 15 +- .../integration/netcdf/test_chararrays.py | 234 ++++++++++++++++++ .../fileformats/netcdf/encoding_tests.txt | 18 ++ .../netcdf/test_bytecoding_datasets.py | 14 ++ 5 files changed, 457 insertions(+), 6 deletions(-) create mode 100644 lib/iris/fileformats/netcdf/_bytecoding_datasets.py create mode 100644 lib/iris/tests/integration/netcdf/test_chararrays.py create mode 100644 lib/iris/tests/unit/fileformats/netcdf/encoding_tests.txt create mode 100644 lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py diff --git a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py new file mode 100644 index 0000000000..41e801d103 --- /dev/null +++ b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py @@ -0,0 +1,182 @@ +# Copyright Iris contributors +# +# This file is part of Iris and is released under the BSD license. +# See LICENSE in the root of the repository for full licensing details. +"""Module providing to netcdf datasets with automatic character encoding. + +The requirement is to convert numpy fixed-width unicode arrays on writing to a variable +which is declared as a byte (character) array with a fixed-length string dimension. + +Numpy unicode string arrays are ones with dtypes of the form "U". +Numpy character variables have the dtype "S1", and map to a fixed-length "string +dimension". + +In principle, netCDF4 already performs these translations, but in practice current +releases are not functional for anything other than "ascii" encoding -- including UTF-8, +which is the most obvious and desirable "general" solution. + +There is also the question of whether we should like to implement UTF-8 as our default. 
+Current discussions on this are inconclusive and neither CF conventions nor the NetCDF +User Guide are definite on what possible values of "_Encoding" are, or what the effective +default is, even though they do both mention the "_Encoding" attribute as a potential +way to handle the issue. + +Because of this, we interpret as follows: + * when reading bytes : in the absence of an "_Encoding" attribute, we will attempt to + decode bytes as UTF-8 + * when writing strings : in the absence of an "_Encoding" attribute (on the Iris + cube or coord object), we will attempt to encode data with "ascii" : If this fails, + it raise an error prompting the user to supply an "_Encoding" attribute. + +Where an "_Encoding" attribute is provided to Iris, we will honour it where possible, +identifying with "codecs.lookup" : This means we support the encodings in the Python +Standard Library, and the name aliases which it recognises. + +See: + +* known problems https://github.com/Unidata/netcdf4-python/issues/1440 +* suggestions for how this "ought" to work, discussed in the netcdf-c library + * https://github.com/Unidata/netcdf-c/issues/402 + +""" + +import codecs +import warnings + +import numpy as np + +from iris.fileformats.netcdf._thread_safe_nc import DatasetWrapper, VariableWrapper + + +def decode_bytesarray_to_stringarray( + byte_array: np.ndarray, encoding="utf-8", string_width: int | None = None +) -> np.ndarray: + """Convert an array of bytes to an array of strings, with one less dimension. + + N.B. for now at least, we assume the string dim is **always the last one**. + If 'string_width' is not given, it is set to the final dimension of 'byte_array'. 
+ """ + bytes_shape = byte_array.shape + var_shape = bytes_shape[:-1] + if string_width is None: + string_width = bytes_shape[-1] + string_dtype = f"U{string_width}" + result = np.empty(var_shape, dtype=string_dtype) + for ndindex in np.ndindex(var_shape): + element_bytes = byte_array[ndindex] + bytes = b"".join([b if b else b"\0" for b in element_bytes]) + string = bytes.decode(encoding) + result[ndindex] = string + return result + + +def encode_stringarray_as_bytearray( + data: np.ndarray, encoding=None, string_dimension_length: int | None = None +) -> np.ndarray: + """Encode strings as bytearray. + + Note: if 'string_dimension_length' is not given (None), it is set to the longest + encoded bytes element. If 'string_dimension_length' is specified, the last array + dimension is set to this and content strings are truncated or extended as required. + """ + element_shape = data.shape + max_length = 1 # this is a MINIMUM - i.e. not zero! + data_elements = np.zeros(element_shape, dtype=object) + for index in np.ndindex(element_shape): + data_element = data[index].encode(encoding=encoding) + element_length = len(data_element) + data_elements[index] = data_element + if element_length > max_length: + max_length = element_length + + if string_dimension_length is None: + string_dimension_length = max_length + + # We already encoded all the strings, but stored them in an object-array as + # we didn't yet know the fixed byte-length to convert to. 
+ # Now convert to a fixed-width byte array with an extra string-length dimension + result = np.zeros(element_shape + (string_dimension_length,), dtype="S1") + right_pad = b"\0" * string_dimension_length + for index in np.ndindex(element_shape): + bytes = data_elements[index] + bytes = (bytes + right_pad)[:string_dimension_length] + result[index] = [bytes[i : i + 1] for i in range(string_dimension_length)] + + return result + + +DEFAULT_ENCODING = "utf-8" + + +class EncodedVariable(VariableWrapper): + """A variable wrapper that translates variable data according to byte encodings.""" + + def __getitem__(self, keys): + if self.is_chardata(): + super().set_auto_chartostring(False) + + data = super().__getitem__(keys) + + if self.is_chardata(): + encoding = self.get_byte_encoding() + strlen = self.get_string_length() + data = decode_bytesarray_to_stringarray(data, encoding, strlen) + + return data + + def __setitem__(self, keys, data): + if self.is_chardata(): + encoding = self.get_byte_encoding() + strlen = self.get_string_length() + if encoding is not None: + data = encode_stringarray_as_bytearray(data, encoding, strlen) + else: + try: + # Check if all characters are valid ascii + data = encode_stringarray_as_bytearray(data, "ascii", strlen) + except UnicodeEncodeError: + data = encode_stringarray_as_bytearray( + data, DEFAULT_ENCODING, strlen + ) + # As this was necessary, record the new encoding on the variable + self.set_ncattr("_Encoding", DEFAULT_ENCODING) + msg = ( + f"Non-ascii data written to label variable {self.name}. " + f"Applied {DEFAULT_ENCODING!r} encoding, " + f"and set attribute _Encoding={DEFAULT_ENCODING!r}." 
+ ) + warnings.warn(msg, UserWarning) + + super().set_auto_chartostring(False) + + super().__setitem__(keys, data) + + def is_chardata(self): + return np.issubdtype(self.dtype, np.bytes_) + + def get_encoding(self) -> str | None: + """Get the effective byte encoding to be used for this variable.""" + # utf-8 is a reasonable "safe" default, equivalent to 'ascii' for ascii data + result = getattr(self, "_Encoding", None) + if result is not None: + try: + # Accept + normalise naming of encodings + result = codecs.lookup(result).name + # NOTE: if encoding does not suit data, errors can occur. + # For example, _Encoding = "ascii", with non-ascii content. + except LookupError: + # Replace some invalid setting with "safe"(ish) fallback. + msg = f"Unknown encoding for variable {self.name!r}: {result!r}" + warnings.warn(msg, UserWarning) + + return result + + def get_string_length(self): + """Return the string-length defined for this variable (or None).""" + return getattr(self, "iris_string_length", None) + + +class EncodedDataset(DatasetWrapper): + """A specialised DatasetWrapper whose variables perform byte encoding.""" + + VAR_WRAPPER_CLS = EncodedVariable diff --git a/lib/iris/fileformats/netcdf/_thread_safe_nc.py b/lib/iris/fileformats/netcdf/_thread_safe_nc.py index 33183ef0fa..46b8609bb7 100644 --- a/lib/iris/fileformats/netcdf/_thread_safe_nc.py +++ b/lib/iris/fileformats/netcdf/_thread_safe_nc.py @@ -159,6 +159,9 @@ class GroupWrapper(_ThreadSafeWrapper): CONTAINED_CLASS = netCDF4.Group # Note: will also accept a whole Dataset object, but that is OK. _DUCKTYPE_CHECK_PROPERTIES = ["createVariable"] + # Class to use when creating variable wrappers (default=VariableWrapper). + # - needed to support _byte_encoded_data.EncodedDataset. + VAR_WRAPPER_CLS = VariableWrapper # All Group API that returns Dimension(s) is wrapped to instead return # DimensionWrapper(s). 
@@ -203,7 +206,7 @@ def variables(self) -> typing.Dict[str, VariableWrapper]: """ with _GLOBAL_NETCDF4_LOCK: variables_ = self._contained_instance.variables - return {k: VariableWrapper.from_existing(v) for k, v in variables_.items()} + return {k: self.VAR_WRAPPER_CLS.from_existing(v) for k, v in variables_.items()} def createVariable(self, *args, **kwargs) -> VariableWrapper: """Call createVariable() from netCDF4.Group/Dataset within _GLOBAL_NETCDF4_LOCK. @@ -216,7 +219,7 @@ def createVariable(self, *args, **kwargs) -> VariableWrapper: """ with _GLOBAL_NETCDF4_LOCK: new_variable = self._contained_instance.createVariable(*args, **kwargs) - return VariableWrapper.from_existing(new_variable) + return self.VAR_WRAPPER_CLS.from_existing(new_variable) def get_variables_by_attributes( self, *args, **kwargs @@ -234,7 +237,7 @@ def get_variables_by_attributes( variables_ = list( self._contained_instance.get_variables_by_attributes(*args, **kwargs) ) - return [VariableWrapper.from_existing(v) for v in variables_] + return [self.VAR_WRAPPER_CLS.from_existing(v) for v in variables_] # All Group API that returns Group(s) is wrapped to instead return # GroupWrapper(s). @@ -252,7 +255,7 @@ def groups(self): """ with _GLOBAL_NETCDF4_LOCK: groups_ = self._contained_instance.groups - return {k: GroupWrapper.from_existing(v) for k, v in groups_.items()} + return {k: self.__class__.from_existing(v) for k, v in groups_.items()} @property def parent(self): @@ -268,7 +271,7 @@ def parent(self): """ with _GLOBAL_NETCDF4_LOCK: parent_ = self._contained_instance.parent - return GroupWrapper.from_existing(parent_) + return self.__class__.from_existing(parent_) def createGroup(self, *args, **kwargs): """Call createGroup() from netCDF4.Group/Dataset. 
@@ -281,7 +284,7 @@ def createGroup(self, *args, **kwargs): """ with _GLOBAL_NETCDF4_LOCK: new_group = self._contained_instance.createGroup(*args, **kwargs) - return GroupWrapper.from_existing(new_group) + return self.__class__.from_existing(new_group) class DatasetWrapper(GroupWrapper): diff --git a/lib/iris/tests/integration/netcdf/test_chararrays.py b/lib/iris/tests/integration/netcdf/test_chararrays.py new file mode 100644 index 0000000000..0eb211c8b0 --- /dev/null +++ b/lib/iris/tests/integration/netcdf/test_chararrays.py @@ -0,0 +1,234 @@ +# Copyright Iris contributors +# +# This file is part of Iris and is released under the BSD license. +# See LICENSE in the root of the repository for full licensing details. +"""Integration tests for string data handling.""" + +import subprocess + +import numpy as np +import pytest + +import iris +from iris.coords import AuxCoord, DimCoord +from iris.cube import Cube +from iris.fileformats.netcdf import _thread_safe_nc +from iris.tests import env_bin_path + +NX, N_STRLEN = 3, 64 +TEST_STRINGS = ["Münster", "London", "Amsterdam"] +TEST_COORD_VALS = ["bun", "éclair", "sandwich"] + +# VARS_COORDS_SHARE_STRING_DIM = True +VARS_COORDS_SHARE_STRING_DIM = False +if VARS_COORDS_SHARE_STRING_DIM: + TEST_COORD_VALS[-1] = "Xsandwich" # makes the max coord strlen same as data one + + +# Ensure all tests run with "split attrs" turned on. +@pytest.fixture(scope="module", autouse=True) +def enable_split_attrs(): + with iris.FUTURE.context(save_split_attrs=True): + yield + + +def convert_strings_to_chararray(string_array_1d, maxlen, encoding="utf-8"): + bbytes = [text.encode(encoding) for text in string_array_1d] + pad = b"\0" * maxlen + bbytes = [(x + pad)[:maxlen] for x in bbytes] + chararray = np.array([[bb[i : i + 1] for i in range(maxlen)] for bb in bbytes]) + return chararray + + +def convert_bytesarray_to_strings( + byte_array, encoding="utf-8", string_length: int | None = None +): + """Convert bytes to strings. + + N.B. 
for now at least, we assume the string dim is **always the last one**. + """ + bytes_shape = byte_array.shape + var_shape = bytes_shape[:-1] + if string_length is None: + string_length = bytes_shape[-1] + string_dtype = f"U{string_length}" + result = np.empty(var_shape, dtype=string_dtype) + for ndindex in np.ndindex(var_shape): + element_bytes = byte_array[ndindex] + bytes = b"".join([b if b else b"\0" for b in element_bytes]) + string = bytes.decode(encoding) + result[ndindex] = string + return result + + +INCLUDE_COORD = True +# INCLUDE_COORD = False + +INCLUDE_NUMERIC_AUXCOORD = True +# INCLUDE_NUMERIC_AUXCOORD = False + + +def make_testfile(filepath, chararray, coordarray, encoding_str=None): + ds = _thread_safe_nc.DatasetWrapper(filepath, "w") + try: + ds.createDimension("x", NX) + ds.createDimension("nstr", N_STRLEN) + vx = ds.createVariable("x", int, dimensions=("x")) + vx[:] = np.arange(NX) + if INCLUDE_COORD: + ds.createDimension("nstr2", N_STRLEN) + v_co = ds.createVariable( + "v_co", + "S1", + dimensions=( + "x", + "nstr2", + ), + ) + v_co[:] = coordarray + if encoding_str is not None: + v_co._Encoding = encoding_str + if INCLUDE_NUMERIC_AUXCOORD: + v_num = ds.createVariable( + "v_num", + float, + dimensions=("x",), + ) + v_num[:] = np.arange(NX) + v = ds.createVariable( + "v", + "S1", + dimensions=( + "x", + "nstr", + ), + ) + v[:] = chararray + if encoding_str is not None: + v._Encoding = encoding_str + if INCLUDE_COORD: + coords_str = "v_co" + if INCLUDE_NUMERIC_AUXCOORD: + coords_str += " v_num" + v.coordinates = coords_str + finally: + ds.close() + + +def make_testcube( + dataarray, + coordarray, # for now, these are always *string* arrays + encoding_str: str | None = None, +): + cube = Cube(dataarray, var_name="v") + cube.add_dim_coord(DimCoord(np.arange(NX), var_name="x"), 0) + if encoding_str is not None: + cube.attributes["_Encoding"] = encoding_str + if INCLUDE_COORD: + co_x = AuxCoord(coordarray, var_name="v_co") + if encoding_str is not 
None: + co_x.attributes["_Encoding"] = encoding_str + cube.add_aux_coord(co_x, 0) + return cube + + +NCDUMP_PATHSTR = str(env_bin_path("ncdump")) + + +def ncdump(nc_path: str, *args): + """Call ncdump to print a dump of a file.""" + call_args = [NCDUMP_PATHSTR, nc_path] + list(*args) + subprocess.run(call_args, check=True) + + +def show_result(filepath): + print(f"File {filepath}") + print("NCDUMP:") + ncdump(filepath) + # with nc.Dataset(filepath, "r") as ds: + # v = ds.variables["v"] + # print("\n----\nNetcdf data readback (basic)") + # try: + # print(repr(v[:])) + # except UnicodeDecodeError as err: + # print(repr(err)) + # print("..raw:") + # v.set_auto_chartostring(False) + # print(repr(v[:])) + print("\nAs iris cube..") + try: + iris.loading.LOAD_PROBLEMS.reset() + cube = iris.load_cube(filepath) + print(cube) + if iris.loading.LOAD_PROBLEMS.problems: + print(iris.loading.LOAD_PROBLEMS) + print( + "\n".join(iris.loading.LOAD_PROBLEMS.problems[0].stack_trace.format()) + ) + print("-data-") + print(repr(cube.data)) + print("-numeric auxcoord data-") + print(repr(cube.coord("x").points)) + if INCLUDE_COORD: + print("-string auxcoord data-") + try: + print(repr(cube.coord("v_co").points)) + except Exception as err2: + print(repr(err2)) + except UnicodeDecodeError as err: + print(repr(err)) + + +@pytest.fixture(scope="session") +def save_dir(tmp_path_factory): + return tmp_path_factory.mktemp("save_files") + + +# TODO: the tests don't test things properly yet, they just exercise the code and print +# things for manual debugging. 
+tsts = ( + None, + "ascii", + "utf-8", + "utf-32", +) +# tsts = ("utf-8",) +# tsts = ("utf-8", "utf-32",) +# tsts = ("utf-32",) +# tsts = ("utf-8", "ascii", "utf-8") + + +@pytest.mark.parametrize("encoding", tsts) +def test_load_encodings(encoding, save_dir): + # small change + print(f"\n=========\nTesting encoding: {encoding}") + filepath = save_dir / f"tmp_load_{str(encoding)}.nc" + do_as = encoding + if encoding != "utf-32": + do_as = "utf-8" + TEST_CHARARRAY = convert_strings_to_chararray( + TEST_STRINGS, N_STRLEN, encoding=do_as + ) + TEST_COORDARRAY = convert_strings_to_chararray( + TEST_COORD_VALS, N_STRLEN, encoding=do_as + ) + make_testfile(filepath, TEST_CHARARRAY, TEST_COORDARRAY, encoding_str=encoding) + show_result(filepath) + + +@pytest.mark.parametrize("encoding", tsts) +def test_save_encodings(encoding, save_dir): + cube = make_testcube( + dataarray=TEST_STRINGS, coordarray=TEST_COORD_VALS, encoding_str=encoding + ) + print(cube) + filepath = save_dir / f"tmp_save_{str(encoding)}.nc" + if encoding == "ascii": + with pytest.raises( + UnicodeEncodeError, + match="'ascii' codec can't encode character.*not in range", + ): + iris.save(cube, filepath) + else: + iris.save(cube, filepath) + show_result(filepath) diff --git a/lib/iris/tests/unit/fileformats/netcdf/encoding_tests.txt b/lib/iris/tests/unit/fileformats/netcdf/encoding_tests.txt new file mode 100644 index 0000000000..bab04aa0c4 --- /dev/null +++ b/lib/iris/tests/unit/fileformats/netcdf/encoding_tests.txt @@ -0,0 +1,18 @@ + +forms in files: + * char chardata(dim1, dim2, strlen_xx); # char data + * string data(dim1, dim2); + +forms in numpy: + * np.ndarray(dtype="S1") # char data + * np.ndarray(dtype="Snn") # char data + * np.ndarray(dtype="Unn") # strings + * np.ndarray(dtype="") + +possibilities in createVariable: +""" + The datatype can be a numpy datatype object, or a string that describes a numpy dtype object ... 
+ datatype can also be a CompoundType instance (for a structured, or compound array), a VLType instance (for a variable-length array), +** or the python str builtin (for a variable-length string array). +** Numpy string and unicode datatypes with length greater than one are aliases for str. +""" diff --git a/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py b/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py new file mode 100644 index 0000000000..8b449c5912 --- /dev/null +++ b/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py @@ -0,0 +1,14 @@ +# Copyright Iris contributors +# +# This file is part of Iris and is released under the BSD license. +# See LICENSE in the root of the repository for full licensing details. +"""Unit tests for :class:`iris.fileformats.netcdf._bytecoding_datasets` module.""" + +# import numpy as np +# import pytest +# +# from iris.fileformats.netcdf._bytecoding_datasets import EncodedDataset + + +class TestEncodedDataset: + """Test how GRIB_PARAM attributes convert to strings for storage in netcdf files.""" From 0148f437f1872cdda9f3d41ccb37291ce4cba893 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Wed, 3 Dec 2025 18:59:43 +0000 Subject: [PATCH 15/43] Various notes, choices + changes: Beginnings of encoded-dataset testing. 
--- .../netcdf/_bytecoding_datasets.py | 155 ++++++++---- .../integration/netcdf/test_chararrays.py | 7 +- .../fileformats/netcdf/encoding_tests.txt | 164 +++++++++++++ .../netcdf/test_bytecoding_datasets.py | 223 +++++++++++++++++- .../unit/fileformats/netcdf/test_nc_dtypes.py | 96 ++++++++ 5 files changed, 595 insertions(+), 50 deletions(-) create mode 100644 lib/iris/tests/unit/fileformats/netcdf/test_nc_dtypes.py diff --git a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py index 41e801d103..353f14d538 100644 --- a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py +++ b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py @@ -41,6 +41,8 @@ """ import codecs +import contextlib +import threading import warnings import numpy as np @@ -49,17 +51,18 @@ def decode_bytesarray_to_stringarray( - byte_array: np.ndarray, encoding="utf-8", string_width: int | None = None + byte_array: np.ndarray, encoding: str, string_width: int ) -> np.ndarray: """Convert an array of bytes to an array of strings, with one less dimension. N.B. for now at least, we assume the string dim is **always the last one**. If 'string_width' is not given, it is set to the final dimension of 'byte_array'. """ + if np.ma.isMaskedArray(byte_array): + # netCDF4-python sees zeros as "missing" -- we don't need or want that + byte_array = byte_array.data bytes_shape = byte_array.shape var_shape = bytes_shape[:-1] - if string_width is None: - string_width = bytes_shape[-1] string_dtype = f"U{string_width}" result = np.empty(var_shape, dtype=string_dtype) for ndindex in np.ndindex(var_shape): @@ -70,16 +73,25 @@ def decode_bytesarray_to_stringarray( return result -def encode_stringarray_as_bytearray( +# +# TODO: remove? +# this older version is "overly flexible", less efficient and not needed here. 
+# +def flexi_encode_stringarray_as_bytearray( data: np.ndarray, encoding=None, string_dimension_length: int | None = None ) -> np.ndarray: """Encode strings as bytearray. Note: if 'string_dimension_length' is not given (None), it is set to the longest - encoded bytes element. If 'string_dimension_length' is specified, the last array + encoded bytes element, **OR** the dtype size, if that is greater. + If 'string_dimension_length' is specified, the last array dimension is set to this and content strings are truncated or extended as required. """ + if np.ma.isMaskedArray(data): + # netCDF4-python sees zeros as "missing" -- we don't need or want that + data = data.data element_shape = data.shape + # Encode all the strings + see which is longest max_length = 1 # this is a MINIMUM - i.e. not zero! data_elements = np.zeros(element_shape, dtype=object) for index in np.ndindex(element_shape): @@ -90,10 +102,15 @@ def encode_stringarray_as_bytearray( max_length = element_length if string_dimension_length is None: + # If the string length was not specified, it is the maximum encoded length + # (n-bytes), **or** the dtype string-length, if greater. string_dimension_length = max_length + array_string_length = int(str(data.dtype)[2:]) # Yuck. No better public way? + if array_string_length > string_dimension_length: + string_dimension_length = array_string_length - # We already encoded all the strings, but stored them in an object-array as - # we didn't yet know the fixed byte-length to convert to. + # We maybe *already* encoded all the strings above, but stored them in an + # object-array as we didn't yet know the fixed byte-length to convert to. 
# Now convert to a fixed-width byte array with an extra string-length dimension result = np.zeros(element_shape + (string_dimension_length,), dtype="S1") right_pad = b"\0" * string_dimension_length @@ -105,58 +122,98 @@ def encode_stringarray_as_bytearray( return result -DEFAULT_ENCODING = "utf-8" +def encode_stringarray_as_bytearray( + data: np.ndarray, encoding: str, string_dimension_length: int +) -> np.ndarray: + """Encode strings as a bytes array.""" + element_shape = data.shape + result = np.zeros(element_shape + (string_dimension_length,), dtype="S1") + right_pad = b"\0" * string_dimension_length + for index in np.ndindex(element_shape): + bytes = data[index].encode(encoding=encoding) + # It's all a bit nasty ... + bytes = (bytes + right_pad)[:string_dimension_length] + result[index] = [bytes[i : i + 1] for i in range(string_dimension_length)] + + return result + + +class NetcdfStringDecodeSetting(threading.local): + def __init__(self, perform_encoding: bool = True): + self.set(perform_encoding) + + def set(self, perform_encoding: bool): + self.perform_encoding = perform_encoding + + def __bool__(self): + return self.perform_encoding + + @contextlib.contextmanager + def context(self, perform_encoding: bool): + old_setting = self.perform_encoding + self.perform_encoding = perform_encoding + yield + self.perform_encoding = old_setting + + +DECODE_TO_STRINGS_ON_READ = NetcdfStringDecodeSetting() +DEFAULT_READ_ENCODING = "utf-8" +DEFAULT_WRITE_ENCODING = "ascii" class EncodedVariable(VariableWrapper): """A variable wrapper that translates variable data according to byte encodings.""" def __getitem__(self, keys): - if self.is_chardata(): - super().set_auto_chartostring(False) + if self._is_chardata(): + # N.B. 
we never need to UNset this, as we totally control it + self._contained_instance.set_auto_chartostring(False) data = super().__getitem__(keys) - if self.is_chardata(): - encoding = self.get_byte_encoding() - strlen = self.get_string_length() - data = decode_bytesarray_to_stringarray(data, encoding, strlen) + if DECODE_TO_STRINGS_ON_READ and self._is_chardata(): + encoding = self._get_encoding() or DEFAULT_READ_ENCODING + # N.B. typically, read encoding default is UTF-8 --> a "usually safe" choice + strlen = self._get_string_length() + try: + data = decode_bytesarray_to_stringarray(data, encoding, strlen) + except UnicodeDecodeError as err: + msg = ( + f"Character data in variable {self.name!r} could not be decoded" + f"with the {encoding!r} encoding. This can be fixed by setting the " + "variable '_Encoding' attribute to suit the content." + ) + raise ValueError(msg) from err return data def __setitem__(self, keys, data): - if self.is_chardata(): - encoding = self.get_byte_encoding() - strlen = self.get_string_length() - if encoding is not None: - data = encode_stringarray_as_bytearray(data, encoding, strlen) - else: + if self._is_chardata(): + # N.B. we never need to UNset this, as we totally control it + self._contained_instance.set_auto_chartostring(False) + + encoding = self._get_encoding() or DEFAULT_WRITE_ENCODING + # N.B. typically, write encoding default is "ascii" --> fails bad content + if data.dtype.kind == "U": try: - # Check if all characters are valid ascii - data = encode_stringarray_as_bytearray(data, "ascii", strlen) - except UnicodeEncodeError: - data = encode_stringarray_as_bytearray( - data, DEFAULT_ENCODING, strlen - ) - # As this was necessary, record the new encoding on the variable - self.set_ncattr("_Encoding", DEFAULT_ENCODING) + strlen = self._get_string_length() + data = encode_stringarray_as_bytearray(data, encoding, strlen) + except UnicodeEncodeError as err: msg = ( - f"Non-ascii data written to label variable {self.name}. 
" - f"Applied {DEFAULT_ENCODING!r} encoding, " - f"and set attribute _Encoding={DEFAULT_ENCODING!r}." + f"String data written to netcdf character variable {self.name!r} " + f"could not be represented in encoding {encoding!r}. This can be " + "fixed by setting a suitable variable '_Encoding' attribute, " + 'e.g. ._Encoding="UTF-8".' ) - warnings.warn(msg, UserWarning) - - super().set_auto_chartostring(False) + raise ValueError(msg) from err super().__setitem__(keys, data) - def is_chardata(self): + def _is_chardata(self): return np.issubdtype(self.dtype, np.bytes_) - def get_encoding(self) -> str | None: - """Get the effective byte encoding to be used for this variable.""" - # utf-8 is a reasonable "safe" default, equivalent to 'ascii' for ascii data + def _get_encoding(self) -> str | None: + """Get the byte encoding defined for this variable (or None).""" result = getattr(self, "_Encoding", None) if result is not None: try: @@ -165,18 +222,32 @@ def get_encoding(self) -> str | None: # NOTE: if encoding does not suit data, errors can occur. # For example, _Encoding = "ascii", with non-ascii content. except LookupError: - # Replace some invalid setting with "safe"(ish) fallback. + # Unrecognised encoding name : handle this as just a warning msg = f"Unknown encoding for variable {self.name!r}: {result!r}" warnings.warn(msg, UserWarning) return result - def get_string_length(self): - """Return the string-length defined for this variable (or None).""" - return getattr(self, "iris_string_length", None) + def _get_string_length(self): + """Return the string-length defined for this variable.""" + if not hasattr(self, "_strlen"): + # Work out the string length from the parent dataset dimensions. + strlen = self.group().dimensions[self.dimensions[-1]].size + # Cache this on the variable -- but not as a netcdf attribute (!) 
+ self.__dict__["_strlen"] = strlen + + return self._strlen + + def set_auto_chartostring(self, onoff: bool): + msg = "auto_chartostring is not supported by Iris 'EncodedVariable' type." + raise TypeError(msg) class EncodedDataset(DatasetWrapper): """A specialised DatasetWrapper whose variables perform byte encoding.""" VAR_WRAPPER_CLS = EncodedVariable + + def set_auto_chartostring(self, onoff: bool): + msg = "auto_chartostring is not supported by Iris 'EncodedDataset' type." + raise TypeError(msg) diff --git a/lib/iris/tests/integration/netcdf/test_chararrays.py b/lib/iris/tests/integration/netcdf/test_chararrays.py index 0eb211c8b0..4414444733 100644 --- a/lib/iris/tests/integration/netcdf/test_chararrays.py +++ b/lib/iris/tests/integration/netcdf/test_chararrays.py @@ -137,8 +137,11 @@ def make_testcube( def ncdump(nc_path: str, *args): """Call ncdump to print a dump of a file.""" - call_args = [NCDUMP_PATHSTR, nc_path] + list(*args) - subprocess.run(call_args, check=True) + call_args = [NCDUMP_PATHSTR, nc_path] + list(args) + bytes = subprocess.check_output(call_args) + text = bytes.decode("utf-8") + print(text) + return text def show_result(filepath): diff --git a/lib/iris/tests/unit/fileformats/netcdf/encoding_tests.txt b/lib/iris/tests/unit/fileformats/netcdf/encoding_tests.txt index bab04aa0c4..e77427cd63 100644 --- a/lib/iris/tests/unit/fileformats/netcdf/encoding_tests.txt +++ b/lib/iris/tests/unit/fileformats/netcdf/encoding_tests.txt @@ -1,8 +1,95 @@ +=========== +Outstanding Qs +* What would we like to do with all this IN IRIS?? + - generally present as string arrays (Uxx) + - existing scheme of naming dims for length + re-using is quite cunning! + - choice of seeing actual character arrays as alternative to string conversions? 
+ +* string length handling for load/save/roundtrip + - on SAVE, we need some control so we can create files which are compatible, + irrespective of the data (which currently we are not doing) + - ALSO this is wanted to ensure that multiple vars (e.g. string cubes or string coords) + will share the string dim -- instead of creating arbitrary different ones + - presumably, if encoding blows the max-len, we must get a warning/error + + - on LOAD, we may want to *capture* the actual original string dim length, so it can be + re-created on save (by some scheme, as per previous) -- i.e. enable roundtripping. + I don't really want to preserve the name of the string dim, but this could be a + slightly tender point. To consider also : the impact of this on the non-equivalence + of loaded cubes, if we use actual *attributes* to carry this info (see below). + - **if not** : just load data + convert to string arrays as seems best + - this will also lead to incompatible cubes. + + - on SAVE, in the absence of strlen-controls, what is a reasonable default choice? + - take longest encoded + - set nbytes = NEXPAND(encoding) * nchars + - sensible values would depend on the encoding... + : ascii -> 1 + : utf-8 -> 1 or 4 ??? + : utf-16 -> 2 or 4 ??? + : utf-32 -> 4 + + - on LOAD, in absence of strlen controls, how do we choose the result DTYPE (i.e. character length)? + - again, may depend on the encoding: + : ascii = "U" + : UTF-8 = "U" + : UTF-16 = "U" + : UTF-32 = "U" + - N.B. these are ll at least "safe" - i.e. won't lose characters + + +separately from these, there is the question of how the controls affect "normal" +cube operations. + - the easiest approach is to define a "special" attribute, + which can be set on any cube/component + - using the dtype-length of the data would be *possible*, in conjunction with the + above-proposed "default rules" for choosing strlen from the dtype. + But this might not round-trip in all cases. 
+ +within the actual data arrays + - we can't really expect any different to what numpy does + - that is, the dtype-length of any element <= that of the array (and not ==) + this may be tricky, but we can't easily prevent it. + >>> a = np.array(['', 'a', 'bb']) + >>> a + array(['', 'a', 'bb'], dtype='>> a[0].dtype + dtype('>> a[1].dtype + dtype('>> a[2].dtype + dtype('>> a.dtype + dtype('>> + - likewise, we can't assign without possible truncation. + If you **want** to expand the supported width, can use ".astype()" first ? + + +======================== +========================= forms in files: * char chardata(dim1, dim2, strlen_xx); # char data * string data(dim1, dim2); +netcdf types: +(netcdf docs terms) + NC_BYTE 8-bit signed integer + NC_UBYTE 8-bit unsigned integer + NC_CHAR 8-bit character + NC_STRING variable length character string + +***NOTE*** there is no NC_UCHAR or "unsigned char" type + + +relevant numpy base types (scalar dtypes): + * "S" bytes : np.bytes_ == np.int8 + * "B" unsigned bytes : np.ubyte == np.uint8 + * 'i' ints : np.int_ + * 'u' unsigned ints : np.int_ + * "U" unicode string : np.str_ + forms in numpy: * np.ndarray(dtype="S1") # char data * np.ndarray(dtype="Snn") # char data @@ -16,3 +103,80 @@ possibilities in createVariable: ** or the python str builtin (for a variable-length string array). ** Numpy string and unicode datatypes with length greater than one are aliases for str. """ + +test types: + "i1" : np.int8 + "u1" : np.uint8 + "S1" : np.byte_ + "U1" : np.str_ + "S" : + "U" : with/without non-ascii content + +save all these to files... +outputs from "test_nc_dtypes.py" test run: + SPEC:i1 SAVED-AS:int8 byte RELOAD-AS:int8 + SPEC:u1 SAVED-AS:uint8 ubyte RELOAD-AS:uint8 + SPEC:S1 SAVED-AS:|S1 char RELOAD-AS: () + SPEC:U1 SAVED-AS: EncodedDataset: + """Create a test EncodedDataset linked to an actual file. + + * strlen becomes the string dimension (i.e. 
a number of *bytes*) + * a variable "vxs" is created + * If 'encoding' is given, the "vxs::_Encoding" attribute is created with this value + """ + ds = EncodedDataset(path, "w") + ds.createDimension("x", 3) + ds.createDimension("strlen", strlen) + v = ds.createVariable("vxs", "S1", ("x", "strlen")) + if encoding is not None: + v.setncattr("_Encoding", encoding) + return ds + + +def fetch_undecoded_var(path, varname): + # Open a path as a "normal" dataset, and return a given variable. + ds_normal = DatasetWrapper(path) + ds_normal._contained_instance.set_auto_chartostring(False) + v = ds_normal.variables[varname] + # Return a variable, rather than its data, so we can check attributes etc. + return v + + +class TestWriteStrings: + """Test how string data is saved to a file.""" + + def test_write_strings(self, encoding, tempdir): + # Create a dataset with the variable + path = tempdir / f"test_writestrings_encoding_{encoding!s}.nc" + + if encoding in [None, "ascii"]: + writedata = samples_3_ascii + write_encoding = "ascii" + else: + writedata = samples_3_nonascii + write_encoding = encoding + + writedata = writedata.copy() # just for safety? + strlen = strings_maxbytes(writedata, write_encoding) + + ds_encoded = make_encoded_dataset(path, strlen, encoding) + v = ds_encoded.variables["vxs"] + + # Effectively, checks that we *can* write strings + v[:] = writedata + + # Close, re-open as an "ordinary" dataset, and check the raw content. 
+ ds_encoded.close() + v = fetch_undecoded_var(path, "vxs") + + # Check that the raw result is as expected + bytes_result = v[:] + expected = encode_stringarray_as_bytearray(writedata, write_encoding, strlen) + assert ( + bytes_result.shape == expected.shape + and bytes_result.dtype == expected.dtype + and np.all(bytes_result == expected) + ) + + # Check that the "_Encoding" property is also as expected + result_attr = v.getncattr("_Encoding") if "_Encoding" in v.ncattrs() else None + assert result_attr == encoding + + def test_scalar(self, tempdir): + # Like 'test_write_strings', but the variable has *only* the string dimension. + path = tempdir / "test_writestrings_scalar.nc" + + ds_encoded = make_encoded_dataset(path, strlen=5) + v = ds_encoded.createVariable("v0_scalar", "S1", ("strlen",)) + + # Checks that we *can* write a string + v[:] = np.array("stuff", dtype=str) + + # Close, re-open as an "ordinary" dataset, and check the raw content. + ds_encoded.close() + v = fetch_undecoded_var(path, "v0_scalar") + result = v[:] + + # Check that the raw result is as expected + assert ( + result.shape == (5,) + and result.dtype == " Date: Fri, 5 Dec 2025 12:51:04 +0000 Subject: [PATCH 16/43] Replace use of encoding functions with test-specific function: Test for overlength writes. 
--- .../netcdf/_bytecoding_datasets.py | 6 +- .../fileformats/netcdf/encoding_tests.txt | 15 +- .../netcdf/test_bytecoding_datasets.py | 194 ++++++++++++------ 3 files changed, 147 insertions(+), 68 deletions(-) diff --git a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py index 353f14d538..62e1dd2ab7 100644 --- a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py +++ b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py @@ -123,9 +123,10 @@ def flexi_encode_stringarray_as_bytearray( def encode_stringarray_as_bytearray( - data: np.ndarray, encoding: str, string_dimension_length: int + data: np.typing.ArrayLike, encoding: str, string_dimension_length: int ) -> np.ndarray: """Encode strings as a bytes array.""" + data = np.asanyarray(data) element_shape = data.shape result = np.zeros(element_shape + (string_dimension_length,), dtype="S1") right_pad = b"\0" * string_dimension_length @@ -179,7 +180,7 @@ def __getitem__(self, keys): data = decode_bytesarray_to_stringarray(data, encoding, strlen) except UnicodeDecodeError as err: msg = ( - f"Character data in variable {self.name!r} could not be decoded" + f"Character data in variable {self.name!r} could not be decoded " f"with the {encoding!r} encoding. This can be fixed by setting the " "variable '_Encoding' attribute to suit the content." ) @@ -188,6 +189,7 @@ def __getitem__(self, keys): return data def __setitem__(self, keys, data): + data = np.asanyarray(data) if self._is_chardata(): # N.B. we never need to UNset this, as we totally control it self._contained_instance.set_auto_chartostring(False) diff --git a/lib/iris/tests/unit/fileformats/netcdf/encoding_tests.txt b/lib/iris/tests/unit/fileformats/netcdf/encoding_tests.txt index e77427cd63..5fa021ccdd 100644 --- a/lib/iris/tests/unit/fileformats/netcdf/encoding_tests.txt +++ b/lib/iris/tests/unit/fileformats/netcdf/encoding_tests.txt @@ -146,12 +146,17 @@ Then, as regards the _Encoding .. TO TEST... 
========== -create a dataset + write char data - - X assign different encodings: makes no difference +NOTE on length control: + - not an API thing, it's implicit from when you create a variable + - this also applies to how it loads back + - BUT here there may be scope for a control attribute : -create a dataset + write STRING data - - X encoding=(ascii, utf-8, utf-32, None) - - X withnonascii=(T, F) ++++ create a dataset + write char data ++++ - X assign different encodings: makes no difference + ++++ create a dataset + write STRING data ++++ - X encoding=(ascii, utf-8, utf-32, None) ++++ - X withnonascii=(T, F) - X length=(long, short, none) read string data diff --git a/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py b/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py index 092da19a00..411212b973 100644 --- a/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py +++ b/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py @@ -9,11 +9,7 @@ import numpy as np import pytest -from iris.fileformats.netcdf._bytecoding_datasets import ( - EncodedDataset, - encode_stringarray_as_bytearray, - flexi_encode_stringarray_as_bytearray, -) +from iris.fileformats.netcdf._bytecoding_datasets import EncodedDataset from iris.fileformats.netcdf._thread_safe_nc import DatasetWrapper encoding_options = [None, "ascii", "utf-8", "utf-32"] @@ -66,8 +62,92 @@ def fetch_undecoded_var(path, varname): return v +def check_raw_content(path, varname, expected_byte_array): + v = fetch_undecoded_var(path, varname) + bytes_result = v[:] + assert ( + bytes_result.shape == expected_byte_array.shape + and bytes_result.dtype == expected_byte_array.dtype + and np.all(bytes_result == expected_byte_array) + ) + + +def _make_bytearray_inner(data, encoding): + # Convert to a (list of [lists of..]) strings or bytes to a + # (list of [lists of..]) length-1 bytes with an extra dimension. 
+ if isinstance(data, str): + # Convert input strings to bytes + data = data.encode(encoding) + if isinstance(data, bytes): + # iterate over bytes to get a sequence of length-1 bytes (what np.array wants) + result = [data[i : i + 1] for i in range(len(data))] + else: + # If not string/bytes, expect the input to be a list. + # N.B. the recursion is inefficient, but we don't care about that here + result = [_make_bytearray_inner(part, encoding) for part in data] + return result + + +def make_bytearray(data, encoding="ascii"): + """Convert bytes or lists of bytes into a numpy byte array. + + This is largely to avoid using "encode_stringarray_as_bytearray", since we don't + want to depend on that when we should be testing it. + So, it mostly replicates the function of that, but it does also support bytes in the + input, and it automatically finds + applies the maximum bytes-lengths in the input. + """ + # First, Convert to a (list of [lists of]..) length-1 bytes objects + data = _make_bytearray_inner(data, encoding) + + # Numbers of bytes in the inner dimension are the lengths of bytes/strings input, + # so they aren't all the same. + # To enable array conversion, we fix that by expanding all to the max length + + def get_maxlen(data): + # Find the maximum number of bytes in the inner dimension. + if not isinstance(data, list): + # Inner bytes object + assert isinstance(data, bytes) + longest = len(data) + else: + # We have a list: either a list of bytes, or a list of lists. 
+ if len(data) == 0 or not isinstance(data[0], list): + # inner-most list, should contain bytes if anything + assert len(data) == 0 or isinstance(data[0], bytes) + # return n-bytes + longest = len(data) + else: + # list of lists: return max length of sub-lists + longest = max(get_maxlen(part) for part in data) + return longest + + maxlen = get_maxlen(data) + + def extend_all_to_maxlen(data, length, filler=b"\0"): + # Extend each "innermost" list (of single bytes) to the required length + if isinstance(data, list): + if len(data) == 0 or not isinstance(data[0], list): + # Pad all the inner-most lists to the required number of elements + n_extra = length - len(data) + if n_extra > 0: + data = data + [filler] * n_extra + else: + data = [extend_all_to_maxlen(part, length, filler) for part in data] + return data + + data = extend_all_to_maxlen(data, maxlen) + # We should now be able to create an array of single bytes. + result = np.array(data) + assert result.dtype == " Date: Fri, 5 Dec 2025 14:47:54 +0000 Subject: [PATCH 17/43] Radically simplify 'make_bytesarray', by using a known specified bytewidth. --- .../netcdf/test_bytecoding_datasets.py | 76 ++++++------------- 1 file changed, 22 insertions(+), 54 deletions(-) diff --git a/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py b/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py index 411212b973..9ef354f850 100644 --- a/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py +++ b/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py @@ -72,7 +72,7 @@ def check_raw_content(path, varname, expected_byte_array): ) -def _make_bytearray_inner(data, encoding): +def _make_bytearray_inner(data, bytewidth, encoding): # Convert to a (list of [lists of..]) strings or bytes to a # (list of [lists of..]) length-1 bytes with an extra dimension. 
if isinstance(data, str): @@ -81,61 +81,25 @@ def _make_bytearray_inner(data, encoding): if isinstance(data, bytes): # iterate over bytes to get a sequence of length-1 bytes (what np.array wants) result = [data[i : i + 1] for i in range(len(data))] + # pad or truncate everything to the required bytewidth + result = (result + [b"\0"] * bytewidth)[:bytewidth] else: # If not string/bytes, expect the input to be a list. # N.B. the recursion is inefficient, but we don't care about that here - result = [_make_bytearray_inner(part, encoding) for part in data] + result = [_make_bytearray_inner(part, bytewidth, encoding) for part in data] return result -def make_bytearray(data, encoding="ascii"): +def make_bytearray(data, bytewidth, encoding="ascii"): """Convert bytes or lists of bytes into a numpy byte array. This is largely to avoid using "encode_stringarray_as_bytearray", since we don't want to depend on that when we should be testing it. So, it mostly replicates the function of that, but it does also support bytes in the - input, and it automatically finds + applies the maximum bytes-lengths in the input. + input. """ # First, Convert to a (list of [lists of]..) length-1 bytes objects - data = _make_bytearray_inner(data, encoding) - - # Numbers of bytes in the inner dimension are the lengths of bytes/strings input, - # so they aren't all the same. - # To enable array conversion, we fix that by expanding all to the max length - - def get_maxlen(data): - # Find the maximum number of bytes in the inner dimension. - if not isinstance(data, list): - # Inner bytes object - assert isinstance(data, bytes) - longest = len(data) - else: - # We have a list: either a list of bytes, or a list of lists. 
- if len(data) == 0 or not isinstance(data[0], list): - # inner-most list, should contain bytes if anything - assert len(data) == 0 or isinstance(data[0], bytes) - # return n-bytes - longest = len(data) - else: - # list of lists: return max length of sub-lists - longest = max(get_maxlen(part) for part in data) - return longest - - maxlen = get_maxlen(data) - - def extend_all_to_maxlen(data, length, filler=b"\0"): - # Extend each "innermost" list (of single bytes) to the required length - if isinstance(data, list): - if len(data) == 0 or not isinstance(data[0], list): - # Pad all the inner-most lists to the required number of elements - n_extra = length - len(data) - if n_extra > 0: - data = data + [filler] * n_extra - else: - data = [extend_all_to_maxlen(part, length, filler) for part in data] - return data - - data = extend_all_to_maxlen(data, maxlen) + data = _make_bytearray_inner(data, bytewidth, encoding) # We should now be able to create an array of single bytes. result = np.array(data) assert result.dtype == " Date: Fri, 5 Dec 2025 16:23:55 +0000 Subject: [PATCH 18/43] Add read tests. --- .../netcdf/_bytecoding_datasets.py | 38 +++- .../netcdf/test_bytecoding_datasets.py | 165 ++++++++++++++++-- 2 files changed, 184 insertions(+), 19 deletions(-) diff --git a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py index 62e1dd2ab7..3bdc799d7f 100644 --- a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py +++ b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py @@ -175,7 +175,7 @@ def __getitem__(self, keys): if DECODE_TO_STRINGS_ON_READ and self._is_chardata(): encoding = self._get_encoding() or DEFAULT_READ_ENCODING # N.B. 
typically, read encoding default is UTF-8 --> a "usually safe" choice - strlen = self._get_string_length() + strlen = self._get_string_width() try: data = decode_bytesarray_to_stringarray(data, encoding, strlen) except UnicodeDecodeError as err: @@ -194,11 +194,11 @@ def __setitem__(self, keys, data): # N.B. we never need to UNset this, as we totally control it self._contained_instance.set_auto_chartostring(False) - encoding = self._get_encoding() or DEFAULT_WRITE_ENCODING # N.B. typically, write encoding default is "ascii" --> fails bad content if data.dtype.kind == "U": try: - strlen = self._get_string_length() + encoding = self._get_encoding() or DEFAULT_WRITE_ENCODING + strlen = self._get_byte_width() data = encode_stringarray_as_bytearray(data, encoding, strlen) except UnicodeEncodeError as err: msg = ( @@ -230,12 +230,36 @@ def _get_encoding(self) -> str | None: return result - def _get_string_length(self): + def _get_byte_width(self) -> int | None: + if not hasattr(self, "_bytewidth"): + n_bytes = self.group().dimensions[self.dimensions[-1]].size + # Cache this length control on the variable -- but not as a netcdf attribute + self.__dict__["_bytewidth"] = n_bytes + + return self.__dict__["_bytewidth"] + + def _get_string_width(self): """Return the string-length defined for this variable.""" if not hasattr(self, "_strlen"): - # Work out the string length from the parent dataset dimensions. - strlen = self.group().dimensions[self.dimensions[-1]].size - # Cache this on the variable -- but not as a netcdf attribute (!) + if hasattr(self, "iris_string_width"): + strlen = self.get_ncattr("iris_string_width") + else: + # Work out the actual byte width from the parent dataset dimensions. + strlen = self._get_byte_width() + # Convert the string dimension length (i.e. bytes) to a sufficiently-long + # string width, depending on the encoding used. 
+ encoding = self._get_encoding() or DEFAULT_READ_ENCODING + # regularise the name for comparison with recognised ones + encoding = codecs.lookup(encoding).name + if "utf-16" in encoding: + # Each char needs at least 2 bytes -- including a terminator char + strlen = (strlen // 2) - 1 + elif "utf-32" in encoding: + # Each char needs exactly 4 bytes -- including a terminator char + strlen = (strlen // 4) - 1 + # "ELSE": assume there can be (at most) as many chars as bytes + + # Cache this length control on the variable -- but not as a netcdf attribute self.__dict__["_strlen"] = strlen return self._strlen diff --git a/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py b/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py index 9ef354f850..5df511103f 100644 --- a/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py +++ b/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py @@ -9,7 +9,10 @@ import numpy as np import pytest -from iris.fileformats.netcdf._bytecoding_datasets import EncodedDataset +from iris.fileformats.netcdf._bytecoding_datasets import ( + DECODE_TO_STRINGS_ON_READ, + EncodedDataset, +) from iris.fileformats.netcdf._thread_safe_nc import DatasetWrapper encoding_options = [None, "ascii", "utf-8", "utf-32"] @@ -62,14 +65,17 @@ def fetch_undecoded_var(path, varname): return v +def check_array_matching(arr1, arr2): + """Check for arrays matching shape, dtype and content.""" + assert ( + arr1.shape == arr2.shape and arr1.dtype == arr2.dtype and np.all(arr1 == arr2) + ) + + def check_raw_content(path, varname, expected_byte_array): v = fetch_undecoded_var(path, varname) bytes_result = v[:] - assert ( - bytes_result.shape == expected_byte_array.shape - and bytes_result.dtype == expected_byte_array.dtype - and np.all(bytes_result == expected_byte_array) - ) + check_array_matching(bytes_result, expected_byte_array) def _make_bytearray_inner(data, bytewidth, encoding): @@ -102,7 +108,7 @@ def 
make_bytearray(data, bytewidth, encoding="ascii"): data = _make_bytearray_inner(data, bytewidth, encoding) # We should now be able to create an array of single bytes. result = np.array(data) - assert result.dtype == " string array + result = v[:] + expected = write_strings + if encoding == "utf-8": + # In this case, with the given non-ascii sample data, the + # "default minimum string length" is overestimated. + assert strlen == 7 and result.dtype == "U7" + # correct the result dtype to pass the write_strings comparison below + truncated_result = result.astype("U4") + # Also check that content is the same (i.e. not actually truncated) + assert np.all(truncated_result == result) + result = truncated_result + else: + # Test "raw" read --> byte array + with DECODE_TO_STRINGS_ON_READ.context(False): + result = v[:] + expected = write_bytes + + check_array_matching(result, expected) + + def test_scalar(self, tempdir, readmode): + # Like 'test_write_strings', but the variable has *only* the string dimension. + path = tempdir / f"test_read_scalar_{readmode}.nc" + + strlen = 5 + ds_encoded = make_encoded_dataset(path, strlen=strlen) + v = ds_encoded.createVariable("v0_scalar", "S1", ("strlen",)) + + data_string = "stuff" + data_bytes = make_bytearray(data_string, 5) + + # Checks that we *can* write a string + v[:] = data_bytes + + if readmode == "strings": + # Test "normal" read --> string array + result = v[:] + expected = np.array(data_string) + else: + # Test "raw" read --> byte array + with DECODE_TO_STRINGS_ON_READ.context(False): + result = v[:] + expected = data_bytes + + check_array_matching(result, expected) + + def test_multidim(self, tempdir, readmode): + # Like 'test_write_strings', but the variable has additional dimensions. 
+ path = tempdir / f"test_read_multidim_{readmode}.nc" + + strlen = 5 + ds_encoded = make_encoded_dataset(path, strlen=strlen) + ds_encoded.createDimension("y", 2) + v = ds_encoded.createVariable( + "vyxn", + "S1", + ( + "y", + "x", + "strlen", + ), + ) + + # Check that we *can* write a multidimensional string array + test_strings = [ + ["one", "n", ""], + ["two", "xxxxx", "four"], + ] + test_bytes = make_bytearray(test_strings, strlen) + v[:] = test_bytes + + if readmode == "strings": + # Test "normal" read --> string array + result = v[:] + expected = np.array(test_strings) + else: + # Test "raw" read --> byte array + with DECODE_TO_STRINGS_ON_READ.context(False): + result = v[:] + expected = test_bytes + + check_array_matching(result, expected) + + def test_read_encoding_failure(self, tempdir, readmode): + path = tempdir / f"test_read_encoding_failure_{readmode}.nc" + strlen = 10 + ds = make_encoded_dataset(path, strlen=strlen, encoding="ascii") + v = ds.variables["vxs"] + test_utf8_bytes = make_bytearray( + samples_3_nonascii, bytewidth=strlen, encoding="utf-8" + ) + v[:] = test_utf8_bytes + + if readmode == "strings": + msg = ( + "Character data in variable 'vxs' could not be decoded " + "with the 'ascii' encoding." + ) + with pytest.raises(ValueError, match=msg): + v[:] + else: + with DECODE_TO_STRINGS_ON_READ.context(False): + result = v[:] # this ought to be ok! - def test_encodings(self, encoding): - pass + assert np.all(result == test_utf8_bytes) From cf048b242fe89ceadee2cdd144354b2a17bb33fb Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Fri, 5 Dec 2025 16:26:13 +0000 Subject: [PATCH 19/43] Remove iris width control (not in this layer). 
--- .../netcdf/_bytecoding_datasets.py | 31 +++++++++---------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py index 3bdc799d7f..5ed156f3ee 100644 --- a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py +++ b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py @@ -241,23 +241,20 @@ def _get_byte_width(self) -> int | None: def _get_string_width(self): """Return the string-length defined for this variable.""" if not hasattr(self, "_strlen"): - if hasattr(self, "iris_string_width"): - strlen = self.get_ncattr("iris_string_width") - else: - # Work out the actual byte width from the parent dataset dimensions. - strlen = self._get_byte_width() - # Convert the string dimension length (i.e. bytes) to a sufficiently-long - # string width, depending on the encoding used. - encoding = self._get_encoding() or DEFAULT_READ_ENCODING - # regularise the name for comparison with recognised ones - encoding = codecs.lookup(encoding).name - if "utf-16" in encoding: - # Each char needs at least 2 bytes -- including a terminator char - strlen = (strlen // 2) - 1 - elif "utf-32" in encoding: - # Each char needs exactly 4 bytes -- including a terminator char - strlen = (strlen // 4) - 1 - # "ELSE": assume there can be (at most) as many chars as bytes + # Work out the actual byte width from the parent dataset dimensions. + strlen = self._get_byte_width() + # Convert the string dimension length (i.e. bytes) to a sufficiently-long + # string width, depending on the encoding used. 
+ encoding = self._get_encoding() or DEFAULT_READ_ENCODING + # regularise the name for comparison with recognised ones + encoding = codecs.lookup(encoding).name + if "utf-16" in encoding: + # Each char needs at least 2 bytes -- including a terminator char + strlen = (strlen // 2) - 1 + elif "utf-32" in encoding: + # Each char needs exactly 4 bytes -- including a terminator char + strlen = (strlen // 4) - 1 + # "ELSE": assume there can be (at most) as many chars as bytes # Cache this length control on the variable -- but not as a netcdf attribute self.__dict__["_strlen"] = strlen From e684d1d8b197ef9ce5d1862ea42b3cae41ec8100 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Fri, 5 Dec 2025 17:55:12 +0000 Subject: [PATCH 20/43] more notes --- .../fileformats/netcdf/encoding_tests.txt | 26 +++++++++++-------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/lib/iris/tests/unit/fileformats/netcdf/encoding_tests.txt b/lib/iris/tests/unit/fileformats/netcdf/encoding_tests.txt index 5fa021ccdd..07a0bc3bcd 100644 --- a/lib/iris/tests/unit/fileformats/netcdf/encoding_tests.txt +++ b/lib/iris/tests/unit/fileformats/netcdf/encoding_tests.txt @@ -146,25 +146,21 @@ Then, as regards the _Encoding .. TO TEST... 
========== -NOTE on length control: - - not an API thing, it's implicit from when you create a variable - - this also applies to how it loads back - - BUT here there may be scope for a control attribute : - +++ create a dataset + write char data +++ - X assign different encodings: makes no difference +++ create a dataset + write STRING data +++ - X encoding=(ascii, utf-8, utf-32, None) +++ - X withnonascii=(T, F) - - X length=(long, short, none) +XXXX - X length=(long, short, none) + ***deferred*** to layer above only -read string data - - X encoding=(ascii, utf-8, utf-32, None) - - X withnonascii=(T, F) ++++ read string data ++++ - X encoding=(ascii, utf-8, utf-32, None) ++++ - X withnonascii=(T, F) -read char data (with control) - - X different encodings: make no difference ++++ read char data (with control) ++++ - X different encodings: make no difference ==rethought== write strings @@ -185,3 +181,11 @@ write char data read char data - X encodings: don't matter +--- +NOTEs on length control: +not an API thing, it's implicit from when you create a variable +this also applies to how it loads back +BUT here there may be scope for a control attribute : + "iris_string_dim" - controls width on creation + reading back + + From a20cc45d8a0cea200d7f6fb3531b8383dfa74c10 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Mon, 19 Jan 2026 14:56:33 +0000 Subject: [PATCH 21/43] Remove temporary test code. --- .../unit/fileformats/netcdf/test_nc_dtypes.py | 96 ------------------- 1 file changed, 96 deletions(-) delete mode 100644 lib/iris/tests/unit/fileformats/netcdf/test_nc_dtypes.py diff --git a/lib/iris/tests/unit/fileformats/netcdf/test_nc_dtypes.py b/lib/iris/tests/unit/fileformats/netcdf/test_nc_dtypes.py deleted file mode 100644 index 0c5d2b279e..0000000000 --- a/lib/iris/tests/unit/fileformats/netcdf/test_nc_dtypes.py +++ /dev/null @@ -1,96 +0,0 @@ -# Copyright Iris contributors -# -# This file is part of Iris and is released under the BSD license. 
-# See LICENSE in the root of the repository for full licensing details. -"""Temporary code to confirm how various numpy dtypes are stored in a netcdf file.""" - -import netCDF4 as nc -import numpy as np -import pytest - -from iris.tests.integration.netcdf.test_chararrays import ncdump - -# types = [ -# "i1", # np.int8 -# "u1", # np.uint8 -# "S1", # np.byte_ -# "U1", # np.str_ -# "S", # multibytes -# "U", # unicode strings, with/without non-ascii content -# ] - -samples = { - "i1": [-5, 7, 35], # np.int8 - "u1": [65, 67, 90], # np.uint8 - "S1": [b"A", b"B", b"Z"], # np.byte_ - "U1": ["A", "B", "C"], # np.str_ - "S": [b"one21", b"three", b""], # multibyte - "U": ["one", "éclair", "nine"], # unicode strings -} -sample_arrays = { - type_code: np.array(values, dtype=type_code) - for type_code, values in samples.items() -} - - -@pytest.fixture(scope="module") -def tmpdir(tmp_path_factory): - return tmp_path_factory.mktemp("netcdf") - - -def create_file(array: np.ndarray, path): - with nc.Dataset(str(path), "w") as ds: - ds.createDimension("x", 3) - v = ds.createVariable("vx", array.dtype, ("x",)) - # v.set_auto_chartostring(False) - v._Encoding = "UTF-8" if array.dtype.kind == "U" else "ascii" - v[:] = array - - -def get_loadback_array(path): - with nc.Dataset(str(path), "r") as ds: - v = ds.variables["vx"] - v.set_auto_chartostring(False) - result = v[:] - return result - - -@pytest.mark.parametrize("dtype", list(samples.keys())) -def test(tmpdir, dtype): - arr = sample_arrays[dtype] - print("\n---") - print(dtype) - path = tmpdir / f"tmp_{dtype}.nc" - create_file(arr, path) - ncdump(path, "-s") - loadback_array = get_loadback_array(path) - print(f" SPEC:{dtype} SAVED-AS:{arr.dtype} RELOAD-AS:{loadback_array.dtype}") - - -# from iris.tests import env_bin_path -# NCGEN_PATHSTR = str(env_bin_path("ncgen")) -# -# -# def ncgen(cdl_path, nc_path, *args): -# """Call ncdump to print a dump of a file.""" -# args = list(args) -# if not any(arg.startswith('-k') for arg in args): 
-# args[:0] = ["-k", "nc4"] # force netcdf4 -# call_args = [NCGEN_PATHSTR] + list(args) + [str(cdl_path), '-o', str(nc_path)] -# subprocess.check_call(call_args) -# -# -# def test_uchar(tmpdir): -# arr = sample_arrays["S1"] -# path = tmpdir / f"tmp_ichar.nc" -# create_file(arr, path) -# text = ncdump(path, "-s") -# text_u = text.replace("\t", " ") -# text_u = text_u.replace(" char ", " unsigned char ") -# cdl_path = tmpdir / f"tmp_uchar.cdl" -# with open(cdl_path, "w") as f_out: -# f_out.write(text_u) -# nc_path_2 = tmpdir / f"tmp_uchar.nc" -# ncgen(cdl_path, nc_path_2) -# loadback_array = get_loadback_array(nc_path_2) -# print(f" netcdf type 'uchar' LOADS-AS:{loadback_array.dtype}") From c995a8df4bfb59b44f1dba41ea6e6a62410ec1a4 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Mon, 19 Jan 2026 15:40:26 +0000 Subject: [PATCH 22/43] Use iris categorised warnings for unknown encodings. --- .../netcdf/_bytecoding_datasets.py | 19 +++++++--- .../netcdf/test_bytecoding_datasets.py | 36 ++++++++++++++++--- 2 files changed, 46 insertions(+), 9 deletions(-) diff --git a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py index 5ed156f3ee..f1fe184729 100644 --- a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py +++ b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py @@ -48,6 +48,8 @@ import numpy as np from iris.fileformats.netcdf._thread_safe_nc import DatasetWrapper, VariableWrapper +import iris.warnings +from iris.warnings import IrisCfLoadWarning, IrisCfSaveWarning def decode_bytesarray_to_stringarray( @@ -197,7 +199,9 @@ def __setitem__(self, keys, data): # N.B. 
typically, write encoding default is "ascii" --> fails bad content if data.dtype.kind == "U": try: - encoding = self._get_encoding() or DEFAULT_WRITE_ENCODING + encoding = ( + self._get_encoding(writing=True) or DEFAULT_WRITE_ENCODING + ) strlen = self._get_byte_width() data = encode_stringarray_as_bytearray(data, encoding, strlen) except UnicodeEncodeError as err: @@ -214,7 +218,7 @@ def __setitem__(self, keys, data): def _is_chardata(self): return np.issubdtype(self.dtype, np.bytes_) - def _get_encoding(self) -> str | None: + def _get_encoding(self, writing=False) -> str | None: """Get the byte encoding defined for this variable (or None).""" result = getattr(self, "_Encoding", None) if result is not None: @@ -225,9 +229,14 @@ def _get_encoding(self) -> str | None: # For example, _Encoding = "ascii", with non-ascii content. except LookupError: # Unrecognised encoding name : handle this as just a warning - msg = f"Unknown encoding for variable {self.name!r}: {result!r}" - warnings.warn(msg, UserWarning) - + msg = ( + f"Ignoring unknown encoding for variable {self.name!r}: " + f"_Encoding = {result!r}." 
+ ) + warntype = IrisCfSaveWarning if writing else IrisCfLoadWarning + warnings.warn(msg, warntype) + # Proceed as if there is no specified encoding + result = None return result def _get_byte_width(self) -> int | None: diff --git a/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py b/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py index 5df511103f..861ec2c516 100644 --- a/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py +++ b/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py @@ -14,6 +14,7 @@ EncodedDataset, ) from iris.fileformats.netcdf._thread_safe_nc import DatasetWrapper +from iris.warnings import IrisCfLoadWarning, IrisCfSaveWarning encoding_options = [None, "ascii", "utf-8", "utf-32"] @@ -194,17 +195,29 @@ def test_multidim(self, tempdir): expected_bytes = make_bytearray(test_data, strlen) check_raw_content(path, "vyxn", expected_bytes) - def test_write_encoding_failure(self, tempdir): - path = tempdir / "test_writestrings_encoding_failure.nc" - ds = make_encoded_dataset(path, strlen=5, encoding="ascii") + @pytest.mark.parametrize("encoding", [None, "ascii"]) + def test_write_encoding_failure(self, tempdir, encoding): + path = tempdir / f"test_writestrings_encoding_{encoding}_fail.nc" + ds = make_encoded_dataset(path, strlen=5, encoding=encoding) v = ds.variables["vxs"] + encoding_name = encoding + if encoding_name == None: + encoding_name = "ascii" msg = ( "String data written to netcdf character variable 'vxs'.*" - " could not be represented in encoding 'ascii'. " + f" could not be represented in encoding '{encoding_name}'. " ) with pytest.raises(ValueError, match=msg): v[:] = samples_3_nonascii + def test_write_badencoding_ignore(self, tempdir): + path = tempdir / "test_writestrings_badencoding_ignore.nc" + ds = make_encoded_dataset(path, strlen=5, encoding="unknown") + v = ds.variables["vxs"] + msg = r"Ignoring unknown encoding for variable 'vxs': _Encoding = 'unknown'\." 
+ with pytest.warns(IrisCfSaveWarning, match=msg): + v[:] = samples_3_ascii # will work OK + def test_overlength(self, tempdir): # Check expected behaviour with over-length data path = tempdir / "test_writestrings_overlength.nc" @@ -404,3 +417,18 @@ def test_read_encoding_failure(self, tempdir, readmode): result = v[:] # this ought to be ok! assert np.all(result == test_utf8_bytes) + + def test_read_badencoding_ignore(self, tempdir): + path = tempdir / f"test_read_badencoding_ignore.nc" + strlen = 10 + ds = make_encoded_dataset(path, strlen=strlen, encoding="unknown") + v = ds.variables["vxs"] + test_utf8_bytes = make_bytearray( + samples_3_nonascii, bytewidth=strlen, encoding="utf-8" + ) + v[:] = test_utf8_bytes + + msg = r"Ignoring unknown encoding for variable 'vxs': _Encoding = 'unknown'\." + with pytest.warns(IrisCfLoadWarning, match=msg): + # raises warning but succeeds, due to default read encoding of 'utf-8' + v[:] From f118c18117c1d02c1beac968d45fda75bff88103 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Mon, 19 Jan 2026 15:54:46 +0000 Subject: [PATCH 23/43] Clarify the temporary load/save exercising tests (a bit). --- .../tests/integration/netcdf/test_chararrays.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/lib/iris/tests/integration/netcdf/test_chararrays.py b/lib/iris/tests/integration/netcdf/test_chararrays.py index 4414444733..3a4a3e1879 100644 --- a/lib/iris/tests/integration/netcdf/test_chararrays.py +++ b/lib/iris/tests/integration/netcdf/test_chararrays.py @@ -189,7 +189,7 @@ def save_dir(tmp_path_factory): # TODO: the tests don't test things properly yet, they just exercise the code and print # things for manual debugging. 
-tsts = ( +test_encodings = ( None, "ascii", "utf-8", @@ -201,8 +201,13 @@ def save_dir(tmp_path_factory): # tsts = ("utf-8", "ascii", "utf-8") -@pytest.mark.parametrize("encoding", tsts) +@pytest.mark.parametrize("encoding", test_encodings) def test_load_encodings(encoding, save_dir): + """Load exercise. + + Make a testfile with utf-8 content, variously labelled. + Load with Iris + show result (error or cubes). + """ # small change print(f"\n=========\nTesting encoding: {encoding}") filepath = save_dir / f"tmp_load_{str(encoding)}.nc" @@ -219,8 +224,13 @@ def test_load_encodings(encoding, save_dir): show_result(filepath) -@pytest.mark.parametrize("encoding", tsts) +@pytest.mark.parametrize("encoding", test_encodings) def test_save_encodings(encoding, save_dir): + """Save exercise. + + Make test-cube with non-ascii content, and various '_Encoding' labels. + Save with Iris + show result (error or ncdump). + """ cube = make_testcube( dataarray=TEST_STRINGS, coordarray=TEST_COORD_VALS, encoding_str=encoding ) From c8a27df7e2f38640d9f963d5d3fcad626f85c509 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Sat, 17 Jan 2026 18:11:44 +0000 Subject: [PATCH 24/43] Use bytecoded_datasets in nc load+save, begin fixes. 
--- lib/iris/fileformats/cf.py | 6 +- .../netcdf/_bytecoding_datasets.py | 29 +++++++- .../fileformats/netcdf/_thread_safe_nc.py | 7 +- lib/iris/fileformats/netcdf/loader.py | 4 +- lib/iris/fileformats/netcdf/saver.py | 67 +++++++++++-------- .../integration/netcdf/test_chararrays.py | 13 +++- 6 files changed, 87 insertions(+), 39 deletions(-) diff --git a/lib/iris/fileformats/cf.py b/lib/iris/fileformats/cf.py index 5abc525109..6e4b8f99e1 100644 --- a/lib/iris/fileformats/cf.py +++ b/lib/iris/fileformats/cf.py @@ -26,7 +26,7 @@ import iris.exceptions import iris.fileformats._nc_load_rules.helpers as hh -from iris.fileformats.netcdf import _thread_safe_nc +from iris.fileformats.netcdf import _bytecoding_datasets from iris.mesh.components import Connectivity import iris.util import iris.warnings @@ -1373,7 +1373,9 @@ def __init__(self, file_source, warn=False, monotonic=False): if isinstance(file_source, str): # Create from filepath : open it + own it (=close when we die). self._filename = os.path.expanduser(file_source) - self._dataset = _thread_safe_nc.DatasetWrapper(self._filename, mode="r") + self._dataset = _bytecoding_datasets.EncodedDataset( + self._filename, mode="r" + ) self._own_file = True else: # We have been passed an open dataset. 
diff --git a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py index f1fe184729..a8dfca2b21 100644 --- a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py +++ b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py @@ -47,7 +47,12 @@ import numpy as np -from iris.fileformats.netcdf._thread_safe_nc import DatasetWrapper, VariableWrapper +from iris.fileformats.netcdf._thread_safe_nc import ( + DatasetWrapper, + NetCDFDataProxy, + NetCDFWriteProxy, + VariableWrapper, +) import iris.warnings from iris.warnings import IrisCfLoadWarning, IrisCfSaveWarning @@ -133,7 +138,19 @@ def encode_stringarray_as_bytearray( result = np.zeros(element_shape + (string_dimension_length,), dtype="S1") right_pad = b"\0" * string_dimension_length for index in np.ndindex(element_shape): - bytes = data[index].encode(encoding=encoding) + string = data[index] + bytes = string.encode(encoding=encoding) + n_bytes = len(bytes) + # TODO: may want to issue warning or error if we overflow the length? + if n_bytes > string_dimension_length: + from iris.exceptions import TranslationError + + msg = ( + f"Non-ascii string {string!r} written to netcdf exceeds string " + f"dimension : {n_bytes} > {string_dimension_length}." + ) + raise TranslationError(msg) + # It's all a bit nasty ... bytes = (bytes + right_pad)[:string_dimension_length] result[index] = [bytes[i : i + 1] for i in range(string_dimension_length)] @@ -283,3 +300,11 @@ class EncodedDataset(DatasetWrapper): def set_auto_chartostring(self, onoff: bool): msg = "auto_chartostring is not supported by Iris 'EncodedDataset' type." 
raise TypeError(msg) + + +class EncodedNetCDFDataProxy(NetCDFDataProxy): + DATASET_CLASS = EncodedDataset + + +class EncodedNetCDFWriteProxy(NetCDFWriteProxy): + DATASET_CLASS = EncodedDataset diff --git a/lib/iris/fileformats/netcdf/_thread_safe_nc.py b/lib/iris/fileformats/netcdf/_thread_safe_nc.py index 46b8609bb7..cd97452dac 100644 --- a/lib/iris/fileformats/netcdf/_thread_safe_nc.py +++ b/lib/iris/fileformats/netcdf/_thread_safe_nc.py @@ -315,6 +315,7 @@ class NetCDFDataProxy: """A reference to the data payload of a single NetCDF file variable.""" __slots__ = ("shape", "dtype", "path", "variable_name", "fill_value") + DATASET_CLASS = netCDF4.Dataset def __init__(self, shape, dtype, path, variable_name, fill_value): self.shape = shape @@ -337,7 +338,7 @@ def __getitem__(self, keys): # netCDF4 library, presumably because __getitem__ gets called so many # times by Dask. Use _GLOBAL_NETCDF4_LOCK directly instead. with _GLOBAL_NETCDF4_LOCK: - dataset = netCDF4.Dataset(self.path) + dataset = self.DATASET_CLASS(self.path) try: variable = dataset.variables[self.variable_name] # Get the NetCDF variable data and slice. @@ -374,6 +375,8 @@ class NetCDFWriteProxy: TODO: could be improved with a caching scheme, but this just about works. """ + DATASET_CLASS = netCDF4.Dataset + def __init__(self, filepath, cf_var, file_write_lock): self.path = filepath self.varname = cf_var.name @@ -401,7 +404,7 @@ def __setitem__(self, keys, array_data): # investigation needed. 
for attempt in range(5): try: - dataset = netCDF4.Dataset(self.path, "r+") + dataset = self.DATASET_CLASS(self.path, "r+") break except OSError: if attempt < 4: diff --git a/lib/iris/fileformats/netcdf/loader.py b/lib/iris/fileformats/netcdf/loader.py index 219f681e67..d363e29738 100644 --- a/lib/iris/fileformats/netcdf/loader.py +++ b/lib/iris/fileformats/netcdf/loader.py @@ -36,7 +36,7 @@ import iris.coord_systems import iris.coords import iris.fileformats.cf -from iris.fileformats.netcdf import _thread_safe_nc +from iris.fileformats.netcdf import _bytecoding_datasets, _thread_safe_nc from iris.fileformats.netcdf.saver import _CF_ATTRS import iris.io import iris.util @@ -50,7 +50,7 @@ # An expected part of the public loader API, but includes thread safety # concerns so is housed in _thread_safe_nc. -NetCDFDataProxy = _thread_safe_nc.NetCDFDataProxy +NetCDFDataProxy = _bytecoding_datasets.EncodedNetCDFDataProxy class _WarnComboIgnoringBoundsLoad( diff --git a/lib/iris/fileformats/netcdf/saver.py b/lib/iris/fileformats/netcdf/saver.py index 3d7c1dee19..f832ad1e8d 100644 --- a/lib/iris/fileformats/netcdf/saver.py +++ b/lib/iris/fileformats/netcdf/saver.py @@ -14,6 +14,7 @@ """ +import codecs import collections from itertools import repeat, zip_longest import os @@ -48,7 +49,8 @@ from iris.coords import AncillaryVariable, AuxCoord, CellMeasure, DimCoord import iris.exceptions import iris.fileformats.cf -from iris.fileformats.netcdf import _dask_locks, _thread_safe_nc +from iris.fileformats.netcdf import _bytecoding_datasets as bytecoding_datasets +from iris.fileformats.netcdf import _dask_locks from iris.fileformats.netcdf._attribute_handlers import ATTRIBUTE_HANDLERS import iris.io import iris.util @@ -300,7 +302,7 @@ class VariableEmulator(typing.Protocol): shape: tuple[int, ...] 
-CFVariable = typing.Union[_thread_safe_nc.VariableWrapper, VariableEmulator] +CFVariable = typing.Union[bytecoding_datasets.VariableWrapper, VariableEmulator] class Saver: @@ -403,7 +405,7 @@ def __init__(self, filename, netcdf_format, compute=True): # Put it inside a _thread_safe_nc wrapper to ensure thread-safety. # Except if it already is one, since they forbid "re-wrapping". if not hasattr(self._dataset, "THREAD_SAFE_FLAG"): - self._dataset = _thread_safe_nc.DatasetWrapper.from_existing( + self._dataset = bytecoding_datasets.DatasetWrapper.from_existing( self._dataset ) @@ -414,7 +416,7 @@ def __init__(self, filename, netcdf_format, compute=True): # Given a filepath string/path : create a dataset from that try: self.filepath = os.path.abspath(filename) - self._dataset = _thread_safe_nc.DatasetWrapper( + self._dataset = bytecoding_datasets.EncodedDataset( self.filepath, mode="w", format=netcdf_format ) except RuntimeError: @@ -1818,7 +1820,15 @@ def _create_generic_cf_array_var( # Typically CF label variables, but also possibly ancil-vars ? string_dimension_depth = data.dtype.itemsize if data.dtype.kind == "U": - string_dimension_depth //= 4 + encoding = element.attributes.get("_Encoding", "ascii") + # TODO: this can fail -- use a sensible warning + default? + encoding = codecs.lookup(encoding).name + if encoding == "utf-32": + # UTF-32 is a special case -- always 4 exactly bytes per char, plus 4 + string_dimension_depth += 4 + else: + # generally, 4 bytes per char in numpy --> make bytewidth = string-width + string_dimension_depth //= 4 string_dimension_name = "string%d" % string_dimension_depth # Determine whether to create the string length dimension. @@ -1837,25 +1847,25 @@ def _create_generic_cf_array_var( # Create the label coordinate variable. cf_var = self._dataset.createVariable(cf_name, "|S1", element_dims) - # Convert data from an array of strings into a character array - # with an extra string-length dimension. 
- if len(element_dims) == 1: - # Scalar variable (only has string dimension). - data_first = data[0] - if is_lazy_data(data_first): - data_first = dask.compute(data_first) - data = list("%- *s" % (string_dimension_depth, data_first)) - else: - # NOTE: at present, can't do this lazily?? - orig_shape = data.shape - new_shape = orig_shape + (string_dimension_depth,) - new_data = np.zeros(new_shape, cf_var.dtype) - for index in np.ndindex(orig_shape): - index_slice = tuple(list(index) + [slice(None, None)]) - new_data[index_slice] = list( - "%- *s" % (string_dimension_depth, data[index]) - ) - data = new_data + # # Convert data from an array of strings into a character array + # # with an extra string-length dimension. + # if len(element_dims) == 1: + # # Scalar variable (only has string dimension). + # data_first = data[0] + # if is_lazy_data(data_first): + # data_first = dask.compute(data_first) + # data = list("%- *s" % (string_dimension_depth, data_first)) + # else: + # # NOTE: at present, can't do this lazily?? + # orig_shape = data.shape + # new_shape = orig_shape + (string_dimension_depth,) + # new_data = np.zeros(new_shape, cf_var.dtype) + # for index in np.ndindex(orig_shape): + # index_slice = tuple(list(index) + [slice(None, None)]) + # new_data[index_slice] = list( + # "%- *s" % (string_dimension_depth, data[index]) + # ) + # data = new_data else: # A normal (numeric) variable. # ensure a valid datatype for the file format. @@ -1899,6 +1909,10 @@ def _create_generic_cf_array_var( element, cf_var, cf_name, compression_kwargs=compression_kwargs ) + # Add names + units + # NOTE: *must* now do first, as we may need '_Encoding' set to write it ! + self._set_cf_var_attributes(cf_var, element) + # Add the data to the CF-netCDF variable. 
if not is_dataless: if packing_controls: @@ -1907,9 +1921,6 @@ def _create_generic_cf_array_var( _setncattr(cf_var, key, value) self._lazy_stream_data(data=data, cf_var=cf_var) - # Add names + units - self._set_cf_var_attributes(cf_var, element) - return cf_name def _create_cf_cell_methods(self, cube, dimension_names): @@ -2529,7 +2540,7 @@ def store( ) -> None: # Create a data-writeable object that we can stream into, which # encapsulates the file to be opened + variable to be written. - write_wrapper = _thread_safe_nc.NetCDFWriteProxy( + write_wrapper = bytecoding_datasets.EncodedNetCDFWriteProxy( self.filepath, cf_var, self.file_write_lock ) # Add to the list of delayed writes, used in delayed_completion(). diff --git a/lib/iris/tests/integration/netcdf/test_chararrays.py b/lib/iris/tests/integration/netcdf/test_chararrays.py index 3a4a3e1879..f3bba81c70 100644 --- a/lib/iris/tests/integration/netcdf/test_chararrays.py +++ b/lib/iris/tests/integration/netcdf/test_chararrays.py @@ -12,7 +12,9 @@ import iris from iris.coords import AuxCoord, DimCoord from iris.cube import Cube -from iris.fileformats.netcdf import _thread_safe_nc +from iris.fileformats.netcdf import _bytecoding_datasets + +# from iris.fileformats.netcdf import _thread_safe_nc from iris.tests import env_bin_path NX, N_STRLEN = 3, 64 @@ -22,7 +24,8 @@ # VARS_COORDS_SHARE_STRING_DIM = True VARS_COORDS_SHARE_STRING_DIM = False if VARS_COORDS_SHARE_STRING_DIM: - TEST_COORD_VALS[-1] = "Xsandwich" # makes the max coord strlen same as data one + # Fix length so that the max coord strlen will be same as data one + TEST_COORD_VALS[-1] = "Xsandwich" # Ensure all tests run with "split attrs" turned on. 
@@ -68,8 +71,12 @@ def convert_bytesarray_to_strings( # INCLUDE_NUMERIC_AUXCOORD = False +# DATASET_CLASS = _thread_safe_nc.DatasetWrapper +DATASET_CLASS = _bytecoding_datasets.EncodedDataset + + def make_testfile(filepath, chararray, coordarray, encoding_str=None): - ds = _thread_safe_nc.DatasetWrapper(filepath, "w") + ds = DATASET_CLASS(filepath, "w") try: ds.createDimension("x", NX) ds.createDimension("nstr", N_STRLEN) From c4a31a48c45da667c003aad02fc3caeead58474f Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Mon, 19 Jan 2026 16:18:29 +0000 Subject: [PATCH 25/43] Further attempt to satisfy warning cateogry checker. --- lib/iris/fileformats/netcdf/_bytecoding_datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py index a8dfca2b21..52e2fe2aa5 100644 --- a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py +++ b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py @@ -251,7 +251,7 @@ def _get_encoding(self, writing=False) -> str | None: f"_Encoding = {result!r}." ) warntype = IrisCfSaveWarning if writing else IrisCfLoadWarning - warnings.warn(msg, warntype) + warnings.warn(msg, category=warntype) # Proceed as if there is no specified encoding result = None return result From 10831d77b1c8743caa2eb8fb0baaa3294e6c4842 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Mon, 19 Jan 2026 16:41:46 +0000 Subject: [PATCH 26/43] Fix overlength error tests. 
--- .../netcdf/_bytecoding_datasets.py | 4 ++-- .../netcdf/test_bytecoding_datasets.py | 20 +++++++++++++++---- 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py index 52e2fe2aa5..a3a13f86f5 100644 --- a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py +++ b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py @@ -146,8 +146,8 @@ def encode_stringarray_as_bytearray( from iris.exceptions import TranslationError msg = ( - f"Non-ascii string {string!r} written to netcdf exceeds string " - f"dimension : {n_bytes} > {string_dimension_length}." + f"String {string!r} written to netcdf exceeds string dimension after " + f"encoding : {n_bytes} > {string_dimension_length}." ) raise TranslationError(msg) diff --git a/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py b/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py index 861ec2c516..4909d976de 100644 --- a/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py +++ b/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py @@ -9,6 +9,7 @@ import numpy as np import pytest +from iris.exceptions import TranslationError from iris.fileformats.netcdf._bytecoding_datasets import ( DECODE_TO_STRINGS_ON_READ, EncodedDataset, @@ -224,9 +225,9 @@ def test_overlength(self, tempdir): strlen = 5 ds = make_encoded_dataset(path, strlen=strlen, encoding="ascii") v = ds.variables["vxs"] - v[:] = ["1", "123456789", "two"] - expected_bytes = make_bytearray(["1", "12345", "two"], strlen) - check_raw_content(path, "vxs", expected_bytes) + msg = r"String .* written to netcdf exceeds string dimension .* : [0-9]* > 5\." 
+ with pytest.raises(TranslationError, match=msg): + v[:] = ["1", "123456789", "two"] def test_overlength_splitcoding(self, tempdir): # Check expected behaviour when non-ascii multibyte coding gets truncated @@ -234,7 +235,18 @@ def test_overlength_splitcoding(self, tempdir): strlen = 5 ds = make_encoded_dataset(path, strlen=strlen, encoding="utf-8") v = ds.variables["vxs"] - v[:] = ["1", "1234ü", "two"] + # Note: we must do the assignment as a single byte array, to avoid hitting the + # safety check for this exact problem : see previous check. + byte_arrays = [ + string.encode("utf-8")[:strlen] for string in ("1", "1234ü", "two") + ] + nd_bytes_array = np.array( + [ + [bytes[i : i + 1] if i < len(bytes) else b"\0" for i in range(strlen)] + for bytes in byte_arrays + ] + ) + v[:] = nd_bytes_array # This creates a problem: it won't read back msg = ( "Character data in variable 'vxs' could not be decoded " From 042028e481c4b5a01073c9aca8d296ad51c56895 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Mon, 19 Jan 2026 17:10:04 +0000 Subject: [PATCH 27/43] Get temporary iris load/save exercises working (todo: proper tests). --- lib/iris/fileformats/netcdf/saver.py | 6 ++++++ .../integration/netcdf/test_chararrays.py | 20 +++++++++++++++---- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/lib/iris/fileformats/netcdf/saver.py b/lib/iris/fileformats/netcdf/saver.py index f832ad1e8d..d43df538c2 100644 --- a/lib/iris/fileformats/netcdf/saver.py +++ b/lib/iris/fileformats/netcdf/saver.py @@ -1713,6 +1713,12 @@ def add_names_attrs(): if element.units.calendar: _setncattr(cf_var, "calendar", str(element.units.calendar)) + # Most attributes are dealt with later. 
+ # But _Encoding need to be defined before we can write to a character variable + if element.dtype.kind in "SU" and "_Encoding" in element.attributes: + encoding = element.attributes.pop("_Encoding") + _setncattr(cf_var, "_Encoding", encoding) + if not isinstance(element, Cube): # Add any other custom coordinate attributes. # N.B. not Cube, which has specific handling in _create_cf_data_variable diff --git a/lib/iris/tests/integration/netcdf/test_chararrays.py b/lib/iris/tests/integration/netcdf/test_chararrays.py index f3bba81c70..496867ee8a 100644 --- a/lib/iris/tests/integration/netcdf/test_chararrays.py +++ b/lib/iris/tests/integration/netcdf/test_chararrays.py @@ -218,6 +218,7 @@ def test_load_encodings(encoding, save_dir): # small change print(f"\n=========\nTesting encoding: {encoding}") filepath = save_dir / f"tmp_load_{str(encoding)}.nc" + # Actual content is always either utf-8 or utf-32 do_as = encoding if encoding != "utf-32": do_as = "utf-8" @@ -228,7 +229,14 @@ def test_load_encodings(encoding, save_dir): TEST_COORD_VALS, N_STRLEN, encoding=do_as ) make_testfile(filepath, TEST_CHARARRAY, TEST_COORDARRAY, encoding_str=encoding) - show_result(filepath) + if encoding == "ascii": + # If explicitly labelled as ascii, 'utf-8' data will fail to load back ... + msg = r"Character data .* could not be decoded with the 'ascii' encoding\." + with pytest.raises(ValueError, match=msg): + show_result(filepath) + else: + # ... 
otherwise, utf-8 data loads even without a label, as 'utf-8' default used + show_result(filepath) @pytest.mark.parametrize("encoding", test_encodings) @@ -243,10 +251,14 @@ def test_save_encodings(encoding, save_dir): ) print(cube) filepath = save_dir / f"tmp_save_{str(encoding)}.nc" - if encoding == "ascii": + if encoding in ("ascii", None): + msg = ( + "String data written to netcdf character variable 'v' " + "could not be represented in encoding 'ascii'" + ) with pytest.raises( - UnicodeEncodeError, - match="'ascii' codec can't encode character.*not in range", + ValueError, + match=msg, ): iris.save(cube, filepath) else: From 94b2b217281f085f906c441d59e335fe7b768875 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Wed, 21 Jan 2026 16:19:27 +0000 Subject: [PATCH 28/43] Put encoding information into separate converter class, for use in proxies. --- .../netcdf/_bytecoding_datasets.py | 290 +++++++++--------- .../fileformats/netcdf/_thread_safe_nc.py | 27 +- 2 files changed, 161 insertions(+), 156 deletions(-) diff --git a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py index a3a13f86f5..4559f4b78b 100644 --- a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py +++ b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py @@ -42,6 +42,7 @@ import codecs import contextlib +import dataclasses import threading import warnings @@ -80,55 +81,6 @@ def decode_bytesarray_to_stringarray( return result -# -# TODO: remove? -# this older version is "overly flexible", less efficient and not needed here. -# -def flexi_encode_stringarray_as_bytearray( - data: np.ndarray, encoding=None, string_dimension_length: int | None = None -) -> np.ndarray: - """Encode strings as bytearray. - - Note: if 'string_dimension_length' is not given (None), it is set to the longest - encoded bytes element, **OR** the dtype size, if that is greater. 
- If 'string_dimension_length' is specified, the last array - dimension is set to this and content strings are truncated or extended as required. - """ - if np.ma.isMaskedArray(data): - # netCDF4-python sees zeros as "missing" -- we don't need or want that - data = data.data - element_shape = data.shape - # Encode all the strings + see which is longest - max_length = 1 # this is a MINIMUM - i.e. not zero! - data_elements = np.zeros(element_shape, dtype=object) - for index in np.ndindex(element_shape): - data_element = data[index].encode(encoding=encoding) - element_length = len(data_element) - data_elements[index] = data_element - if element_length > max_length: - max_length = element_length - - if string_dimension_length is None: - # If the string length was not specified, it is the maximum encoded length - # (n-bytes), **or** the dtype string-length, if greater. - string_dimension_length = max_length - array_string_length = int(str(data.dtype)[2:]) # Yuck. No better public way? - if array_string_length > string_dimension_length: - string_dimension_length = array_string_length - - # We maybe *already* encoded all the strings above, but stored them in an - # object-array as we didn't yet know the fixed byte-length to convert to. 
- # Now convert to a fixed-width byte array with an extra string-length dimension - result = np.zeros(element_shape + (string_dimension_length,), dtype="S1") - right_pad = b"\0" * string_dimension_length - for index in np.ndindex(element_shape): - bytes = data_elements[index] - bytes = (bytes + right_pad)[:string_dimension_length] - result[index] = [bytes[i : i + 1] for i in range(string_dimension_length)] - - return result - - def encode_stringarray_as_bytearray( data: np.typing.ArrayLike, encoding: str, string_dimension_length: int ) -> np.ndarray: @@ -158,6 +110,114 @@ def encode_stringarray_as_bytearray( return result +@dataclasses.dataclass +class VariableEncoder: + """A record of encoding details which can apply them to variable data.""" + + varname: str # just for the error messages + dtype: np.dtype + is_chardata: bool # just a shortcut for the dtype test + read_encoding: str # *always* a valid encoding from the codecs package + write_encoding: str # *always* a valid encoding from the codecs package + n_chars_dim: int # length of associated character dimension + string_width: int # string lengths when viewing as strings (i.e. "Uxx") + + def __init__(self, cf_var): + """Get all the info from an netCDF4 variable (or similar wrapper object). + + Most importantly, we do *not* store 'cf_var' : instead we extract the + necessary information and store it in this object. + So, this object has static state + is serialisable. 
+ """ + self.varname = cf_var.name + self.dtype = cf_var.dtype + self.is_chardata = np.issubdtype(self.dtype, np.bytes_) + self.read_encoding = self._get_encoding(cf_var, writing=False) + self.write_encoding = self._get_encoding(cf_var, writing=True) + self.n_chars_dim = cf_var.group().dimensions[cf_var.dimensions[-1]].size + self.string_width = self._get_string_width(cf_var) + + @staticmethod + def _get_encoding(cf_var, writing=False) -> str: + """Get the byte encoding defined for this variable (or None).""" + result = getattr(cf_var, "_Encoding", None) + if result is not None: + try: + # Accept + normalise naming of encodings + result = codecs.lookup(result).name + # NOTE: if encoding does not suit data, errors can occur. + # For example, _Encoding = "ascii", with non-ascii content. + except LookupError: + # Unrecognised encoding name : handle this as just a warning + msg = ( + f"Ignoring unknown encoding for variable {cf_var.name!r}: " + f"_Encoding = {result!r}." + ) + warntype = IrisCfSaveWarning if writing else IrisCfLoadWarning + warnings.warn(msg, category=warntype) + # Proceed as if there is no specified encoding + result = None + + if result is None: + if writing: + result = DEFAULT_WRITE_ENCODING + else: + result = DEFAULT_READ_ENCODING + return result + + def _get_string_width(self, cf_var) -> int: + """Return the string-length defined for this variable.""" + # Work out the actual byte width from the parent dataset dimensions. + strlen = self.n_chars_dim + # Convert the string dimension length (i.e. bytes) to a sufficiently-long + # string width, depending on the (read) encoding used. 
+ encoding = self.read_encoding + if "utf-16" in encoding: + # Each char needs at least 2 bytes -- including a terminator char + strlen = (strlen // 2) - 1 + elif "utf-32" in encoding: + # Each char needs exactly 4 bytes -- including a terminator char + strlen = (strlen // 4) - 1 + # "ELSE": assume there can be (at most) as many chars as bytes + return strlen + + def decode_bytes_to_stringarray(self, data: np.ndarray) -> np.ndarray: + if self.is_chardata and DECODE_TO_STRINGS_ON_READ: + # N.B. read encoding default is UTF-8 --> a "usually safe" choice + encoding = self.read_encoding + strlen = self.string_width + try: + data = decode_bytesarray_to_stringarray(data, encoding, strlen) + except UnicodeDecodeError as err: + msg = ( + f"Character data in variable {self.varname!r} could not be decoded " + f"with the {encoding!r} encoding. This can be fixed by setting the " + "variable '_Encoding' attribute to suit the content." + ) + raise ValueError(msg) from err + + return data + + def encode_strings_as_bytearray(self, data: np.ndarray) -> np.ndarray: + if data.dtype.kind == "U": + # N.B. it is also possible to pass a byte array (dtype "S1"), + # to be written directly, without processing. + try: + # N.B. write encoding *default* is "ascii" --> fails bad content + encoding = self.write_encoding + strlen = self.n_chars_dim + data = encode_stringarray_as_bytearray(data, encoding, strlen) + except UnicodeEncodeError as err: + msg = ( + f"String data written to netcdf character variable {self.varname!r} " + f"could not be represented in encoding {self.write_encoding!r}. " + "This can be fixed by setting a suitable variable '_Encoding' " + 'attribute, e.g. ._Encoding="UTF-8".' 
+ ) + raise ValueError(msg) from err + return data + + class NetcdfStringDecodeSetting(threading.local): def __init__(self, perform_encoding: bool = True): self.set(perform_encoding) @@ -184,109 +244,24 @@ def context(self, perform_encoding: bool): class EncodedVariable(VariableWrapper): """A variable wrapper that translates variable data according to byte encodings.""" - def __getitem__(self, keys): - if self._is_chardata(): - # N.B. we never need to UNset this, as we totally control it - self._contained_instance.set_auto_chartostring(False) + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + def __getitem__(self, keys): + self._contained_instance.set_auto_chartostring(False) data = super().__getitem__(keys) - - if DECODE_TO_STRINGS_ON_READ and self._is_chardata(): - encoding = self._get_encoding() or DEFAULT_READ_ENCODING - # N.B. typically, read encoding default is UTF-8 --> a "usually safe" choice - strlen = self._get_string_width() - try: - data = decode_bytesarray_to_stringarray(data, encoding, strlen) - except UnicodeDecodeError as err: - msg = ( - f"Character data in variable {self.name!r} could not be decoded " - f"with the {encoding!r} encoding. This can be fixed by setting the " - "variable '_Encoding' attribute to suit the content." - ) - raise ValueError(msg) from err - + # Create a coding spec : redo every time in case "_Encoding" has changed + encoding_spec = VariableEncoder(self._contained_instance) + data = encoding_spec.decode_bytes_to_stringarray(data) return data def __setitem__(self, keys, data): data = np.asanyarray(data) - if self._is_chardata(): - # N.B. we never need to UNset this, as we totally control it - self._contained_instance.set_auto_chartostring(False) - - # N.B. 
typically, write encoding default is "ascii" --> fails bad content - if data.dtype.kind == "U": - try: - encoding = ( - self._get_encoding(writing=True) or DEFAULT_WRITE_ENCODING - ) - strlen = self._get_byte_width() - data = encode_stringarray_as_bytearray(data, encoding, strlen) - except UnicodeEncodeError as err: - msg = ( - f"String data written to netcdf character variable {self.name!r} " - f"could not be represented in encoding {encoding!r}. This can be " - "fixed by setting a suitable variable '_Encoding' attribute, " - 'e.g. ._Encoding="UTF-8".' - ) - raise ValueError(msg) from err - + # Create a coding spec : redo every time in case "_Encoding" has changed + encoding_spec = VariableEncoder(self._contained_instance) + data = encoding_spec.encode_strings_as_bytearray(data) super().__setitem__(keys, data) - def _is_chardata(self): - return np.issubdtype(self.dtype, np.bytes_) - - def _get_encoding(self, writing=False) -> str | None: - """Get the byte encoding defined for this variable (or None).""" - result = getattr(self, "_Encoding", None) - if result is not None: - try: - # Accept + normalise naming of encodings - result = codecs.lookup(result).name - # NOTE: if encoding does not suit data, errors can occur. - # For example, _Encoding = "ascii", with non-ascii content. - except LookupError: - # Unrecognised encoding name : handle this as just a warning - msg = ( - f"Ignoring unknown encoding for variable {self.name!r}: " - f"_Encoding = {result!r}." 
- ) - warntype = IrisCfSaveWarning if writing else IrisCfLoadWarning - warnings.warn(msg, category=warntype) - # Proceed as if there is no specified encoding - result = None - return result - - def _get_byte_width(self) -> int | None: - if not hasattr(self, "_bytewidth"): - n_bytes = self.group().dimensions[self.dimensions[-1]].size - # Cache this length control on the variable -- but not as a netcdf attribute - self.__dict__["_bytewidth"] = n_bytes - - return self.__dict__["_bytewidth"] - - def _get_string_width(self): - """Return the string-length defined for this variable.""" - if not hasattr(self, "_strlen"): - # Work out the actual byte width from the parent dataset dimensions. - strlen = self._get_byte_width() - # Convert the string dimension length (i.e. bytes) to a sufficiently-long - # string width, depending on the encoding used. - encoding = self._get_encoding() or DEFAULT_READ_ENCODING - # regularise the name for comparison with recognised ones - encoding = codecs.lookup(encoding).name - if "utf-16" in encoding: - # Each char needs at least 2 bytes -- including a terminator char - strlen = (strlen // 2) - 1 - elif "utf-32" in encoding: - # Each char needs exactly 4 bytes -- including a terminator char - strlen = (strlen // 4) - 1 - # "ELSE": assume there can be (at most) as many chars as bytes - - # Cache this length control on the variable -- but not as a netcdf attribute - self.__dict__["_strlen"] = strlen - - return self._strlen - def set_auto_chartostring(self, onoff: bool): msg = "auto_chartostring is not supported by Iris 'EncodedVariable' type." raise TypeError(msg) @@ -297,14 +272,37 @@ class EncodedDataset(DatasetWrapper): VAR_WRAPPER_CLS = EncodedVariable + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + def set_auto_chartostring(self, onoff: bool): msg = "auto_chartostring is not supported by Iris 'EncodedDataset' type." 
         raise TypeError(msg)
 
 
 class EncodedNetCDFDataProxy(NetCDFDataProxy):
-    DATASET_CLASS = EncodedDataset
+    __slots__ = NetCDFDataProxy.__slots__ + ("encoding_details",)
+
+    def __init__(self, cf_var, *args, **kwargs):
+        # When creating, also capture + record the encoding to be performed.
+        kwargs["use_byte_data"] = True
+        super().__init__(cf_var, *args, **kwargs)
+        self.encoding_details = VariableEncoder(cf_var)
+
+    def __getitem__(self, keys):
+        data = super().__getitem__(keys)
+        # Apply the optional bytes-to-strings conversion
+        data = self.encoding_details.decode_bytes_to_stringarray(data)
+        return data
 
 
 class EncodedNetCDFWriteProxy(NetCDFWriteProxy):
-    DATASET_CLASS = EncodedDataset
+    def __init__(self, filepath, cf_var, file_write_lock):
+        super().__init__(filepath, cf_var, file_write_lock)
+        self.encoding_details = VariableEncoder(cf_var)
+
+    def __setitem__(self, key, data):
+        data = np.asanyarray(data)
+        # Apply the optional strings-to-bytes conversion
+        data = self.encoding_details.encode_strings_as_bytearray(data)
+        super().__setitem__(key, data)
diff --git a/lib/iris/fileformats/netcdf/_thread_safe_nc.py b/lib/iris/fileformats/netcdf/_thread_safe_nc.py
index cd97452dac..96cee458f7 100644
--- a/lib/iris/fileformats/netcdf/_thread_safe_nc.py
+++ b/lib/iris/fileformats/netcdf/_thread_safe_nc.py
@@ -314,15 +314,22 @@ def fromcdl(cls, *args, **kwargs):
 class NetCDFDataProxy:
     """A reference to the data payload of a single NetCDF file variable."""
 
-    __slots__ = ("shape", "dtype", "path", "variable_name", "fill_value")
-    DATASET_CLASS = netCDF4.Dataset
-
-    def __init__(self, shape, dtype, path, variable_name, fill_value):
-        self.shape = shape
+    __slots__ = (
+        "shape",
+        "dtype",
+        "path",
+        "variable_name",
+        "fill_value",
+        "use_byte_data",
+    )
+
+    def __init__(self, cf_var, dtype, path, fill_value, *, use_byte_data=False):
+        self.shape = cf_var.shape
+        self.variable_name = cf_var.name
         self.dtype = dtype
         self.path = path
-        self.variable_name = variable_name
self.fill_value = fill_value + self.use_byte_data = use_byte_data @property def ndim(self): @@ -338,9 +345,11 @@ def __getitem__(self, keys): # netCDF4 library, presumably because __getitem__ gets called so many # times by Dask. Use _GLOBAL_NETCDF4_LOCK directly instead. with _GLOBAL_NETCDF4_LOCK: - dataset = self.DATASET_CLASS(self.path) + dataset = netCDF4.Dataset(self.path) try: variable = dataset.variables[self.variable_name] + if self.use_byte_data: + variable.set_auto_mask(False) # Get the NetCDF variable data and slice. var = variable[keys] finally: @@ -375,8 +384,6 @@ class NetCDFWriteProxy: TODO: could be improved with a caching scheme, but this just about works. """ - DATASET_CLASS = netCDF4.Dataset - def __init__(self, filepath, cf_var, file_write_lock): self.path = filepath self.varname = cf_var.name @@ -404,7 +411,7 @@ def __setitem__(self, keys, array_data): # investigation needed. for attempt in range(5): try: - dataset = self.DATASET_CLASS(self.path, "r+") + dataset = netCDF4.Dataset(self.path, "r+") break except OSError: if attempt < 4: From c4b793604d7128459a2a31e9f233252e547b973e Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Wed, 21 Jan 2026 18:41:40 +0000 Subject: [PATCH 29/43] First proper testing (reads working). --- lib/iris/fileformats/cf.py | 67 +++--- .../integration/netcdf/test_stringdata.py | 193 ++++++++++++++++++ 2 files changed, 227 insertions(+), 33 deletions(-) create mode 100644 lib/iris/tests/integration/netcdf/test_stringdata.py diff --git a/lib/iris/fileformats/cf.py b/lib/iris/fileformats/cf.py index 6e4b8f99e1..ced409f0c1 100644 --- a/lib/iris/fileformats/cf.py +++ b/lib/iris/fileformats/cf.py @@ -811,39 +811,40 @@ def cf_label_data(self, cf_data_var): % self.cf_name ) - label_data = self[:] - - if ma.isMaskedArray(label_data): - label_data = label_data.filled(b"\0") - - # Determine whether we have a string-valued scalar label - # i.e. a character variable that only has one dimension (the length of the string). 
- if self.ndim == 1: - label_string = b"".join(label_data).strip() - label_string = label_string.decode("utf8") - data = np.array([label_string]) - else: - # Determine the index of the string dimension. - str_dim = self.dimensions.index(str_dim_name) - - # Calculate new label data shape (without string dimension) and create payload array. - new_shape = tuple( - dim_len for i, dim_len in enumerate(self.shape) if i != str_dim - ) - string_basetype = "|U%d" - string_dtype = string_basetype % self.shape[str_dim] - data = np.empty(new_shape, dtype=string_dtype) - - for index in np.ndindex(new_shape): - # Create the slice for the label data. - if str_dim == 0: - label_index = (slice(None, None),) + index - else: - label_index = index + (slice(None, None),) - - label_string = b"".join(label_data[label_index]).strip() - label_string = label_string.decode("utf8") - data[index] = label_string + data = self[:] + # label_data = self[:] + # + # if ma.isMaskedArray(label_data): + # label_data = label_data.filled(b"\0") + # + # # Determine whether we have a string-valued scalar label + # # i.e. a character variable that only has one dimension (the length of the string). + # if self.ndim == 1: + # label_string = b"".join(label_data).strip() + # label_string = label_string.decode("utf8") + # data = np.array([label_string]) + # else: + # # Determine the index of the string dimension. + # str_dim = self.dimensions.index(str_dim_name) + # + # # Calculate new label data shape (without string dimension) and create payload array. + # new_shape = tuple( + # dim_len for i, dim_len in enumerate(self.shape) if i != str_dim + # ) + # string_basetype = "|U%d" + # string_dtype = string_basetype % self.shape[str_dim] + # data = np.empty(new_shape, dtype=string_dtype) + # + # for index in np.ndindex(new_shape): + # # Create the slice for the label data. 
+ # if str_dim == 0: + # label_index = (slice(None, None),) + index + # else: + # label_index = index + (slice(None, None),) + # + # label_string = b"".join(label_data[label_index]).strip() + # label_string = label_string.decode("utf8") + # data[index] = label_string return data diff --git a/lib/iris/tests/integration/netcdf/test_stringdata.py b/lib/iris/tests/integration/netcdf/test_stringdata.py new file mode 100644 index 0000000000..44c94ac2cc --- /dev/null +++ b/lib/iris/tests/integration/netcdf/test_stringdata.py @@ -0,0 +1,193 @@ +# Copyright Iris contributors +# +# This file is part of Iris and is released under the BSD license. +# See LICENSE in the root of the repository for full licensing details. +"""Integration tests for various uses of character/string arrays in netcdf file variables. + +This covers both the loading and saving of variables which are the content of +data-variables, auxiliary coordinates, ancillary variables and -possibly?- cell measures. +""" + +from pathlib import Path + +import numpy as np +import pytest + +import iris +from iris.fileformats.netcdf import _thread_safe_nc + +N_XDIM = 3 +N_CHARS_DIM = 64 +COORD_ON_SEPARATE_DIM = True +PERSIST_TESTFILES = "~/chararray_testfiles" + + +NO_ENCODING_STR = "" +TEST_ENCODINGS = [ + NO_ENCODING_STR, + "ascii", + "utf-8", + # "iso8859-1", # a common one-byte-per-char "codepage" type + # "utf-16", + "utf-32", +] + + +# +# Routines to convert between byte and string arrays. +# Independently defined here, to avoid relying on any code we are testing. +# +def convert_strings_to_chararray( + string_array_1d: np.ndarray, maxlen: int, encoding: str | None = None +): + # Note: this is limited to 1-D arrays of strings. + # Could generalise that if needed, but for now this makes it simpler. 
+ if encoding is None: + encoding = "ascii" + bbytes = [text.encode(encoding) for text in string_array_1d] + pad = b"\0" * maxlen + bbytes = [(x + pad)[:maxlen] for x in bbytes] + chararray = np.array([[bb[i : i + 1] for i in range(maxlen)] for bb in bbytes]) + return chararray + + +def convert_bytearray_to_strings( + byte_array, encoding="utf-8", string_length: int | None = None +): + """Convert bytes to strings. + + N.B. for now at least, we assume the string dim is **always the last one**. + """ + bytes_shape = byte_array.shape + var_shape = bytes_shape[:-1] + if string_length is None: + string_length = bytes_shape[-1] + string_dtype = f"U{string_length}" + result = np.empty(var_shape, dtype=string_dtype) + for ndindex in np.ndindex(var_shape): + element_bytes = byte_array[ndindex] + bytes = b"".join([b if b else b"\0" for b in element_bytes]) + string = bytes.decode(encoding) + result[ndindex] = string + return result + + +def make_testfile(testfile_path: Path, encoding_str: str): + """Create a test netcdf file. + + Also returns content strings (unicode or ascii versions). 
+ """ + if encoding_str == NO_ENCODING_STR: + encoding = None + else: + encoding = encoding_str + + data_is_ascii = encoding in (None, "ascii") + + if data_is_ascii: + coordvar_strings = ["mOnster", "London", "Amsterdam"] + datavar_strings = ["bun", "Eclair", "sandwich"] + else: + coordvar_strings = ["Münster", "London", "Amsterdam"] + datavar_strings = ["bun", "éclair", "sandwich"] + + coordvar_bytearray = convert_strings_to_chararray( + string_array_1d=coordvar_strings, maxlen=N_CHARS_DIM, encoding=encoding + ) + datavar_bytearray = convert_strings_to_chararray( + string_array_1d=datavar_strings, maxlen=N_CHARS_DIM, encoding=encoding + ) + + ds = _thread_safe_nc.DatasetWrapper(testfile_path, "w") + try: + ds.createDimension("x", N_XDIM) + ds.createDimension("nstr", N_CHARS_DIM) + if COORD_ON_SEPARATE_DIM: + ds.createDimension("nstr2", N_CHARS_DIM) + v_xdim = ds.createVariable("x", int, dimensions=("x")) + v_xdim[:] = np.arange(N_XDIM) + + v_co = ds.createVariable( + "v_co", + "S1", + dimensions=( + "x", + "nstr2" if COORD_ON_SEPARATE_DIM else "nstr", + ), + ) + v_co[:] = coordvar_bytearray + + if encoding is not None: + v_co._Encoding = encoding + + v_numeric = ds.createVariable( + "v_numeric", + float, + dimensions=("x",), + ) + v_numeric[:] = np.arange(N_XDIM) + + v_datavar = ds.createVariable( + "v", + "S1", + dimensions=( + "x", + "nstr", + ), + ) + v_datavar[:] = datavar_bytearray + + if encoding is not None: + v_datavar._Encoding = encoding + + v_datavar.coordinates = "v_co v_numeric" + finally: + ds.close() + + return testfile_path, coordvar_strings, datavar_strings + + +@pytest.fixture(params=TEST_ENCODINGS) +def encoding(request): + return request.param + + +class TestReadEncodings: + """Test loading of testfiles with encoded string data.""" + + @pytest.fixture() + def testdata(self, encoding, tmp_path): + """Create a suitable valid testfile, and return expected string content.""" + if PERSIST_TESTFILES: + tmp_path = Path(PERSIST_TESTFILES).expanduser() 
+ if encoding == "": + filetag = "noencoding" + else: + filetag = encoding + tempfile_path = tmp_path / f"sample_read_{filetag}.nc" + testdata = make_testfile(testfile_path=tempfile_path, encoding_str=encoding) + from iris.tests.integration.netcdf.test_chararrays import ncdump + + ncdump(tempfile_path) + yield testdata + + def assert_no_load_problems(self): + if len(iris.loading.LOAD_PROBLEMS.problems): + probs = "\n".join(str(prob) for prob in iris.loading.LOAD_PROBLEMS.problems) + assert probs == "" + + def test_valid_encodings(self, encoding, testdata): + testfile_path, coordvar_strings, datavar_strings = testdata + cube = iris.load_cube(testfile_path) + self.assert_no_load_problems() + assert cube.shape == (N_XDIM,) + + if encoding != "utf-32": + expected_string_width = N_CHARS_DIM + else: + expected_string_width = (N_CHARS_DIM // 4) - 1 + assert cube.dtype == f" Date: Fri, 23 Jan 2026 15:41:17 +0000 Subject: [PATCH 30/43] Encoded reading ~working; new ideas for switching (untested). --- .../fileformats/_nc_load_rules/helpers.py | 8 +-- lib/iris/fileformats/cf.py | 58 ++++++++++--------- .../netcdf/_bytecoding_datasets.py | 42 +++++++++++++- .../fileformats/netcdf/_thread_safe_nc.py | 2 +- lib/iris/fileformats/netcdf/loader.py | 36 ++++++++---- .../integration/netcdf/test_stringdata.py | 52 +++++++++++++---- 6 files changed, 142 insertions(+), 56 deletions(-) diff --git a/lib/iris/fileformats/_nc_load_rules/helpers.py b/lib/iris/fileformats/_nc_load_rules/helpers.py index fa63002f09..a2800dc91d 100644 --- a/lib/iris/fileformats/_nc_load_rules/helpers.py +++ b/lib/iris/fileformats/_nc_load_rules/helpers.py @@ -1644,11 +1644,11 @@ def _add_auxiliary_coordinate( # Determine the name of the dimension/s shared between the CF-netCDF data variable # and the coordinate being built. 
coord_dims = cf_coord_var.dimensions - if cf._is_str_dtype(cf_coord_var): - coord_dims = coord_dims[:-1] + # if cf._is_str_dtype(cf_coord_var): + # coord_dims = coord_dims[:-1] datavar_dims = engine.cf_var.dimensions - if cf._is_str_dtype(engine.cf_var): - datavar_dims = datavar_dims[:-1] + # if cf._is_str_dtype(engine.cf_var): + # datavar_dims = datavar_dims[:-1] common_dims = [dim for dim in coord_dims if dim in datavar_dims] data_dims = None if common_dims: diff --git a/lib/iris/fileformats/cf.py b/lib/iris/fileformats/cf.py index ced409f0c1..d32afaacb5 100644 --- a/lib/iris/fileformats/cf.py +++ b/lib/iris/fileformats/cf.py @@ -26,7 +26,7 @@ import iris.exceptions import iris.fileformats._nc_load_rules.helpers as hh -from iris.fileformats.netcdf import _bytecoding_datasets +from iris.fileformats.netcdf import _bytecoding_datasets, _thread_safe_nc from iris.mesh.components import Connectivity import iris.util import iris.warnings @@ -67,7 +67,9 @@ # NetCDF returns a different type for strings depending on Python version. def _is_str_dtype(var): - return np.issubdtype(var.dtype, np.bytes_) + # N.B. use 'datatype' not 'dtype', to "look inside" variable wrappers which + # represent 'S1' type data as 'U'. + return isinstance(var.datatype, np.dtype) and np.issubdtype(var.datatype, np.bytes_) ################################################################################ @@ -788,28 +790,28 @@ def cf_label_data(self, cf_data_var): % type(cf_data_var) ) - # Determine the name of the label string (or length) dimension by - # finding the dimension name that doesn't exist within the data dimensions. - str_dim_names = list(set(self.dimensions) - set(cf_data_var.dimensions)) - n_nondata_dims = len(str_dim_names) - - if n_nondata_dims == 0: - # *All* dims are shared with the data-variable. - # This is only ok if the data-var is *also* a string type. 
- dim_ok = _is_str_dtype(cf_data_var) - # In this case, we must just *assume* that the last dimension is "the" - # string dimension - str_dim_name = self.dimensions[-1] - else: - # If there is exactly one non-data dim, that is the one we want - dim_ok = len(str_dim_names) == 1 - (str_dim_name,) = str_dim_names - - if not dim_ok: - raise ValueError( - "Invalid string dimensions for CF-netCDF label variable %r" - % self.cf_name - ) + # # Determine the name of the label string (or length) dimension by + # # finding the dimension name that doesn't exist within the data dimensions. + # str_dim_names = list(set(self.dimensions) - set(cf_data_var.dimensions)) + # n_nondata_dims = len(str_dim_names) + # + # if n_nondata_dims == 0: + # # *All* dims are shared with the data-variable. + # # This is only ok if the data-var is *also* a string type. + # dim_ok = _is_str_dtype(cf_data_var) + # # In this case, we must just *assume* that the last dimension is "the" + # # string dimension + # str_dim_name = self.dimensions[-1] + # else: + # # If there is exactly one non-data dim, that is the one we want + # dim_ok = len(str_dim_names) == 1 + # (str_dim_name,) = str_dim_names + # + # if not dim_ok: + # raise ValueError( + # "Invalid string dimensions for CF-netCDF label variable %r" + # % self.cf_name + # ) data = self[:] # label_data = self[:] @@ -1374,9 +1376,11 @@ def __init__(self, file_source, warn=False, monotonic=False): if isinstance(file_source, str): # Create from filepath : open it + own it (=close when we die). self._filename = os.path.expanduser(file_source) - self._dataset = _bytecoding_datasets.EncodedDataset( - self._filename, mode="r" - ) + if _bytecoding_datasets.DECODE_TO_STRINGS_ON_READ: + ds_type = _bytecoding_datasets.EncodedDataset + else: + ds_type = _thread_safe_nc.DatasetWrapper + self._dataset = ds_type(self._filename, mode="r") self._own_file = True else: # We have been passed an open dataset. 
diff --git a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py index 4559f4b78b..fa64e570bb 100644 --- a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py +++ b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py @@ -182,7 +182,7 @@ def _get_string_width(self, cf_var) -> int: return strlen def decode_bytes_to_stringarray(self, data: np.ndarray) -> np.ndarray: - if self.is_chardata and DECODE_TO_STRINGS_ON_READ: + if self.is_chardata: # N.B. read encoding default is UTF-8 --> a "usually safe" choice encoding = self.read_encoding strlen = self.string_width @@ -247,6 +247,38 @@ class EncodedVariable(VariableWrapper): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) + # Override specific properties of the contained instance, making changes in the case + # that the variable contains char data, which is presented instead as strings + # with one less dimension. + + @property + def shape(self): + shape = self._contained_instance.shape + is_chardata = np.issubdtype(self._contained_instance.dtype, np.bytes_) + if is_chardata: + # Translated char data appears without the final dimension + shape = shape[:-1] # remove final dimension + return shape + + @property + def dimensions(self): + dimensions = self._contained_instance.dimensions + is_chardata = np.issubdtype(self._contained_instance.dtype, np.bytes_) + if is_chardata: + # Translated char data appears without the final dimension + dimensions = dimensions[:-1] # remove final dimension + return dimensions + + @property + def dtype(self): + dtype = self._contained_instance.dtype + is_chardata = np.issubdtype(self._contained_instance.dtype, np.bytes_) + if is_chardata: + # Create a coding spec : redo every time in case "_Encoding" has changed + encoding_spec = VariableEncoder(self._contained_instance) + dtype = np.dtype(f"U{encoding_spec.string_width}") + return dtype + def __getitem__(self, keys): 
self._contained_instance.set_auto_chartostring(False) data = super().__getitem__(keys) @@ -287,7 +319,13 @@ def __init__(self, cf_var, *args, **kwargs): # When creating, also capture + record the encoding to be performed. kwargs["use_byte_data"] = True super().__init__(cf_var, *args, **kwargs) - self.encoding_details = VariableEncoder(cf_var) + if not isinstance(cf_var, EncodedVariable): + msg = ( + f"Unexpected variable type : {type(cf_var)} of variable '{cf_var.name}'" + ": expected EncodedVariable." + ) + raise TypeError(msg) + self.encoding_details = VariableEncoder(cf_var._contained_instance) def __getitem__(self, keys): data = super().__getitem__(keys) diff --git a/lib/iris/fileformats/netcdf/_thread_safe_nc.py b/lib/iris/fileformats/netcdf/_thread_safe_nc.py index 96cee458f7..f96312cf79 100644 --- a/lib/iris/fileformats/netcdf/_thread_safe_nc.py +++ b/lib/iris/fileformats/netcdf/_thread_safe_nc.py @@ -349,7 +349,7 @@ def __getitem__(self, keys): try: variable = dataset.variables[self.variable_name] if self.use_byte_data: - variable.set_auto_mask(False) + variable.set_auto_chartostring(False) # Get the NetCDF variable data and slice. var = variable[keys] finally: diff --git a/lib/iris/fileformats/netcdf/loader.py b/lib/iris/fileformats/netcdf/loader.py index d363e29738..9607b393d9 100644 --- a/lib/iris/fileformats/netcdf/loader.py +++ b/lib/iris/fileformats/netcdf/loader.py @@ -50,6 +50,10 @@ # An expected part of the public loader API, but includes thread safety # concerns so is housed in _thread_safe_nc. +# NOTE: this is the *default*, as required for public legacy api +# - in practice, when creating our proxies we dynamically choose between this and +# :class:`_thread_safe_nc.DatasetWrapper`, depending on +# :data:`_bytecoding_datasets.DECODE_TO_STRINGS_ON_READ` NetCDFDataProxy = _bytecoding_datasets.EncodedNetCDFDataProxy @@ -279,7 +283,7 @@ def _get_cf_var_data(cf_var): # correct dtype. 
Note: this is not an issue for masked arrays, # only masked scalar values. if result is np.ma.masked: - result = np.ma.masked_all(1, dtype=cf_var.datatype) + result = np.ma.masked_all(1, dtype=cf_var.dtype) else: # Get lazy chunked data out of a cf variable. # Creates Dask wrappers around data arrays for any cube components which @@ -289,15 +293,27 @@ def _get_cf_var_data(cf_var): # Make a data-proxy that mimics array access and can fetch from the file. # Note: Special handling needed for "variable length string" types which # return a dtype of `str`, rather than a numpy type; use `S1` in this case. - fill_dtype = "S1" if cf_var.dtype is str else cf_var.dtype.str[1:] - fill_value = getattr( - cf_var.cf_data, - "_FillValue", - _thread_safe_nc.default_fillvals[fill_dtype], - ) - proxy = NetCDFDataProxy( - cf_var.shape, dtype, cf_var.filename, cf_var.cf_name, fill_value - ) + if cf_var.dtype.kind == "U": + # Special handling for "string variables". + fill_value = "" + else: + fill_dtype = "S1" if cf_var.dtype is str else cf_var.dtype.str[1:] + fill_value = getattr( + cf_var.cf_data, + "_FillValue", + _thread_safe_nc.default_fillvals[fill_dtype], + ) + + # Switch type of proxy, based on type of variable. + # It is done this way, instead of using an instance variable, because the + # limited nature of the wrappers makes a stateful choice awkward, + # e.g. especially, "variable.group()" is *not* the parent DatasetWrapper. + if isinstance(cf_var.cf_data, _bytecoding_datasets.EncodedVariable): + proxy_class = _bytecoding_datasets.EncodedNetCDFDataProxy + else: + proxy_class = _thread_safe_nc.NetCDFDataProxy + + proxy = proxy_class(cf_var.cf_data, dtype, cf_var.filename, fill_value) # Get the chunking specified for the variable : this is either a shape, or # maybe the string "contiguous". 
if CHUNK_CONTROL.mode is ChunkControl.Modes.AS_DASK: diff --git a/lib/iris/tests/integration/netcdf/test_stringdata.py b/lib/iris/tests/integration/netcdf/test_stringdata.py index 44c94ac2cc..5831f85b41 100644 --- a/lib/iris/tests/integration/netcdf/test_stringdata.py +++ b/lib/iris/tests/integration/netcdf/test_stringdata.py @@ -8,6 +8,7 @@ data-variables, auxiliary coordinates, ancillary variables and -possibly?- cell measures. """ +from dataclasses import dataclass from pathlib import Path import numpy as np @@ -16,9 +17,12 @@ import iris from iris.fileformats.netcdf import _thread_safe_nc +iris.fileformats.netcdf.loader._LAZYVAR_MIN_BYTES = 0 + N_XDIM = 3 N_CHARS_DIM = 64 -COORD_ON_SEPARATE_DIM = True +# COORD_ON_SEPARATE_DIM = True +COORD_ON_SEPARATE_DIM = False PERSIST_TESTFILES = "~/chararray_testfiles" @@ -72,6 +76,14 @@ def convert_bytearray_to_strings( return result +@dataclass +class SamplefileDetails: + filepath: Path + datavar_data: np.ndarray + stringcoord_data: np.ndarray + numericcoord_data: np.ndarray + + def make_testfile(testfile_path: Path, encoding_str: str): """Create a test netcdf file. 
@@ -84,6 +96,7 @@ def make_testfile(testfile_path: Path, encoding_str: str): data_is_ascii = encoding in (None, "ascii") + numeric_values = np.arange(3.0) if data_is_ascii: coordvar_strings = ["mOnster", "London", "Amsterdam"] datavar_strings = ["bun", "Eclair", "sandwich"] @@ -125,7 +138,7 @@ def make_testfile(testfile_path: Path, encoding_str: str): float, dimensions=("x",), ) - v_numeric[:] = np.arange(N_XDIM) + v_numeric[:] = numeric_values v_datavar = ds.createVariable( "v", @@ -144,7 +157,12 @@ def make_testfile(testfile_path: Path, encoding_str: str): finally: ds.close() - return testfile_path, coordvar_strings, datavar_strings + return SamplefileDetails( + filepath=testfile_path, + datavar_data=datavar_strings, + stringcoord_data=coordvar_strings, + numericcoord_data=numeric_values, + ) @pytest.fixture(params=TEST_ENCODINGS) @@ -152,6 +170,10 @@ def encoding(request): return request.param +def load_problems_list(): + return [str(prob) for prob in iris.loading.LOAD_PROBLEMS.problems] + + class TestReadEncodings: """Test loading of testfiles with encoded string data.""" @@ -168,18 +190,19 @@ def testdata(self, encoding, tmp_path): testdata = make_testfile(testfile_path=tempfile_path, encoding_str=encoding) from iris.tests.integration.netcdf.test_chararrays import ncdump + # TODO: temporary for debug -- TO REMOVE ncdump(tempfile_path) yield testdata - def assert_no_load_problems(self): - if len(iris.loading.LOAD_PROBLEMS.problems): - probs = "\n".join(str(prob) for prob in iris.loading.LOAD_PROBLEMS.problems) - assert probs == "" - - def test_valid_encodings(self, encoding, testdata): - testfile_path, coordvar_strings, datavar_strings = testdata + def test_valid_encodings(self, encoding, testdata: SamplefileDetails): + testfile_path, datavar_strings, coordvar_strings, numeric_data = ( + testdata.filepath, + testdata.datavar_data, + testdata.stringcoord_data, + testdata.numericcoord_data, + ) cube = iris.load_cube(testfile_path) - self.assert_no_load_problems() 
+ assert load_problems_list() == [] assert cube.shape == (N_XDIM,) if encoding != "utf-32": @@ -187,7 +210,12 @@ def test_valid_encodings(self, encoding, testdata): else: expected_string_width = (N_CHARS_DIM // 4) - 1 assert cube.dtype == f" Date: Tue, 27 Jan 2026 11:37:10 +0000 Subject: [PATCH 31/43] Check loads when coords do/not share a string dim with data. --- .../integration/netcdf/test_stringdata.py | 47 +++++++++++++++---- 1 file changed, 37 insertions(+), 10 deletions(-) diff --git a/lib/iris/tests/integration/netcdf/test_stringdata.py b/lib/iris/tests/integration/netcdf/test_stringdata.py index 5831f85b41..fc5bf5ae3e 100644 --- a/lib/iris/tests/integration/netcdf/test_stringdata.py +++ b/lib/iris/tests/integration/netcdf/test_stringdata.py @@ -8,6 +8,7 @@ data-variables, auxiliary coordinates, ancillary variables and -possibly?- cell measures. """ +from contextlib import contextmanager from dataclasses import dataclass from pathlib import Path @@ -17,12 +18,18 @@ import iris from iris.fileformats.netcdf import _thread_safe_nc -iris.fileformats.netcdf.loader._LAZYVAR_MIN_BYTES = 0 + +@pytest.fixture(scope="module") +def all_lazy_auxcoords(): + """Ensure that *all* aux-coords are loaded lazily, even really small ones.""" + old_minlazybytes = iris.fileformats.netcdf.loader._LAZYVAR_MIN_BYTES + iris.fileformats.netcdf.loader._LAZYVAR_MIN_BYTES = 0 + yield + iris.fileformats.netcdf.loader._LAZYVAR_MIN_BYTES = old_minlazybytes + N_XDIM = 3 N_CHARS_DIM = 64 -# COORD_ON_SEPARATE_DIM = True -COORD_ON_SEPARATE_DIM = False PERSIST_TESTFILES = "~/chararray_testfiles" @@ -78,16 +85,22 @@ def convert_bytearray_to_strings( @dataclass class SamplefileDetails: + """Convenience container for information about a sample file.""" + filepath: Path datavar_data: np.ndarray stringcoord_data: np.ndarray numericcoord_data: np.ndarray -def make_testfile(testfile_path: Path, encoding_str: str): +def make_testfile( + testfile_path: Path, + encoding_str: str, + coords_on_separate_dim: 
bool, +) -> SamplefileDetails: """Create a test netcdf file. - Also returns content strings (unicode or ascii versions). + Also returns content information for checking loaded results. """ if encoding_str == NO_ENCODING_STR: encoding = None @@ -115,7 +128,7 @@ def make_testfile(testfile_path: Path, encoding_str: str): try: ds.createDimension("x", N_XDIM) ds.createDimension("nstr", N_CHARS_DIM) - if COORD_ON_SEPARATE_DIM: + if coords_on_separate_dim: ds.createDimension("nstr2", N_CHARS_DIM) v_xdim = ds.createVariable("x", int, dimensions=("x")) v_xdim[:] = np.arange(N_XDIM) @@ -125,7 +138,7 @@ def make_testfile(testfile_path: Path, encoding_str: str): "S1", dimensions=( "x", - "nstr2" if COORD_ON_SEPARATE_DIM else "nstr", + "nstr2" if coords_on_separate_dim else "nstr", ), ) v_co[:] = coordvar_bytearray @@ -177,8 +190,17 @@ def load_problems_list(): class TestReadEncodings: """Test loading of testfiles with encoded string data.""" + @pytest.fixture(params=["coordsSameDim", "coordsOwnDim"]) + def use_separate_dims(self, request): + yield request.param == "coordsOwnDim" + @pytest.fixture() - def testdata(self, encoding, tmp_path): + def testdata( + self, + encoding, + tmp_path, + use_separate_dims, + ): """Create a suitable valid testfile, and return expected string content.""" if PERSIST_TESTFILES: tmp_path = Path(PERSIST_TESTFILES).expanduser() @@ -186,8 +208,13 @@ def testdata(self, encoding, tmp_path): filetag = "noencoding" else: filetag = encoding - tempfile_path = tmp_path / f"sample_read_{filetag}.nc" - testdata = make_testfile(testfile_path=tempfile_path, encoding_str=encoding) + dimtag = "diffdims" if use_separate_dims else "samedims" + tempfile_path = tmp_path / f"sample_read_{filetag}_{dimtag}.nc" + testdata = make_testfile( + testfile_path=tempfile_path, + encoding_str=encoding, + coords_on_separate_dim=use_separate_dims, + ) from iris.tests.integration.netcdf.test_chararrays import ncdump # TODO: temporary for debug -- TO REMOVE From 
9bdeb5d5012fa19a4eb5ee50a782694163db84cf Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Tue, 27 Jan 2026 15:32:36 +0000 Subject: [PATCH 32/43] Fix nondecoded reference loads in test_byecoded_datasets. --- .../netcdf/test_bytecoding_datasets.py | 30 ++++++++++++------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py b/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py index 4909d976de..f16097bef3 100644 --- a/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py +++ b/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py @@ -304,6 +304,14 @@ class TestRead: def readmode(self, request): return request.param + def undecoded_testvar(self, ds_encoded, varname: str): + path = ds_encoded.filepath() + ds_encoded.close() + ds = DatasetWrapper(path) + v = ds.variables[varname] + v.set_auto_chartostring(False) + return v + def test_encodings(self, encoding, tempdir, readmode): # Create a dataset with the variable path = tempdir / f"test_read_encodings_{encoding!s}_{readmode}.nc" @@ -337,9 +345,9 @@ def test_encodings(self, encoding, tempdir, readmode): assert np.all(truncated_result == result) result = truncated_result else: - # Test "raw" read --> byte array - with DECODE_TO_STRINGS_ON_READ.context(False): - result = v[:] + # Close and re-open as "regular" dataset -- just to check the raw content + v = self.undecoded_testvar(ds_encoded, "vxs") + result = v[:] expected = write_bytes check_array_matching(result, expected) @@ -364,8 +372,8 @@ def test_scalar(self, tempdir, readmode): expected = np.array(data_string) else: # Test "raw" read --> byte array - with DECODE_TO_STRINGS_ON_READ.context(False): - result = v[:] + v = self.undecoded_testvar(ds_encoded, "v0_scalar") + result = v[:] expected = data_bytes check_array_matching(result, expected) @@ -401,8 +409,8 @@ def test_multidim(self, tempdir, readmode): expected = np.array(test_strings) else: # 
Test "raw" read --> byte array - with DECODE_TO_STRINGS_ON_READ.context(False): - result = v[:] + v = self.undecoded_testvar(ds_encoded, "vyxn") + result = v[:] expected = test_bytes check_array_matching(result, expected) @@ -410,8 +418,8 @@ def test_multidim(self, tempdir, readmode): def test_read_encoding_failure(self, tempdir, readmode): path = tempdir / f"test_read_encoding_failure_{readmode}.nc" strlen = 10 - ds = make_encoded_dataset(path, strlen=strlen, encoding="ascii") - v = ds.variables["vxs"] + ds_encoded = make_encoded_dataset(path, strlen=strlen, encoding="ascii") + v = ds_encoded.variables["vxs"] test_utf8_bytes = make_bytearray( samples_3_nonascii, bytewidth=strlen, encoding="utf-8" ) @@ -425,8 +433,8 @@ def test_read_encoding_failure(self, tempdir, readmode): with pytest.raises(ValueError, match=msg): v[:] else: - with DECODE_TO_STRINGS_ON_READ.context(False): - result = v[:] # this ought to be ok! + v = self.undecoded_testvar(ds_encoded, "vxs") + result = v[:] # this ought to be ok! assert np.all(result == test_utf8_bytes) From 54d7743a5d1cfc1b12e07865c127bc1407b9b4db Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Tue, 27 Jan 2026 18:49:23 +0000 Subject: [PATCH 33/43] Test writing of string data: various encodings, from strings or bytes. --- .../integration/netcdf/test_stringdata.py | 165 ++++++++++++++++-- 1 file changed, 155 insertions(+), 10 deletions(-) diff --git a/lib/iris/tests/integration/netcdf/test_stringdata.py b/lib/iris/tests/integration/netcdf/test_stringdata.py index fc5bf5ae3e..ed6fa576df 100644 --- a/lib/iris/tests/integration/netcdf/test_stringdata.py +++ b/lib/iris/tests/integration/netcdf/test_stringdata.py @@ -8,14 +8,17 @@ data-variables, auxiliary coordinates, ancillary variables and -possibly?- cell measures. 
""" -from contextlib import contextmanager from dataclasses import dataclass from pathlib import Path +from typing import Iterable import numpy as np +from numpy.typing import ArrayLike import pytest import iris +from iris.coords import AuxCoord, DimCoord +from iris.cube import Cube from iris.fileformats.netcdf import _thread_safe_nc @@ -49,8 +52,8 @@ def all_lazy_auxcoords(): # Independently defined here, to avoid relying on any code we are testing. # def convert_strings_to_chararray( - string_array_1d: np.ndarray, maxlen: int, encoding: str | None = None -): + string_array_1d: ArrayLike, maxlen: int, encoding: str | None = None +) -> np.ndarray: # Note: this is limited to 1-D arrays of strings. # Could generalise that if needed, but for now this makes it simpler. if encoding is None: @@ -63,12 +66,13 @@ def convert_strings_to_chararray( def convert_bytearray_to_strings( - byte_array, encoding="utf-8", string_length: int | None = None -): + byte_array: ArrayLike, encoding: str = "utf-8", string_length: int | None = None +) -> np.ndarray: """Convert bytes to strings. N.B. for now at least, we assume the string dim is **always the last one**. 
""" + byte_array = np.asanyarray(byte_array) bytes_shape = byte_array.shape var_shape = bytes_shape[:-1] if string_length is None: @@ -88,9 +92,9 @@ class SamplefileDetails: """Convenience container for information about a sample file.""" filepath: Path - datavar_data: np.ndarray - stringcoord_data: np.ndarray - numericcoord_data: np.ndarray + datavar_data: ArrayLike + stringcoord_data: ArrayLike + numericcoord_data: ArrayLike def make_testfile( @@ -200,7 +204,7 @@ def testdata( encoding, tmp_path, use_separate_dims, - ): + ) -> Iterable[SamplefileDetails]: """Create a suitable valid testfile, and return expected string content.""" if PERSIST_TESTFILES: tmp_path = Path(PERSIST_TESTFILES).expanduser() @@ -218,7 +222,7 @@ def testdata( from iris.tests.integration.netcdf.test_chararrays import ncdump # TODO: temporary for debug -- TO REMOVE - ncdump(tempfile_path) + ncdump(str(tempfile_path)) yield testdata def test_valid_encodings(self, encoding, testdata: SamplefileDetails): @@ -246,3 +250,144 @@ def test_valid_encodings(self, encoding, testdata: SamplefileDetails): coord_var_2 = cube.coord("v_numeric") assert coord_var_2.dtype == np.float64 assert np.all(coord_var_2.points == numeric_data) + + +@pytest.fixture(params=["stringdata", "bytedata"]) +def as_bytes(request): + yield request.param == "bytedata" + + +@dataclass +class SampleCubeDetails: + cube: Cube + datavar_data: np.ndarray + stringcoord_data: np.ndarray + save_path: str | Path | None = None + + +def make_testcube( + encoding_str: str | None = None, + byte_data: bool = False, +) -> SampleCubeDetails: + data_is_ascii = encoding_str in (NO_ENCODING_STR, "ascii") + + numeric_values = np.arange(3.0) + if data_is_ascii: + coordvar_strings = ["mOnster", "London", "Amsterdam"] + datavar_strings = ["bun", "Eclair", "sandwich"] + else: + coordvar_strings = ["Münster", "London", "Amsterdam"] + datavar_strings = ["bun", "éclair", "sandwich"] + + if not byte_data: + charlen = N_CHARS_DIM + if encoding_str == 
"utf-32": + charlen = charlen // 4 - 1 + strings_dtype = np.dtype(f"U{charlen}") + coordvar_array = np.array(coordvar_strings, dtype=strings_dtype) + datavar_array = np.array(datavar_strings, dtype=strings_dtype) + else: + write_encoding = encoding_str + if write_encoding == NO_ENCODING_STR: + write_encoding = "ascii" + coordvar_array = convert_strings_to_chararray( + coordvar_strings, maxlen=N_CHARS_DIM, encoding=write_encoding + ) + datavar_array = convert_strings_to_chararray( + datavar_strings, maxlen=N_CHARS_DIM, encoding=write_encoding + ) + + cube = Cube(datavar_array, var_name="v") + cube.add_dim_coord(DimCoord(np.arange(N_XDIM), var_name="x"), 0) + if encoding_str != NO_ENCODING_STR: + cube.attributes["_Encoding"] = encoding_str + co_x = AuxCoord(coordvar_array, var_name="v_co") + if encoding_str != NO_ENCODING_STR: + co_x.attributes["_Encoding"] = encoding_str + co_dims = (0, 1) if byte_data else (0,) + cube.add_aux_coord(co_x, co_dims) + + result = SampleCubeDetails( + cube=cube, + datavar_data=datavar_array, + stringcoord_data=coordvar_array, + ) + return result + + +class TestWriteEncodings: + """Test saving of testfiles with encoded string data. + + To avoid circularity, we generate and save *cube* data. + """ + + @pytest.fixture(params=["dataAsStrings", "dataAsBytes"]) + def write_bytes(self, request): + yield request.param == "dataAsBytes" + + @pytest.fixture() + def testpath(self, encoding, write_bytes, tmp_path): + """Create a suitable test cube, with either string or byte content.""" + if PERSIST_TESTFILES: + tmp_path = Path(PERSIST_TESTFILES).expanduser() + if encoding == "": + filetag = "noencoding" + else: + filetag = encoding + datatag = "writebytes" if write_bytes else "writestrings" + tempfile_path = tmp_path / f"sample_write_{filetag}_{datatag}.nc" + yield tempfile_path + + @pytest.fixture() + def testdata(self, testpath, encoding, write_bytes): + """Create a suitable test cube + save to a file. 
+ + Apply the given encoding to both coord and cube data. + Form the data as bytes, or as strings, depending on 'write_bytes'.' + """ + cube_info = make_testcube(encoding_str=encoding, byte_data=write_bytes) + cube_info.save_path = testpath + cube = cube_info.cube + iris.save(cube, testpath) + yield cube_info + + def test_valid_encodings(self, encoding, testdata, write_bytes): + cube_info = testdata + cube, path = cube_info.cube, cube_info.save_path + # TODO: not testing the "byte read/write" yet + # Make a quick check for cube equality : but the presentation depends on the read mode + # with DECODE_TO_STRINGS_ON_READ.context(not write_bytes): + # read_cube = iris.load_cube(path) + # assert read_cube == cube + + # N.B. file content should not depend on whether bytes or strings were written + vararray, coordarray = cube_info.datavar_data, cube_info.stringcoord_data + ds = _thread_safe_nc.DatasetWrapper(path) + ds.set_auto_chartostring(False) + v_main = ds.variables["v"] + v_co = ds.variables["v_co"] + assert v_main.shape == (N_XDIM, N_CHARS_DIM) + assert v_co.shape == (N_XDIM, N_CHARS_DIM) + assert v_main.dtype == " Date: Wed, 28 Jan 2026 14:39:59 +0000 Subject: [PATCH 34/43] Fix write proxy; tmp_path in stringdata tests; tidy stringdata tests. 
--- .../netcdf/_bytecoding_datasets.py | 4 +- .../integration/netcdf/test_stringdata.py | 54 ++++++++++++------- 2 files changed, 36 insertions(+), 22 deletions(-) diff --git a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py index fa64e570bb..59ad639634 100644 --- a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py +++ b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py @@ -336,11 +336,11 @@ def __getitem__(self, keys): class EncodedNetCDFWriteProxy(NetCDFWriteProxy): def __init__(self, filepath, cf_var, file_write_lock): - super.__init__(filepath, cf_var, file_write_lock) + super().__init__(filepath, cf_var, file_write_lock) self.encoding_details = VariableEncoder(cf_var) def __setitem__(self, key, data): data = np.asanyarray(data) # Apply the optional strings-to-bytes conversion data = self.encoding_details.encode_strings_as_bytearray(data) - super.__setitem__(key, data) + super().__setitem__(key, data) diff --git a/lib/iris/tests/integration/netcdf/test_stringdata.py b/lib/iris/tests/integration/netcdf/test_stringdata.py index ed6fa576df..bc308d474c 100644 --- a/lib/iris/tests/integration/netcdf/test_stringdata.py +++ b/lib/iris/tests/integration/netcdf/test_stringdata.py @@ -33,8 +33,9 @@ def all_lazy_auxcoords(): N_XDIM = 3 N_CHARS_DIM = 64 -PERSIST_TESTFILES = "~/chararray_testfiles" - +# TODO: remove (debug) +# PERSIST_TESTFILES: str | None = "~/chararray_testfiles" +PERSIST_TESTFILES: str | None = None NO_ENCODING_STR = "" TEST_ENCODINGS = [ @@ -199,38 +200,51 @@ def use_separate_dims(self, request): yield request.param == "coordsOwnDim" @pytest.fixture() - def testdata( + def readtest_path( self, encoding, tmp_path, use_separate_dims, ) -> Iterable[SamplefileDetails]: """Create a suitable valid testfile, and return expected string content.""" - if PERSIST_TESTFILES: - tmp_path = Path(PERSIST_TESTFILES).expanduser() + match PERSIST_TESTFILES: + case str(): + tmp_path = 
Path(PERSIST_TESTFILES).expanduser() + case _: + pass if encoding == "": filetag = "noencoding" else: filetag = encoding dimtag = "diffdims" if use_separate_dims else "samedims" tempfile_path = tmp_path / f"sample_read_{filetag}_{dimtag}.nc" + yield tempfile_path + + @pytest.fixture() + def readtest_data( + self, + encoding, + readtest_path, + use_separate_dims, + ) -> Iterable[SamplefileDetails]: + """Create a suitable valid testfile, and return expected string content.""" testdata = make_testfile( - testfile_path=tempfile_path, + testfile_path=readtest_path, encoding_str=encoding, coords_on_separate_dim=use_separate_dims, ) - from iris.tests.integration.netcdf.test_chararrays import ncdump - # TODO: temporary for debug -- TO REMOVE - ncdump(str(tempfile_path)) + # # TODO: temporary for debug -- TO REMOVE + # from iris.tests.integration.netcdf.test_chararrays import ncdump + # ncdump(str(tempfile_path)) yield testdata - def test_valid_encodings(self, encoding, testdata: SamplefileDetails): + def test_valid_encodings(self, encoding, readtest_data: SamplefileDetails): testfile_path, datavar_strings, coordvar_strings, numeric_data = ( - testdata.filepath, - testdata.datavar_data, - testdata.stringcoord_data, - testdata.numericcoord_data, + readtest_data.filepath, + readtest_data.datavar_data, + readtest_data.stringcoord_data, + readtest_data.numericcoord_data, ) cube = iris.load_cube(testfile_path) assert load_problems_list() == [] @@ -326,7 +340,7 @@ def write_bytes(self, request): yield request.param == "dataAsBytes" @pytest.fixture() - def testpath(self, encoding, write_bytes, tmp_path): + def writetest_path(self, encoding, write_bytes, tmp_path): """Create a suitable test cube, with either string or byte content.""" if PERSIST_TESTFILES: tmp_path = Path(PERSIST_TESTFILES).expanduser() @@ -339,20 +353,20 @@ def testpath(self, encoding, write_bytes, tmp_path): yield tempfile_path @pytest.fixture() - def testdata(self, testpath, encoding, write_bytes): + def 
writetest_data(self, writetest_path, encoding, write_bytes): """Create a suitable test cube + save to a file. Apply the given encoding to both coord and cube data. Form the data as bytes, or as strings, depending on 'write_bytes'.' """ cube_info = make_testcube(encoding_str=encoding, byte_data=write_bytes) - cube_info.save_path = testpath + cube_info.save_path = writetest_path cube = cube_info.cube - iris.save(cube, testpath) + iris.save(cube, writetest_path) yield cube_info - def test_valid_encodings(self, encoding, testdata, write_bytes): - cube_info = testdata + def test_valid_encodings(self, encoding, writetest_data, write_bytes): + cube_info = writetest_data cube, path = cube_info.cube, cube_info.save_path # TODO: not testing the "byte read/write" yet # Make a quick check for cube equality : but the presentation depends on the read mode From cf9594b2110a1fdae4f3462e18399b899ef191b4 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Wed, 28 Jan 2026 14:58:42 +0000 Subject: [PATCH 35/43] Fix for non-string data. 
--- lib/iris/fileformats/netcdf/_bytecoding_datasets.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py index 59ad639634..22a9011eec 100644 --- a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py +++ b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py @@ -132,10 +132,11 @@ def __init__(self, cf_var): self.varname = cf_var.name self.dtype = cf_var.dtype self.is_chardata = np.issubdtype(self.dtype, np.bytes_) - self.read_encoding = self._get_encoding(cf_var, writing=False) - self.write_encoding = self._get_encoding(cf_var, writing=True) - self.n_chars_dim = cf_var.group().dimensions[cf_var.dimensions[-1]].size - self.string_width = self._get_string_width(cf_var) + if self.is_chardata: + self.read_encoding = self._get_encoding(cf_var, writing=False) + self.write_encoding = self._get_encoding(cf_var, writing=True) + self.n_chars_dim = cf_var.group().dimensions[cf_var.dimensions[-1]].size + self.string_width = self._get_string_width(cf_var) @staticmethod def _get_encoding(cf_var, writing=False) -> str: @@ -199,7 +200,7 @@ def decode_bytes_to_stringarray(self, data: np.ndarray) -> np.ndarray: return data def encode_strings_as_bytearray(self, data: np.ndarray) -> np.ndarray: - if data.dtype.kind == "U": + if self.is_chardata and data.dtype.kind == "U": # N.B. it is also possible to pass a byte array (dtype "S1"), # to be written directly, without processing. try: From ef11375e99ccfff6bf780cdef732a98e2a3ffc2a Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Wed, 28 Jan 2026 15:15:29 +0000 Subject: [PATCH 36/43] Pre-clear load problems. 
--- lib/iris/tests/integration/netcdf/test_stringdata.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/lib/iris/tests/integration/netcdf/test_stringdata.py b/lib/iris/tests/integration/netcdf/test_stringdata.py index bc308d474c..5050152042 100644 --- a/lib/iris/tests/integration/netcdf/test_stringdata.py +++ b/lib/iris/tests/integration/netcdf/test_stringdata.py @@ -195,6 +195,11 @@ def load_problems_list(): class TestReadEncodings: """Test loading of testfiles with encoded string data.""" + @pytest.fixture(autouse=True) + def _clear_load_problems(self): + iris.loading.LOAD_PROBLEMS.reset() + yield + @pytest.fixture(params=["coordsSameDim", "coordsOwnDim"]) def use_separate_dims(self, request): yield request.param == "coordsOwnDim" From 2dbdcba0688036e3462962b2bbaf2c50bfa5b2dc Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Fri, 27 Feb 2026 16:46:23 +0000 Subject: [PATCH 37/43] Fix mock patches. --- .../fileformats/netcdf/saver/test_Saver.py | 55 ++++++++++++------- .../saver/test_Saver__lazy_stream_data.py | 2 +- .../netcdf/saver/test_Saver__ugrid.py | 8 +-- 3 files changed, 39 insertions(+), 26 deletions(-) diff --git a/lib/iris/tests/unit/fileformats/netcdf/saver/test_Saver.py b/lib/iris/tests/unit/fileformats/netcdf/saver/test_Saver.py index 0eb12d794c..374cb4815e 100644 --- a/lib/iris/tests/unit/fileformats/netcdf/saver/test_Saver.py +++ b/lib/iris/tests/unit/fileformats/netcdf/saver/test_Saver.py @@ -35,7 +35,8 @@ ) from iris.coords import AncillaryVariable, AuxCoord, DimCoord from iris.cube import Cube -from iris.fileformats.netcdf import Saver, _thread_safe_nc +from iris.fileformats.netcdf import Saver +from iris.fileformats.netcdf import _bytecoding_datasets as ds_wrappers from iris.tests._shared_utils import assert_CDL import iris.tests.stock as stock @@ -219,7 +220,7 @@ def test_big_endian(self): def test_zlib(self): cube = self._simple_cube(">f4") - api = self.patch("iris.fileformats.netcdf.saver._thread_safe_nc") + api = 
self.patch("iris.fileformats.netcdf.saver.bytecoding_datasets") # Define mocked default fill values to prevent deprecation warning (#4374). api.default_fillvals = collections.defaultdict(lambda: -99.0) # Mock the apparent dtype of mocked variables, to avoid an error. @@ -230,7 +231,7 @@ def test_zlib(self): # a fill-value report on a non-compliant variable in a non-file (!) with Saver("/dummy/path", "NETCDF4", compute=False) as saver: saver.write(cube, zlib=True) - dataset = api.DatasetWrapper.return_value + dataset = api.EncodedDataset.return_value create_var_call = mock.call( "air_pressure_anomaly", np.dtype("float32"), @@ -270,8 +271,12 @@ def test_compression(self): with self.temp_filename(suffix=".nc") as nc_path: with Saver(nc_path, "NETCDF4", compute=False) as saver: + tgt = ( + "iris.fileformats.netcdf.saver.bytecoding_datasets" + ".EncodedDataset.createVariable" + ) createvar_spy = self.patch( - "iris.fileformats.netcdf.saver._thread_safe_nc.DatasetWrapper.createVariable", + tgt, # Use 'wraps' to allow the patched methods to function as normal # - the patch object just acts as a 'spy' on its calls. wraps=saver._dataset.createVariable, @@ -306,8 +311,12 @@ def test_non_compression__shape(self): with self.temp_filename(suffix=".nc") as nc_path: with Saver(nc_path, "NETCDF4", compute=False) as saver: + tgt = ( + "iris.fileformats.netcdf.saver.bytecoding_datasets" + ".EncodedDataset.createVariable" + ) createvar_spy = self.patch( - "iris.fileformats.netcdf.saver._thread_safe_nc.DatasetWrapper.createVariable", + tgt, # Use 'wraps' to allow the patched methods to function as normal # - the patch object just acts as a 'spy' on its calls. 
wraps=saver._dataset.createVariable, @@ -342,8 +351,12 @@ def test_non_compression__dtype(self): with self.temp_filename(suffix=".nc") as nc_path: with Saver(nc_path, "NETCDF4", compute=False) as saver: + tgt = ( + "iris.fileformats.netcdf.saver.bytecoding_datasets" + ".EncodedDataset.createVariable" + ) createvar_spy = self.patch( - "iris.fileformats.netcdf.saver._thread_safe_nc.DatasetWrapper.createVariable", + tgt, # Use 'wraps' to allow the patched methods to function as normal # - the patch object just acts as a 'spy' on its calls. wraps=saver._dataset.createVariable, @@ -382,7 +395,7 @@ def test_default_unlimited_dimensions(self): with self.temp_filename(".nc") as nc_path: with Saver(nc_path, "NETCDF4") as saver: saver.write(cube) - ds = _thread_safe_nc.DatasetWrapper(nc_path) + ds = ds_wrappers.EncodedDataset(nc_path) self.assertFalse(ds.dimensions["dim0"].isunlimited()) self.assertFalse(ds.dimensions["dim1"].isunlimited()) ds.close() @@ -392,7 +405,7 @@ def test_no_unlimited_dimensions(self): with self.temp_filename(".nc") as nc_path: with Saver(nc_path, "NETCDF4") as saver: saver.write(cube, unlimited_dimensions=None) - ds = _thread_safe_nc.DatasetWrapper(nc_path) + ds = ds_wrappers.EncodedDataset(nc_path) for dim in ds.dimensions.values(): self.assertFalse(dim.isunlimited()) ds.close() @@ -414,7 +427,7 @@ def test_custom_unlimited_dimensions(self): with self.temp_filename(".nc") as nc_path: with Saver(nc_path, "NETCDF4") as saver: saver.write(cube, unlimited_dimensions=unlimited_dimensions) - ds = _thread_safe_nc.DatasetWrapper(nc_path) + ds = ds_wrappers.EncodedDataset(nc_path) for dim in unlimited_dimensions: self.assertTrue(ds.dimensions[dim].isunlimited()) ds.close() @@ -423,7 +436,7 @@ def test_custom_unlimited_dimensions(self): coords = [cube.coord(dim) for dim in unlimited_dimensions] with Saver(nc_path, "NETCDF4") as saver: saver.write(cube, unlimited_dimensions=coords) - ds = _thread_safe_nc.DatasetWrapper(nc_path) + ds = 
ds_wrappers.EncodedDataset(nc_path) for dim in unlimited_dimensions: self.assertTrue(ds.dimensions[dim].isunlimited()) ds.close() @@ -434,7 +447,7 @@ def test_reserved_attributes(self): with self.temp_filename(".nc") as nc_path: with Saver(nc_path, "NETCDF4") as saver: saver.write(cube) - ds = _thread_safe_nc.DatasetWrapper(nc_path) + ds = ds_wrappers.EncodedDataset(nc_path) res = ds.getncattr("dimensions") ds.close() self.assertEqual(res, "something something_else") @@ -456,7 +469,7 @@ def test_dimensional_to_scalar(self): with self.temp_filename(".nc") as nc_path: with Saver(nc_path, "NETCDF4") as saver: saver.write(cube) - ds = _thread_safe_nc.DatasetWrapper(nc_path) + ds = ds_wrappers.EncodedDataset(nc_path) # Confirm that the only dimension is the one denoting the number # of bounds - have successfully saved the 2D bounds array into 1D. self.assertEqual(["bnds"], list(ds.dimensions.keys())) @@ -496,7 +509,7 @@ def _check_bounds_setting(self, climatological=False): saver._ensure_valid_dtype.return_value = mock.Mock( shape=coord.bounds.shape, dtype=coord.bounds.dtype ) - var = mock.MagicMock(spec=_thread_safe_nc.VariableWrapper) + var = mock.MagicMock(spec=ds_wrappers.EncodedVariable) # Make the main call. 
Saver._create_cf_bounds(saver, coord, var, "time") @@ -537,7 +550,7 @@ def test_valid_range_saved(self): with self.temp_filename(".nc") as nc_path: with Saver(nc_path, "NETCDF4") as saver: saver.write(cube, unlimited_dimensions=[]) - ds = _thread_safe_nc.DatasetWrapper(nc_path) + ds = ds_wrappers.EncodedDataset(nc_path) self.assertArrayEqual(ds.valid_range, vrange) ds.close() @@ -549,7 +562,7 @@ def test_valid_min_saved(self): with self.temp_filename(".nc") as nc_path: with Saver(nc_path, "NETCDF4") as saver: saver.write(cube, unlimited_dimensions=[]) - ds = _thread_safe_nc.DatasetWrapper(nc_path) + ds = ds_wrappers.EncodedDataset(nc_path) self.assertArrayEqual(ds.valid_min, 1) ds.close() @@ -561,7 +574,7 @@ def test_valid_max_saved(self): with self.temp_filename(".nc") as nc_path: with Saver(nc_path, "NETCDF4") as saver: saver.write(cube, unlimited_dimensions=[]) - ds = _thread_safe_nc.DatasetWrapper(nc_path) + ds = ds_wrappers.EncodedDataset(nc_path) self.assertArrayEqual(ds.valid_max, 2) ds.close() @@ -581,7 +594,7 @@ def test_valid_range_saved(self): with self.temp_filename(".nc") as nc_path: with Saver(nc_path, "NETCDF4") as saver: saver.write(cube, unlimited_dimensions=[]) - ds = _thread_safe_nc.DatasetWrapper(nc_path) + ds = ds_wrappers.EncodedDataset(nc_path) self.assertArrayEqual(ds.variables["longitude"].valid_range, vrange) ds.close() @@ -593,7 +606,7 @@ def test_valid_min_saved(self): with self.temp_filename(".nc") as nc_path: with Saver(nc_path, "NETCDF4") as saver: saver.write(cube, unlimited_dimensions=[]) - ds = _thread_safe_nc.DatasetWrapper(nc_path) + ds = ds_wrappers.EncodedDataset(nc_path) self.assertArrayEqual(ds.variables["longitude"].valid_min, 1) ds.close() @@ -605,7 +618,7 @@ def test_valid_max_saved(self): with self.temp_filename(".nc") as nc_path: with Saver(nc_path, "NETCDF4") as saver: saver.write(cube, unlimited_dimensions=[]) - ds = _thread_safe_nc.DatasetWrapper(nc_path) + ds = ds_wrappers.EncodedDataset(nc_path) 
self.assertArrayEqual(ds.variables["longitude"].valid_max, 2) ds.close() @@ -637,7 +650,7 @@ def _netCDF_var(self, cube, **kwargs): with self.temp_filename(".nc") as nc_path: with Saver(nc_path, "NETCDF4") as saver: saver.write(cube, **kwargs) - ds = _thread_safe_nc.DatasetWrapper(nc_path) + ds = ds_wrappers.EncodedDataset(nc_path) (var,) = [ var for var in ds.variables.values() @@ -714,7 +727,7 @@ def setUp(self): ) ) patch = mock.patch( - "iris.fileformats.netcdf._thread_safe_nc.DatasetWrapper", + "iris.fileformats.netcdf._bytecoding_datasets.EncodedDataset", dataset_class, ) _ = patch.start() diff --git a/lib/iris/tests/unit/fileformats/netcdf/saver/test_Saver__lazy_stream_data.py b/lib/iris/tests/unit/fileformats/netcdf/saver/test_Saver__lazy_stream_data.py index 7c884e4c22..3b76dca13b 100644 --- a/lib/iris/tests/unit/fileformats/netcdf/saver/test_Saver__lazy_stream_data.py +++ b/lib/iris/tests/unit/fileformats/netcdf/saver/test_Saver__lazy_stream_data.py @@ -30,7 +30,7 @@ def saver_patch(): mock_dataset = mock.MagicMock() mock_dataset_class = mock.Mock(return_value=mock_dataset) # Mock the wrapper within the netcdf saver - target1 = "iris.fileformats.netcdf.saver._thread_safe_nc.DatasetWrapper" + target1 = "iris.fileformats.netcdf.saver.bytecoding_datasets.DatasetWrapper" # Mock the real netCDF4.Dataset within the threadsafe-nc module, as this is # used by NetCDFDataProxy and NetCDFWriteProxy. target2 = "iris.fileformats.netcdf._thread_safe_nc.netCDF4.Dataset" diff --git a/lib/iris/tests/unit/fileformats/netcdf/saver/test_Saver__ugrid.py b/lib/iris/tests/unit/fileformats/netcdf/saver/test_Saver__ugrid.py index 9494eabebf..571237512d 100644 --- a/lib/iris/tests/unit/fileformats/netcdf/saver/test_Saver__ugrid.py +++ b/lib/iris/tests/unit/fileformats/netcdf/saver/test_Saver__ugrid.py @@ -401,12 +401,12 @@ def test_compression(self): # into the iris.fileformats.netcdf.saver. 
Also we want to check that the # compression kwargs are passed into the NetCDF4 createVariable method patch = self.patch( - "iris.fileformats.netcdf.saver._thread_safe_nc.DatasetWrapper.createVariable", + "iris.fileformats.netcdf.saver.bytecoding_datasets.EncodedDataset.createVariable", ) # No need to patch this NetCDF4 variable to compensate for the previous patch # on createVariable, which doesn't actually create the variable. self.patch( - "iris.fileformats.netcdf.saver._thread_safe_nc.DatasetWrapper.variables" + "iris.fileformats.netcdf.saver.bytecoding_datasets.EncodedDataset.variables" ) cube = make_cube(var_name=(var_name := "a")) compression_kwargs = { @@ -785,10 +785,10 @@ def test_compression(self): """ patch = self.patch( - "iris.fileformats.netcdf.saver._thread_safe_nc.DatasetWrapper.createVariable", + "iris.fileformats.netcdf.saver.bytecoding_datasets.EncodedDataset.createVariable", ) self.patch( - "iris.fileformats.netcdf.saver._thread_safe_nc.DatasetWrapper.variables" + "iris.fileformats.netcdf.saver.bytecoding_datasets.EncodedDataset.variables" ) mesh = make_mesh() compression_kwargs = { From a34ea09d635eda36ba0dd63f170cc966d85b887f Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Fri, 27 Feb 2026 16:57:49 +0000 Subject: [PATCH 38/43] Fix patches in test_CFReader. 
--- lib/iris/tests/unit/fileformats/cf/test_CFReader.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/iris/tests/unit/fileformats/cf/test_CFReader.py b/lib/iris/tests/unit/fileformats/cf/test_CFReader.py index 7f37eb9f24..522d157fb1 100644 --- a/lib/iris/tests/unit/fileformats/cf/test_CFReader.py +++ b/lib/iris/tests/unit/fileformats/cf/test_CFReader.py @@ -78,7 +78,7 @@ def _setup(self, mocker): getncattr=getncattr, ) mocker.patch( - "iris.fileformats.netcdf._thread_safe_nc.DatasetWrapper", + "iris.fileformats.netcdf._bytecoding_datasets.EncodedDataset", return_value=dataset, ) @@ -141,7 +141,7 @@ def _setup(self, mocker): mocker.patch("iris.fileformats.cf.CFReader._build_cf_groups") mocker.patch("iris.fileformats.cf.CFReader._reset") mocker.patch( - "iris.fileformats.netcdf._thread_safe_nc.DatasetWrapper", + "iris.fileformats.netcdf._bytecoding_datasets.EncodedDataset", return_value=self.dataset, ) @@ -237,7 +237,7 @@ def _setup(self, mocker): # and building first level cf-groups for variables. mocker.patch("iris.fileformats.cf.CFReader._reset") mocker.patch( - "iris.fileformats.netcdf._thread_safe_nc.DatasetWrapper", + "iris.fileformats.netcdf._bytecoding_datasets.EncodedDataset", return_value=self.dataset, ) @@ -373,7 +373,7 @@ def _setup_class(self, mocker): # translations and building first level cf-groups for variables. mocker.patch("iris.fileformats.cf.CFReader._reset") mocker.patch( - "iris.fileformats.netcdf._thread_safe_nc.DatasetWrapper", + "iris.fileformats.netcdf._bytecoding_datasets.EncodedDataset", return_value=self.dataset, ) cf_reader = CFReader("dummy") From aa1fe03ebb060a699fb68a3460b20bbeda5e42ce Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Fri, 27 Feb 2026 17:41:22 +0000 Subject: [PATCH 39/43] Fix variable creation in odd cases. 
--- lib/iris/fileformats/netcdf/saver.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/lib/iris/fileformats/netcdf/saver.py b/lib/iris/fileformats/netcdf/saver.py index d43df538c2..3d9f9a91a2 100644 --- a/lib/iris/fileformats/netcdf/saver.py +++ b/lib/iris/fileformats/netcdf/saver.py @@ -1713,11 +1713,14 @@ def add_names_attrs(): if element.units.calendar: _setncattr(cf_var, "calendar", str(element.units.calendar)) - # Most attributes are dealt with later. - # But _Encoding need to be defined before we can write to a character variable - if element.dtype.kind in "SU" and "_Encoding" in element.attributes: - encoding = element.attributes.pop("_Encoding") - _setncattr(cf_var, "_Encoding", encoding) + # Note: when writing UGRID, "element" can be a Mesh which has no "dtype", + # and for dataless cubes it will have a 'None' dtype. + if getattr(element, "dtype", None) is not None: + # Most attributes are dealt with later. But _Encoding needs to be defined + # *before* we can write to a character variable. + if element.dtype.kind in "SU" and "_Encoding" in element.attributes: + encoding = element.attributes.pop("_Encoding") + _setncattr(cf_var, "_Encoding", encoding) if not isinstance(element, Cube): # Add any other custom coordinate attributes. From f5d50ee4a21c186e0e89c059f3c19b18da98a514 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Fri, 27 Feb 2026 18:05:43 +0000 Subject: [PATCH 40/43] Ignore attribute reordering in scaling-packed saves. 
--- .../multi_packed_multi_dtype.cdl | 68 ------------------ .../multi_packed_single_dtype.cdl | 70 ------------------- .../TestPackedData/single_packed_manual.cdl | 50 ------------- .../TestPackedData/single_packed_signed.cdl | 50 ------------- .../TestPackedData/single_packed_unsigned.cdl | 50 ------------- 5 files changed, 288 deletions(-) delete mode 100644 lib/iris/tests/results/integration/netcdf/general/TestPackedData/multi_packed_multi_dtype.cdl delete mode 100644 lib/iris/tests/results/integration/netcdf/general/TestPackedData/multi_packed_single_dtype.cdl delete mode 100644 lib/iris/tests/results/integration/netcdf/general/TestPackedData/single_packed_manual.cdl delete mode 100644 lib/iris/tests/results/integration/netcdf/general/TestPackedData/single_packed_signed.cdl delete mode 100644 lib/iris/tests/results/integration/netcdf/general/TestPackedData/single_packed_unsigned.cdl diff --git a/lib/iris/tests/results/integration/netcdf/general/TestPackedData/multi_packed_multi_dtype.cdl b/lib/iris/tests/results/integration/netcdf/general/TestPackedData/multi_packed_multi_dtype.cdl deleted file mode 100644 index 8a8f481492..0000000000 --- a/lib/iris/tests/results/integration/netcdf/general/TestPackedData/multi_packed_multi_dtype.cdl +++ /dev/null @@ -1,68 +0,0 @@ -dimensions: - bnds = 2 ; - latitude = 73 ; - longitude = 96 ; - time = 360 ; -variables: - short air_temperature(time, latitude, longitude) ; - air_temperature:scale_factor = 0.00242575f ; - air_temperature:add_offset = 261.648f ; - air_temperature:standard_name = "air_temperature" ; - air_temperature:units = "K" ; - air_temperature:um_stash_source = "m01s03i236" ; - air_temperature:cell_methods = "time: maximum (interval: 1 hour)" ; - air_temperature:grid_mapping = "latitude_longitude" ; - air_temperature:coordinates = "forecast_period forecast_reference_time height" ; - int latitude_longitude ; - latitude_longitude:grid_mapping_name = "latitude_longitude" ; - 
latitude_longitude:longitude_of_prime_meridian = 0. ; - latitude_longitude:earth_radius = 6371229. ; - double time(time) ; - time:axis = "T" ; - time:bounds = "time_bnds" ; - time:units = "hours since 1970-01-01 00:00:00" ; - time:standard_name = "time" ; - time:calendar = "360_day" ; - double time_bnds(time, bnds) ; - float latitude(latitude) ; - latitude:axis = "Y" ; - latitude:units = "degrees_north" ; - latitude:standard_name = "latitude" ; - float longitude(longitude) ; - longitude:axis = "X" ; - longitude:units = "degrees_east" ; - longitude:standard_name = "longitude" ; - double forecast_period(time) ; - forecast_period:bounds = "forecast_period_bnds" ; - forecast_period:units = "hours" ; - forecast_period:standard_name = "forecast_period" ; - double forecast_period_bnds(time, bnds) ; - double forecast_reference_time ; - forecast_reference_time:units = "hours since 1970-01-01 00:00:00" ; - forecast_reference_time:standard_name = "forecast_reference_time" ; - forecast_reference_time:calendar = "360_day" ; - double height ; - height:units = "m" ; - height:standard_name = "height" ; - height:positive = "up" ; - float precipitation_flux(time, latitude, longitude) ; - precipitation_flux:standard_name = "precipitation_flux" ; - precipitation_flux:units = "kg m-2 s-1" ; - precipitation_flux:um_stash_source = "m01s05i216" ; - precipitation_flux:cell_methods = "time: mean (interval: 1 hour)" ; - precipitation_flux:grid_mapping = "latitude_longitude" ; - precipitation_flux:coordinates = "forecast_period forecast_reference_time" ; - ushort air_temperature_0(time, latitude, longitude) ; - air_temperature_0:scale_factor = 0.002014167f ; - air_temperature_0:add_offset = 176.7872f ; - air_temperature_0:standard_name = "air_temperature" ; - air_temperature_0:units = "K" ; - air_temperature_0:um_stash_source = "m01s03i236" ; - air_temperature_0:cell_methods = "time: minimum (interval: 1 hour)" ; - air_temperature_0:grid_mapping = "latitude_longitude" ; - 
air_temperature_0:coordinates = "forecast_period forecast_reference_time height" ; - -// global attributes: - :source = "Data from Met Office Unified Model" ; - :Conventions = "CF-1.7" ; -} diff --git a/lib/iris/tests/results/integration/netcdf/general/TestPackedData/multi_packed_single_dtype.cdl b/lib/iris/tests/results/integration/netcdf/general/TestPackedData/multi_packed_single_dtype.cdl deleted file mode 100644 index 3f2c909ce8..0000000000 --- a/lib/iris/tests/results/integration/netcdf/general/TestPackedData/multi_packed_single_dtype.cdl +++ /dev/null @@ -1,70 +0,0 @@ -dimensions: - bnds = 2 ; - latitude = 73 ; - longitude = 96 ; - time = 360 ; -variables: - short air_temperature(time, latitude, longitude) ; - air_temperature:scale_factor = 0.00242575f ; - air_temperature:add_offset = 261.648f ; - air_temperature:standard_name = "air_temperature" ; - air_temperature:units = "K" ; - air_temperature:um_stash_source = "m01s03i236" ; - air_temperature:cell_methods = "time: maximum (interval: 1 hour)" ; - air_temperature:grid_mapping = "latitude_longitude" ; - air_temperature:coordinates = "forecast_period forecast_reference_time height" ; - int latitude_longitude ; - latitude_longitude:grid_mapping_name = "latitude_longitude" ; - latitude_longitude:longitude_of_prime_meridian = 0. ; - latitude_longitude:earth_radius = 6371229. 
; - double time(time) ; - time:axis = "T" ; - time:bounds = "time_bnds" ; - time:units = "hours since 1970-01-01 00:00:00" ; - time:standard_name = "time" ; - time:calendar = "360_day" ; - double time_bnds(time, bnds) ; - float latitude(latitude) ; - latitude:axis = "Y" ; - latitude:units = "degrees_north" ; - latitude:standard_name = "latitude" ; - float longitude(longitude) ; - longitude:axis = "X" ; - longitude:units = "degrees_east" ; - longitude:standard_name = "longitude" ; - double forecast_period(time) ; - forecast_period:bounds = "forecast_period_bnds" ; - forecast_period:units = "hours" ; - forecast_period:standard_name = "forecast_period" ; - double forecast_period_bnds(time, bnds) ; - double forecast_reference_time ; - forecast_reference_time:units = "hours since 1970-01-01 00:00:00" ; - forecast_reference_time:standard_name = "forecast_reference_time" ; - forecast_reference_time:calendar = "360_day" ; - double height ; - height:units = "m" ; - height:standard_name = "height" ; - height:positive = "up" ; - short precipitation_flux(time, latitude, longitude) ; - precipitation_flux:scale_factor = 2.989738e-08f ; - precipitation_flux:add_offset = 0.0009796774f ; - precipitation_flux:standard_name = "precipitation_flux" ; - precipitation_flux:units = "kg m-2 s-1" ; - precipitation_flux:um_stash_source = "m01s05i216" ; - precipitation_flux:cell_methods = "time: mean (interval: 1 hour)" ; - precipitation_flux:grid_mapping = "latitude_longitude" ; - precipitation_flux:coordinates = "forecast_period forecast_reference_time" ; - short air_temperature_0(time, latitude, longitude) ; - air_temperature_0:scale_factor = 0.002014167f ; - air_temperature_0:add_offset = 242.7874f ; - air_temperature_0:standard_name = "air_temperature" ; - air_temperature_0:units = "K" ; - air_temperature_0:um_stash_source = "m01s03i236" ; - air_temperature_0:cell_methods = "time: minimum (interval: 1 hour)" ; - air_temperature_0:grid_mapping = "latitude_longitude" ; - 
air_temperature_0:coordinates = "forecast_period forecast_reference_time height" ; - -// global attributes: - :source = "Data from Met Office Unified Model" ; - :Conventions = "CF-1.7" ; -} diff --git a/lib/iris/tests/results/integration/netcdf/general/TestPackedData/single_packed_manual.cdl b/lib/iris/tests/results/integration/netcdf/general/TestPackedData/single_packed_manual.cdl deleted file mode 100644 index 83e7329575..0000000000 --- a/lib/iris/tests/results/integration/netcdf/general/TestPackedData/single_packed_manual.cdl +++ /dev/null @@ -1,50 +0,0 @@ -dimensions: - bnds = 2 ; - latitude = 73 ; - longitude = 96 ; -variables: - short air_temperature(latitude, longitude) ; - air_temperature:scale_factor = 0.001198068f ; - air_temperature:add_offset = 267.4006f ; - air_temperature:standard_name = "air_temperature" ; - air_temperature:units = "K" ; - air_temperature:um_stash_source = "m01s03i236" ; - air_temperature:cell_methods = "time: mean (interval: 6 hour)" ; - air_temperature:grid_mapping = "latitude_longitude" ; - air_temperature:coordinates = "forecast_period forecast_reference_time height time" ; - int latitude_longitude ; - latitude_longitude:grid_mapping_name = "latitude_longitude" ; - latitude_longitude:longitude_of_prime_meridian = 0. ; - latitude_longitude:earth_radius = 6371229. 
; - float latitude(latitude) ; - latitude:axis = "Y" ; - latitude:units = "degrees_north" ; - latitude:standard_name = "latitude" ; - float longitude(longitude) ; - longitude:axis = "X" ; - longitude:units = "degrees_east" ; - longitude:standard_name = "longitude" ; - double forecast_period ; - forecast_period:bounds = "forecast_period_bnds" ; - forecast_period:units = "hours" ; - forecast_period:standard_name = "forecast_period" ; - double forecast_period_bnds(bnds) ; - double forecast_reference_time ; - forecast_reference_time:units = "hours since 1970-01-01 00:00:00" ; - forecast_reference_time:standard_name = "forecast_reference_time" ; - forecast_reference_time:calendar = "standard" ; - double height ; - height:units = "m" ; - height:standard_name = "height" ; - height:positive = "up" ; - double time ; - time:bounds = "time_bnds" ; - time:units = "hours since 1970-01-01 00:00:00" ; - time:standard_name = "time" ; - time:calendar = "standard" ; - double time_bnds(bnds) ; - -// global attributes: - :source = "Data from Met Office Unified Model" ; - :Conventions = "CF-1.7" ; -} diff --git a/lib/iris/tests/results/integration/netcdf/general/TestPackedData/single_packed_signed.cdl b/lib/iris/tests/results/integration/netcdf/general/TestPackedData/single_packed_signed.cdl deleted file mode 100644 index 83e7329575..0000000000 --- a/lib/iris/tests/results/integration/netcdf/general/TestPackedData/single_packed_signed.cdl +++ /dev/null @@ -1,50 +0,0 @@ -dimensions: - bnds = 2 ; - latitude = 73 ; - longitude = 96 ; -variables: - short air_temperature(latitude, longitude) ; - air_temperature:scale_factor = 0.001198068f ; - air_temperature:add_offset = 267.4006f ; - air_temperature:standard_name = "air_temperature" ; - air_temperature:units = "K" ; - air_temperature:um_stash_source = "m01s03i236" ; - air_temperature:cell_methods = "time: mean (interval: 6 hour)" ; - air_temperature:grid_mapping = "latitude_longitude" ; - air_temperature:coordinates = "forecast_period 
forecast_reference_time height time" ; - int latitude_longitude ; - latitude_longitude:grid_mapping_name = "latitude_longitude" ; - latitude_longitude:longitude_of_prime_meridian = 0. ; - latitude_longitude:earth_radius = 6371229. ; - float latitude(latitude) ; - latitude:axis = "Y" ; - latitude:units = "degrees_north" ; - latitude:standard_name = "latitude" ; - float longitude(longitude) ; - longitude:axis = "X" ; - longitude:units = "degrees_east" ; - longitude:standard_name = "longitude" ; - double forecast_period ; - forecast_period:bounds = "forecast_period_bnds" ; - forecast_period:units = "hours" ; - forecast_period:standard_name = "forecast_period" ; - double forecast_period_bnds(bnds) ; - double forecast_reference_time ; - forecast_reference_time:units = "hours since 1970-01-01 00:00:00" ; - forecast_reference_time:standard_name = "forecast_reference_time" ; - forecast_reference_time:calendar = "standard" ; - double height ; - height:units = "m" ; - height:standard_name = "height" ; - height:positive = "up" ; - double time ; - time:bounds = "time_bnds" ; - time:units = "hours since 1970-01-01 00:00:00" ; - time:standard_name = "time" ; - time:calendar = "standard" ; - double time_bnds(bnds) ; - -// global attributes: - :source = "Data from Met Office Unified Model" ; - :Conventions = "CF-1.7" ; -} diff --git a/lib/iris/tests/results/integration/netcdf/general/TestPackedData/single_packed_unsigned.cdl b/lib/iris/tests/results/integration/netcdf/general/TestPackedData/single_packed_unsigned.cdl deleted file mode 100644 index 7b9114309e..0000000000 --- a/lib/iris/tests/results/integration/netcdf/general/TestPackedData/single_packed_unsigned.cdl +++ /dev/null @@ -1,50 +0,0 @@ -dimensions: - bnds = 2 ; - latitude = 73 ; - longitude = 96 ; -variables: - ubyte air_temperature(latitude, longitude) ; - air_temperature:scale_factor = 0.3079035f ; - air_temperature:add_offset = 228.1423f ; - air_temperature:standard_name = "air_temperature" ; - air_temperature:units 
= "K" ; - air_temperature:um_stash_source = "m01s03i236" ; - air_temperature:cell_methods = "time: mean (interval: 6 hour)" ; - air_temperature:grid_mapping = "latitude_longitude" ; - air_temperature:coordinates = "forecast_period forecast_reference_time height time" ; - int latitude_longitude ; - latitude_longitude:grid_mapping_name = "latitude_longitude" ; - latitude_longitude:longitude_of_prime_meridian = 0. ; - latitude_longitude:earth_radius = 6371229. ; - float latitude(latitude) ; - latitude:axis = "Y" ; - latitude:units = "degrees_north" ; - latitude:standard_name = "latitude" ; - float longitude(longitude) ; - longitude:axis = "X" ; - longitude:units = "degrees_east" ; - longitude:standard_name = "longitude" ; - double forecast_period ; - forecast_period:bounds = "forecast_period_bnds" ; - forecast_period:units = "hours" ; - forecast_period:standard_name = "forecast_period" ; - double forecast_period_bnds(bnds) ; - double forecast_reference_time ; - forecast_reference_time:units = "hours since 1970-01-01 00:00:00" ; - forecast_reference_time:standard_name = "forecast_reference_time" ; - forecast_reference_time:calendar = "standard" ; - double height ; - height:units = "m" ; - height:standard_name = "height" ; - height:positive = "up" ; - double time ; - time:bounds = "time_bnds" ; - time:units = "hours since 1970-01-01 00:00:00" ; - time:standard_name = "time" ; - time:calendar = "standard" ; - double time_bnds(bnds) ; - -// global attributes: - :source = "Data from Met Office Unified Model" ; - :Conventions = "CF-1.7" ; -} From b2c6d51d8003ae8f8a34885b0caa5f0f30baf5ec Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Fri, 27 Feb 2026 18:35:47 +0000 Subject: [PATCH 41/43] Fix test for refactored proxy constructor. 
--- .../helpers/test_build_and_add_auxiliary_coordinate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/iris/tests/unit/fileformats/nc_load_rules/helpers/test_build_and_add_auxiliary_coordinate.py b/lib/iris/tests/unit/fileformats/nc_load_rules/helpers/test_build_and_add_auxiliary_coordinate.py index 5ed3413409..94540d4ab9 100644 --- a/lib/iris/tests/unit/fileformats/nc_load_rules/helpers/test_build_and_add_auxiliary_coordinate.py +++ b/lib/iris/tests/unit/fileformats/nc_load_rules/helpers/test_build_and_add_auxiliary_coordinate.py @@ -171,7 +171,7 @@ class TestDtype(tests.IrisTest): def setUp(self): # Create coordinate cf variables and pyke engine. points = np.arange(6).reshape(2, 3) - cf_data = mock.MagicMock(_FillValue=None) + cf_data = mock.MagicMock(_FillValue=None, shape=points.shape) cf_data.chunking = mock.MagicMock(return_value=points.shape) self.engine = mock.Mock( From dfd4d918f3f083e17c197bc210e44e669a9d94b3 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Fri, 27 Feb 2026 18:56:41 +0000 Subject: [PATCH 42/43] Fix get_cf_var_data to support vlen-string. --- lib/iris/fileformats/netcdf/loader.py | 2 +- .../fileformats/netcdf/loader/test__get_cf_var_data.py | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/lib/iris/fileformats/netcdf/loader.py b/lib/iris/fileformats/netcdf/loader.py index 9607b393d9..32eea77db8 100644 --- a/lib/iris/fileformats/netcdf/loader.py +++ b/lib/iris/fileformats/netcdf/loader.py @@ -293,7 +293,7 @@ def _get_cf_var_data(cf_var): # Make a data-proxy that mimics array access and can fetch from the file. # Note: Special handling needed for "variable length string" types which # return a dtype of `str`, rather than a numpy type; use `S1` in this case. - if cf_var.dtype.kind == "U": + if getattr(cf_var.dtype, "kind", None) == "U": # Special handling for "string variables". 
fill_value = "" else: diff --git a/lib/iris/tests/unit/fileformats/netcdf/loader/test__get_cf_var_data.py b/lib/iris/tests/unit/fileformats/netcdf/loader/test__get_cf_var_data.py index e29f0de012..876ce65f25 100644 --- a/lib/iris/tests/unit/fileformats/netcdf/loader/test__get_cf_var_data.py +++ b/lib/iris/tests/unit/fileformats/netcdf/loader/test__get_cf_var_data.py @@ -26,14 +26,15 @@ def setUp(self): self.expected_chunks = _optimum_chunksize(self.shape, self.shape) def _make(self, chunksizes=None, shape=None, dtype="i4", **extra_properties): + if shape is None: + shape = self.shape cf_data = mock.MagicMock( _FillValue=None, __getitem__="", - dimensions=["dim_" + str(x) for x in range(len(shape or "1"))], + dimensions=["dim_" + str(x) for x in range(len(shape))], + shape=shape, ) cf_data.chunking = mock.MagicMock(return_value=chunksizes) - if shape is None: - shape = self.shape if dtype is not str: # for testing VLen str arrays (dtype=`class `) dtype = np.dtype(dtype) cf_var = mock.MagicMock( From 274fae4014835c25b748e49d6a8ff1c880723802 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Fri, 27 Feb 2026 18:58:04 +0000 Subject: [PATCH 43/43] Add back new test results, folder removed in error. 
--- .../multi_packed_multi_dtype.cdl | 68 ++++++++++++++++++ .../multi_packed_single_dtype.cdl | 70 +++++++++++++++++++ .../TestPackedData/single_packed_manual.cdl | 50 +++++++++++++ .../TestPackedData/single_packed_signed.cdl | 50 +++++++++++++ .../TestPackedData/single_packed_unsigned.cdl | 50 +++++++++++++ 5 files changed, 288 insertions(+) create mode 100644 lib/iris/tests/results/integration/netcdf/general/TestPackedData/multi_packed_multi_dtype.cdl create mode 100644 lib/iris/tests/results/integration/netcdf/general/TestPackedData/multi_packed_single_dtype.cdl create mode 100644 lib/iris/tests/results/integration/netcdf/general/TestPackedData/single_packed_manual.cdl create mode 100644 lib/iris/tests/results/integration/netcdf/general/TestPackedData/single_packed_signed.cdl create mode 100644 lib/iris/tests/results/integration/netcdf/general/TestPackedData/single_packed_unsigned.cdl diff --git a/lib/iris/tests/results/integration/netcdf/general/TestPackedData/multi_packed_multi_dtype.cdl b/lib/iris/tests/results/integration/netcdf/general/TestPackedData/multi_packed_multi_dtype.cdl new file mode 100644 index 0000000000..27d8f55a45 --- /dev/null +++ b/lib/iris/tests/results/integration/netcdf/general/TestPackedData/multi_packed_multi_dtype.cdl @@ -0,0 +1,68 @@ +dimensions: + bnds = 2 ; + latitude = 73 ; + longitude = 96 ; + time = 360 ; +variables: + short air_temperature(time, latitude, longitude) ; + air_temperature:standard_name = "air_temperature" ; + air_temperature:units = "K" ; + air_temperature:scale_factor = 0.00242575f ; + air_temperature:add_offset = 261.648f ; + air_temperature:um_stash_source = "m01s03i236" ; + air_temperature:cell_methods = "time: maximum (interval: 1 hour)" ; + air_temperature:grid_mapping = "latitude_longitude" ; + air_temperature:coordinates = "forecast_period forecast_reference_time height" ; + int latitude_longitude ; + latitude_longitude:grid_mapping_name = "latitude_longitude" ; + 
latitude_longitude:longitude_of_prime_meridian = 0. ; + latitude_longitude:earth_radius = 6371229. ; + double time(time) ; + time:axis = "T" ; + time:bounds = "time_bnds" ; + time:units = "hours since 1970-01-01 00:00:00" ; + time:standard_name = "time" ; + time:calendar = "360_day" ; + double time_bnds(time, bnds) ; + float latitude(latitude) ; + latitude:axis = "Y" ; + latitude:units = "degrees_north" ; + latitude:standard_name = "latitude" ; + float longitude(longitude) ; + longitude:axis = "X" ; + longitude:units = "degrees_east" ; + longitude:standard_name = "longitude" ; + double forecast_period(time) ; + forecast_period:bounds = "forecast_period_bnds" ; + forecast_period:units = "hours" ; + forecast_period:standard_name = "forecast_period" ; + double forecast_period_bnds(time, bnds) ; + double forecast_reference_time ; + forecast_reference_time:units = "hours since 1970-01-01 00:00:00" ; + forecast_reference_time:standard_name = "forecast_reference_time" ; + forecast_reference_time:calendar = "360_day" ; + double height ; + height:units = "m" ; + height:standard_name = "height" ; + height:positive = "up" ; + float precipitation_flux(time, latitude, longitude) ; + precipitation_flux:standard_name = "precipitation_flux" ; + precipitation_flux:units = "kg m-2 s-1" ; + precipitation_flux:um_stash_source = "m01s05i216" ; + precipitation_flux:cell_methods = "time: mean (interval: 1 hour)" ; + precipitation_flux:grid_mapping = "latitude_longitude" ; + precipitation_flux:coordinates = "forecast_period forecast_reference_time" ; + ushort air_temperature_0(time, latitude, longitude) ; + air_temperature_0:standard_name = "air_temperature" ; + air_temperature_0:units = "K" ; + air_temperature_0:scale_factor = 0.002014167f ; + air_temperature_0:add_offset = 176.7872f ; + air_temperature_0:um_stash_source = "m01s03i236" ; + air_temperature_0:cell_methods = "time: minimum (interval: 1 hour)" ; + air_temperature_0:grid_mapping = "latitude_longitude" ; + 
air_temperature_0:coordinates = "forecast_period forecast_reference_time height" ; + +// global attributes: + :source = "Data from Met Office Unified Model" ; + :Conventions = "CF-1.7" ; +} diff --git a/lib/iris/tests/results/integration/netcdf/general/TestPackedData/multi_packed_single_dtype.cdl b/lib/iris/tests/results/integration/netcdf/general/TestPackedData/multi_packed_single_dtype.cdl new file mode 100644 index 0000000000..c85fd35efd --- /dev/null +++ b/lib/iris/tests/results/integration/netcdf/general/TestPackedData/multi_packed_single_dtype.cdl @@ -0,0 +1,70 @@ +dimensions: + bnds = 2 ; + latitude = 73 ; + longitude = 96 ; + time = 360 ; +variables: + short air_temperature(time, latitude, longitude) ; + air_temperature:standard_name = "air_temperature" ; + air_temperature:units = "K" ; + air_temperature:scale_factor = 0.00242575f ; + air_temperature:add_offset = 261.648f ; + air_temperature:um_stash_source = "m01s03i236" ; + air_temperature:cell_methods = "time: maximum (interval: 1 hour)" ; + air_temperature:grid_mapping = "latitude_longitude" ; + air_temperature:coordinates = "forecast_period forecast_reference_time height" ; + int latitude_longitude ; + latitude_longitude:grid_mapping_name = "latitude_longitude" ; + latitude_longitude:longitude_of_prime_meridian = 0. ; + latitude_longitude:earth_radius = 6371229. 
; + double time(time) ; + time:axis = "T" ; + time:bounds = "time_bnds" ; + time:units = "hours since 1970-01-01 00:00:00" ; + time:standard_name = "time" ; + time:calendar = "360_day" ; + double time_bnds(time, bnds) ; + float latitude(latitude) ; + latitude:axis = "Y" ; + latitude:units = "degrees_north" ; + latitude:standard_name = "latitude" ; + float longitude(longitude) ; + longitude:axis = "X" ; + longitude:units = "degrees_east" ; + longitude:standard_name = "longitude" ; + double forecast_period(time) ; + forecast_period:bounds = "forecast_period_bnds" ; + forecast_period:units = "hours" ; + forecast_period:standard_name = "forecast_period" ; + double forecast_period_bnds(time, bnds) ; + double forecast_reference_time ; + forecast_reference_time:units = "hours since 1970-01-01 00:00:00" ; + forecast_reference_time:standard_name = "forecast_reference_time" ; + forecast_reference_time:calendar = "360_day" ; + double height ; + height:units = "m" ; + height:standard_name = "height" ; + height:positive = "up" ; + short precipitation_flux(time, latitude, longitude) ; + precipitation_flux:standard_name = "precipitation_flux" ; + precipitation_flux:units = "kg m-2 s-1" ; + precipitation_flux:scale_factor = 2.989738e-08f ; + precipitation_flux:add_offset = 0.0009796774f ; + precipitation_flux:um_stash_source = "m01s05i216" ; + precipitation_flux:cell_methods = "time: mean (interval: 1 hour)" ; + precipitation_flux:grid_mapping = "latitude_longitude" ; + precipitation_flux:coordinates = "forecast_period forecast_reference_time" ; + short air_temperature_0(time, latitude, longitude) ; + air_temperature_0:standard_name = "air_temperature" ; + air_temperature_0:units = "K" ; + air_temperature_0:scale_factor = 0.002014167f ; + air_temperature_0:add_offset = 242.7874f ; + air_temperature_0:um_stash_source = "m01s03i236" ; + air_temperature_0:cell_methods = "time: minimum (interval: 1 hour)" ; + air_temperature_0:grid_mapping = "latitude_longitude" ; + 
air_temperature_0:coordinates = "forecast_period forecast_reference_time height" ; + +// global attributes: + :source = "Data from Met Office Unified Model" ; + :Conventions = "CF-1.7" ; +} diff --git a/lib/iris/tests/results/integration/netcdf/general/TestPackedData/single_packed_manual.cdl b/lib/iris/tests/results/integration/netcdf/general/TestPackedData/single_packed_manual.cdl new file mode 100644 index 0000000000..ed89a25d9f --- /dev/null +++ b/lib/iris/tests/results/integration/netcdf/general/TestPackedData/single_packed_manual.cdl @@ -0,0 +1,50 @@ +dimensions: + bnds = 2 ; + latitude = 73 ; + longitude = 96 ; +variables: + short air_temperature(latitude, longitude) ; + air_temperature:standard_name = "air_temperature" ; + air_temperature:units = "K" ; + air_temperature:scale_factor = 0.001198068f ; + air_temperature:add_offset = 267.4006f ; + air_temperature:um_stash_source = "m01s03i236" ; + air_temperature:cell_methods = "time: mean (interval: 6 hour)" ; + air_temperature:grid_mapping = "latitude_longitude" ; + air_temperature:coordinates = "forecast_period forecast_reference_time height time" ; + int latitude_longitude ; + latitude_longitude:grid_mapping_name = "latitude_longitude" ; + latitude_longitude:longitude_of_prime_meridian = 0. ; + latitude_longitude:earth_radius = 6371229. 
; + float latitude(latitude) ; + latitude:axis = "Y" ; + latitude:units = "degrees_north" ; + latitude:standard_name = "latitude" ; + float longitude(longitude) ; + longitude:axis = "X" ; + longitude:units = "degrees_east" ; + longitude:standard_name = "longitude" ; + double forecast_period ; + forecast_period:bounds = "forecast_period_bnds" ; + forecast_period:units = "hours" ; + forecast_period:standard_name = "forecast_period" ; + double forecast_period_bnds(bnds) ; + double forecast_reference_time ; + forecast_reference_time:units = "hours since 1970-01-01 00:00:00" ; + forecast_reference_time:standard_name = "forecast_reference_time" ; + forecast_reference_time:calendar = "standard" ; + double height ; + height:units = "m" ; + height:standard_name = "height" ; + height:positive = "up" ; + double time ; + time:bounds = "time_bnds" ; + time:units = "hours since 1970-01-01 00:00:00" ; + time:standard_name = "time" ; + time:calendar = "standard" ; + double time_bnds(bnds) ; + +// global attributes: + :source = "Data from Met Office Unified Model" ; + :Conventions = "CF-1.7" ; +} diff --git a/lib/iris/tests/results/integration/netcdf/general/TestPackedData/single_packed_signed.cdl b/lib/iris/tests/results/integration/netcdf/general/TestPackedData/single_packed_signed.cdl new file mode 100644 index 0000000000..ed89a25d9f --- /dev/null +++ b/lib/iris/tests/results/integration/netcdf/general/TestPackedData/single_packed_signed.cdl @@ -0,0 +1,50 @@ +dimensions: + bnds = 2 ; + latitude = 73 ; + longitude = 96 ; +variables: + short air_temperature(latitude, longitude) ; + air_temperature:standard_name = "air_temperature" ; + air_temperature:units = "K" ; + air_temperature:scale_factor = 0.001198068f ; + air_temperature:add_offset = 267.4006f ; + air_temperature:um_stash_source = "m01s03i236" ; + air_temperature:cell_methods = "time: mean (interval: 6 hour)" ; + air_temperature:grid_mapping = "latitude_longitude" ; + air_temperature:coordinates = "forecast_period 
forecast_reference_time height time" ; + int latitude_longitude ; + latitude_longitude:grid_mapping_name = "latitude_longitude" ; + latitude_longitude:longitude_of_prime_meridian = 0. ; + latitude_longitude:earth_radius = 6371229. ; + float latitude(latitude) ; + latitude:axis = "Y" ; + latitude:units = "degrees_north" ; + latitude:standard_name = "latitude" ; + float longitude(longitude) ; + longitude:axis = "X" ; + longitude:units = "degrees_east" ; + longitude:standard_name = "longitude" ; + double forecast_period ; + forecast_period:bounds = "forecast_period_bnds" ; + forecast_period:units = "hours" ; + forecast_period:standard_name = "forecast_period" ; + double forecast_period_bnds(bnds) ; + double forecast_reference_time ; + forecast_reference_time:units = "hours since 1970-01-01 00:00:00" ; + forecast_reference_time:standard_name = "forecast_reference_time" ; + forecast_reference_time:calendar = "standard" ; + double height ; + height:units = "m" ; + height:standard_name = "height" ; + height:positive = "up" ; + double time ; + time:bounds = "time_bnds" ; + time:units = "hours since 1970-01-01 00:00:00" ; + time:standard_name = "time" ; + time:calendar = "standard" ; + double time_bnds(bnds) ; + +// global attributes: + :source = "Data from Met Office Unified Model" ; + :Conventions = "CF-1.7" ; +} diff --git a/lib/iris/tests/results/integration/netcdf/general/TestPackedData/single_packed_unsigned.cdl b/lib/iris/tests/results/integration/netcdf/general/TestPackedData/single_packed_unsigned.cdl new file mode 100644 index 0000000000..eedad33e03 --- /dev/null +++ b/lib/iris/tests/results/integration/netcdf/general/TestPackedData/single_packed_unsigned.cdl @@ -0,0 +1,50 @@ +dimensions: + bnds = 2 ; + latitude = 73 ; + longitude = 96 ; +variables: + ubyte air_temperature(latitude, longitude) ; + air_temperature:standard_name = "air_temperature" ; + air_temperature:units = "K" ; + air_temperature:scale_factor = 0.3079035f ; + air_temperature:add_offset = 
228.1423f ; + air_temperature:um_stash_source = "m01s03i236" ; + air_temperature:cell_methods = "time: mean (interval: 6 hour)" ; + air_temperature:grid_mapping = "latitude_longitude" ; + air_temperature:coordinates = "forecast_period forecast_reference_time height time" ; + int latitude_longitude ; + latitude_longitude:grid_mapping_name = "latitude_longitude" ; + latitude_longitude:longitude_of_prime_meridian = 0. ; + latitude_longitude:earth_radius = 6371229. ; + float latitude(latitude) ; + latitude:axis = "Y" ; + latitude:units = "degrees_north" ; + latitude:standard_name = "latitude" ; + float longitude(longitude) ; + longitude:axis = "X" ; + longitude:units = "degrees_east" ; + longitude:standard_name = "longitude" ; + double forecast_period ; + forecast_period:bounds = "forecast_period_bnds" ; + forecast_period:units = "hours" ; + forecast_period:standard_name = "forecast_period" ; + double forecast_period_bnds(bnds) ; + double forecast_reference_time ; + forecast_reference_time:units = "hours since 1970-01-01 00:00:00" ; + forecast_reference_time:standard_name = "forecast_reference_time" ; + forecast_reference_time:calendar = "standard" ; + double height ; + height:units = "m" ; + height:standard_name = "height" ; + height:positive = "up" ; + double time ; + time:bounds = "time_bnds" ; + time:units = "hours since 1970-01-01 00:00:00" ; + time:standard_name = "time" ; + time:calendar = "standard" ; + double time_bnds(bnds) ; + +// global attributes: + :source = "Data from Met Office Unified Model" ; + :Conventions = "CF-1.7" ; +}