From 041af2d092b0a79587146d002d823e3fea156a91 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Sat, 25 Oct 2025 00:18:03 +0100 Subject: [PATCH 01/43] Initial tests. --- lib/iris/fileformats/cf.py | 2 +- .../integration/netcdf/test_chararrays.py | 112 ++++++++++++++++++ 2 files changed, 113 insertions(+), 1 deletion(-) create mode 100644 lib/iris/tests/integration/netcdf/test_chararrays.py diff --git a/lib/iris/fileformats/cf.py b/lib/iris/fileformats/cf.py index 2b6568c315..b65ab70792 100644 --- a/lib/iris/fileformats/cf.py +++ b/lib/iris/fileformats/cf.py @@ -802,7 +802,7 @@ def cf_label_data(self, cf_data_var): label_data = self[:] if ma.isMaskedArray(label_data): - label_data = label_data.filled() + label_data = label_data.filled(b"\0") # Determine whether we have a string-valued scalar label # i.e. a character variable that only has one dimension (the length of the string). diff --git a/lib/iris/tests/integration/netcdf/test_chararrays.py b/lib/iris/tests/integration/netcdf/test_chararrays.py new file mode 100644 index 0000000000..feb93047dd --- /dev/null +++ b/lib/iris/tests/integration/netcdf/test_chararrays.py @@ -0,0 +1,112 @@ +import netCDF4 as nc +import numpy as np +import pytest + +import iris + +NX, N_STRLEN = 3, 64 +TEST_STRINGS = ["Münster", "London", "Amsterdam"] +TEST_COORD_VALS = ["bun", "éclair", "sandwich"] + + +def convert_chararray(string_array_1d, maxlen, encoding="utf-8"): + bbytes = [text.encode(encoding) for text in string_array_1d] + pad = b"\0" * maxlen + bbytes = [(x + pad)[:maxlen] for x in bbytes] + chararray = np.array([[bb[i : i + 1] for i in range(maxlen)] for bb in bbytes]) + return chararray + + +INCLUDE_COORD = True +# INCLUDE_COORD = False + + +def make_testfile(filepath, chararray, coordarray, encoding_str=None): + with nc.Dataset(filepath, "w") as ds: + ds.createDimension("x", NX) + ds.createDimension("nstr", N_STRLEN) + vx = ds.createVariable("x", int, dimensions=("x")) + vx[:] = np.arange(NX) + if INCLUDE_COORD: + 
ds.createDimension("nstr2", N_STRLEN) + v_co = ds.createVariable( + "v_co", + "S1", + dimensions=( + "x", + "nstr2", + ), + ) + v_co[:] = coordarray + if encoding_str is not None: + v_co._Encoding = encoding_str + v = ds.createVariable( + "v", + "S1", + dimensions=( + "x", + "nstr", + ), + ) + v[:] = chararray + if encoding_str is not None: + v._Encoding = encoding_str + if INCLUDE_COORD: + v.coordinates = "v_co" + + +def show_result(filepath): + from pp_utils import ncdump + + print(f"File {filepath}") + print("NCDUMP:") + ncdump(filepath, "") + # with nc.Dataset(filepath, "r") as ds: + # v = ds.variables["v"] + # print("\n----\nNetcdf data readback (basic)") + # try: + # print(repr(v[:])) + # except UnicodeDecodeError as err: + # print(repr(err)) + # print("..raw:") + # v.set_auto_chartostring(False) + # print(repr(v[:])) + print("\nAs iris cube..") + try: + cube = iris.load_cube(filepath) + print(cube) + if iris.loading.LOAD_PROBLEMS._problems: + print(iris.loading.LOAD_PROBLEMS) + print( + "\n".join(iris.loading.LOAD_PROBLEMS._problems[0].stack_trace.format()) + ) + print("-data-") + print(repr(cube.data)) + if INCLUDE_COORD: + print("-coord data-") + try: + print(repr(cube.coord("v_co").points)) + except Exception as err2: + print(repr(err2)) + except UnicodeDecodeError as err: + print(repr(err)) + + +# tsts = (None, "ascii", "utf-8", "utf-32",) +# tsts = ("utf-8",) +# tsts = ("utf-8", "utf-32",) +# tsts = ("utf-32",) +tsts = ("utf-8", "ascii", "utf-8") + + +@pytest.mark.parametrize("encoding", tsts) +def test_encodings(encoding): + print(f"\n=========\nTesting encoding: {encoding}") + filepath = f"tmp_{str(encoding)}.nc" + do_as = encoding + if encoding != "utf-32": + do_as = "utf-8" + TEST_CHARARRAY = convert_chararray(TEST_STRINGS, N_STRLEN, encoding=do_as) + TEST_COORDARRAY = convert_chararray(TEST_COORD_VALS, N_STRLEN, encoding=do_as) + make_testfile(filepath, TEST_CHARARRAY, TEST_COORDARRAY, encoding_str=encoding) + show_result(filepath) From 
65bd9ddfbca73597a86a8059f53291f2828779b6 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Sat, 25 Oct 2025 01:22:30 +0100 Subject: [PATCH 02/43] Get 'create_cf_data_variable' to call 'create_generic_cf_array_var': Mostly working? Get 'create_cf_data_variable' to call 'create_generic_cf_array_var': Mostly working? --- .../fileformats/_nc_load_rules/helpers.py | 8 +- lib/iris/fileformats/netcdf/saver.py | 158 +++++++++--------- .../integration/netcdf/test_chararrays.py | 1 + 3 files changed, 85 insertions(+), 82 deletions(-) diff --git a/lib/iris/fileformats/_nc_load_rules/helpers.py b/lib/iris/fileformats/_nc_load_rules/helpers.py index 35c2e96924..50e282db5f 100644 --- a/lib/iris/fileformats/_nc_load_rules/helpers.py +++ b/lib/iris/fileformats/_nc_load_rules/helpers.py @@ -708,13 +708,13 @@ def build_and_add_global_attributes(engine: Engine): ), ) if problem is not None: - stack_notes = problem.stack_trace.__notes__ + stack_notes = problem.stack_trace.__notes__ # type: ignore[attr-defined] if stack_notes is None: stack_notes = [] stack_notes.append( f"Skipping disallowed global attribute '{attr_name}' (see above error)" ) - problem.stack_trace.__notes__ = stack_notes + problem.stack_trace.__notes__ = stack_notes # type: ignore[attr-defined] ################################################################################ @@ -1536,14 +1536,14 @@ def build_and_add_dimension_coordinate( ) if problem is not None: coord_var_name = str(cf_coord_var.cf_name) - stack_notes = problem.stack_trace.__notes__ + stack_notes = problem.stack_trace.__notes__ # type: ignore[attr-defined] if stack_notes is None: stack_notes = [] stack_notes.append( f"Failed to create {coord_var_name} dimension coordinate:\n" f"Gracefully creating {coord_var_name!r} auxiliary coordinate instead." 
) - problem.stack_trace.__notes__ = stack_notes + problem.stack_trace.__notes__ = stack_notes # type: ignore[attr-defined] problem.handled = True _ = _add_or_capture( diff --git a/lib/iris/fileformats/netcdf/saver.py b/lib/iris/fileformats/netcdf/saver.py index 5177749c07..bd4e87471f 100644 --- a/lib/iris/fileformats/netcdf/saver.py +++ b/lib/iris/fileformats/netcdf/saver.py @@ -759,7 +759,7 @@ def _create_cf_dimensions(self, cube, dimension_names, unlimited_dimensions=None # used for a different one pass else: - dim_name = self._get_coord_variable_name(cube, coord) + dim_name = self._get_element_variable_name(cube, coord) unlimited_dim_names.append(dim_name) for dim_name in dimension_names: @@ -990,12 +990,12 @@ def _add_aux_coords( ] # Include any relevant mesh location coordinates. - mesh: MeshXY | None = getattr(cube, "mesh") - mesh_location: str | None = getattr(cube, "location") + mesh: MeshXY | None = getattr(cube, "mesh") # type: ignore[annotation-unchecked] + mesh_location: str | None = getattr(cube, "location") # type: ignore[annotation-unchecked] if mesh and mesh_location: location_coords: MeshNodeCoords | MeshEdgeCoords | MeshFaceCoords = getattr( mesh, f"{mesh_location}_coords" - ) + ) # type: ignore[annotation-unchecked] coords_to_add.extend(list(location_coords)) return self._add_inner_related_vars( @@ -1365,7 +1365,7 @@ def record_dimension(names_list, dim_name, length, matching_coords=None): if dim_name is None: # Not already present : create a unique dimension name # from the coord. - dim_name = self._get_coord_variable_name(cube, coord) + dim_name = self._get_element_variable_name(cube, coord) # Disambiguate if it has the same name as an # existing dimension. # OR if it matches an existing file variable name. 
@@ -1541,38 +1541,14 @@ def _create_cf_bounds(self, coord, cf_var, cf_name, /, *, compression_kwargs=Non ) self._lazy_stream_data(data=bounds, cf_var=cf_var_bounds) - def _get_cube_variable_name(self, cube): - """Return a CF-netCDF variable name for the given cube. - - Parameters - ---------- - cube : :class:`iris.cube.Cube` - An instance of a cube for which a CF-netCDF variable - name is required. - - Returns - ------- - str - A CF-netCDF variable name as a string. - - """ - if cube.var_name is not None: - cf_name = cube.var_name - else: - # Convert to lower case and replace whitespace by underscores. - cf_name = "_".join(cube.name().lower().split()) - - cf_name = self.cf_valid_var_name(cf_name) - return cf_name - - def _get_coord_variable_name(self, cube_or_mesh, coord): - """Return a CF-netCDF variable name for a given coordinate-like element. + def _get_element_variable_name(self, cube_or_mesh, element): + """Return a CF-netCDF variable name for a given coordinate-like element, or cube. Parameters ---------- cube_or_mesh : :class:`iris.cube.Cube` or :class:`iris.mesh.MeshXY` The Cube or Mesh being saved to the netCDF file. - coord : :class:`iris.coords._DimensionalMetadata` + element : :class:`iris.coords._DimensionalMetadata` | :class:``iris.cube.Cube`` An instance of a coordinate (or similar), for which a CF-netCDF variable name is required. @@ -1592,17 +1568,21 @@ def _get_coord_variable_name(self, cube_or_mesh, coord): cube = None mesh = cube_or_mesh - if coord.var_name is not None: - cf_name = coord.var_name + if element.var_name is not None: + cf_name = element.var_name + elif isinstance(element, Cube): + # Make name for a Cube without a var_name. 
+ cf_name = "_".join(element.name().lower().split()) else: - name = coord.standard_name or coord.long_name + # Make name for a Coord-like element without a var_name + name = element.standard_name or element.long_name if not name or set(name).intersection(string.whitespace): # We need to invent a name, based on its associated dimensions. - if cube is not None and cube.coords(coord): - # It is a regular cube coordinate. + if cube is not None and cube.elements(element): + # It is a regular cube elementinate. # Auto-generate a name based on the dims. name = "" - for dim in cube.coord_dims(coord): + for dim in cube.coord_dims(element): name += f"dim{dim}" # Handle scalar coordinate (dims == ()). if not name: @@ -1616,8 +1596,8 @@ def _get_coord_variable_name(self, cube_or_mesh, coord): # At present, a location-coord cannot be nameless, as the # MeshXY code relies on guess_coord_axis. - assert isinstance(coord, Connectivity) - location = coord.cf_role.split("_")[0] + assert isinstance(element, Connectivity) + location = element.cf_role.split("_")[0] location_dim_attr = f"{location}_dimension" name = getattr(mesh, location_dim_attr) @@ -1693,6 +1673,8 @@ def _create_mesh(self, mesh): return cf_mesh_name def _set_cf_var_attributes(self, cf_var, element): + from iris.cube import Cube + # Deal with CF-netCDF units, and add the name+units properties. if isinstance(element, iris.coords.Coord): # Fix "degree" units if needed. @@ -1715,19 +1697,21 @@ def _set_cf_var_attributes(self, cf_var, element): if element.units.calendar: _setncattr(cf_var, "calendar", str(element.units.calendar)) - # Add any other custom coordinate attributes. - for name in sorted(element.attributes): - value = element.attributes[name] + if not isinstance(element, Cube): + # Add any other custom coordinate attributes. + # N.B. 
not Cube, which has specific handling in _create_cf_data_variable + for name in sorted(element.attributes): + value = element.attributes[name] - if name == "STASH": - # Adopting provisional Metadata Conventions for representing MO - # Scientific Data encoded in NetCDF Format. - name = "um_stash_source" - value = str(value) + if name == "STASH": + # Adopting provisional Metadata Conventions for representing MO + # Scientific Data encoded in NetCDF Format. + name = "um_stash_source" + value = str(value) - # Don't clobber existing attributes. - if not hasattr(cf_var, name): - _setncattr(cf_var, name, value) + # Don't clobber existing attributes. + if not hasattr(cf_var, name): + _setncattr(cf_var, name, value) def _create_generic_cf_array_var( self, @@ -1739,6 +1723,7 @@ def _create_generic_cf_array_var( element_dims=None, fill_value=None, compression_kwargs=None, + is_dataless=False, ): """Create theCF-netCDF variable given dimensional_metadata. @@ -1791,7 +1776,7 @@ def _create_generic_cf_array_var( # Work out the var-name to use. # N.B. the only part of this routine that may use a mesh _or_ a cube. - cf_name = self._get_coord_variable_name(cube_or_mesh, element) + cf_name = self._get_element_variable_name(cube_or_mesh, element) while cf_name in self._dataset.variables: cf_name = self._increment_name(cf_name) @@ -1804,10 +1789,13 @@ def _create_generic_cf_array_var( # Get the data values, in a way which works for any element type, as # all are subclasses of _DimensionalMetadata. # (e.g. =points if a coord, =data if an ancillary, etc) - data = element._core_values() + if isinstance(element, Cube): + data = element.core_data() + else: + data = element._core_values() # This compression contract is *not* applicable to a mesh. 
- if cube and cube.shape != data.shape: + if cube is not None and data is not None and cube.shape != data.shape: compression_kwargs = {} if np.issubdtype(data.dtype, np.str_): @@ -1837,11 +1825,13 @@ def _create_generic_cf_array_var( # Convert data from an array of strings into a character array # with an extra string-length dimension. if len(element_dims) == 1: + # Scalar variable (only has string dimension). data_first = data[0] if is_lazy_data(data_first): data_first = dask.compute(data_first) data = list("%- *s" % (string_dimension_depth, data_first)) else: + # NOTE: at present, can't do this lazily?? orig_shape = data.shape new_shape = orig_shape + (string_dimension_depth,) new_data = np.zeros(new_shape, cf_var.dtype) @@ -1850,7 +1840,7 @@ def _create_generic_cf_array_var( new_data[index_slice] = list( "%- *s" % (string_dimension_depth, data[index]) ) - data = new_data + data = new_data else: # A normal (numeric) variable. # ensure a valid datatype for the file format. @@ -1887,7 +1877,8 @@ def _create_generic_cf_array_var( ) # Add the data to the CF-netCDF variable. 
- self._lazy_stream_data(data=data, cf_var=cf_var) + if not is_dataless: + self._lazy_stream_data(data=data, cf_var=cf_var) # Add names + units self._set_cf_var_attributes(cf_var, element) @@ -2238,9 +2229,9 @@ def _create_cf_grid_mapping(self, cube, cf_var_cube): cfvar = self._name_coord_map.name(coord) if not cfvar: # not found - create and store it: - cfvar = self._get_coord_variable_name(cube, coord) + cfvar = self._get_element_variable_name(cube, coord) self._name_coord_map.append( - cfvar, self._get_coord_variable_name(cube, coord) + cfvar, self._get_element_variable_name(cube, coord) ) cfvar_names.append(cfvar) @@ -2383,32 +2374,43 @@ def set_packing_ncattrs(cfvar): if add_offset: _setncattr(cfvar, "add_offset", add_offset) - cf_name = self._get_cube_variable_name(cube) - while cf_name in self._dataset.variables: - cf_name = self._increment_name(cf_name) - + # cf_name = self._get_element_variable_name(cube_or_mesh=None, element=cube) + # while cf_name in self._dataset.variables: + # cf_name = self._increment_name(cf_name) + # + # cf_var = self._dataset.createVariable( + # cf_name, dtype, dimension_names, fill_value=fill_value, **kwargs + # ) # Create the cube CF-netCDF data variable with data payload. - cf_var = self._dataset.createVariable( - cf_name, dtype, dimension_names, fill_value=fill_value, **kwargs + cf_name = self._create_generic_cf_array_var( + cube, + dimension_names, + cube, + element_dims=dimension_names, + fill_value=fill_value, + compression_kwargs=kwargs, + is_dataless=is_dataless, ) + cf_var = self._dataset.variables[cf_name] if not is_dataless: set_packing_ncattrs(cf_var) - self._lazy_stream_data(data=data, cf_var=cf_var) - - if cube.standard_name: - _setncattr(cf_var, "standard_name", cube.standard_name) - - if cube.long_name: - _setncattr(cf_var, "long_name", cube.long_name) - - if cube.units.is_udunits(): - _setncattr(cf_var, "units", str(cube.units)) - - # Add the CF-netCDF calendar attribute. 
- if cube.units.calendar: - _setncattr(cf_var, "calendar", cube.units.calendar) + # if cube.standard_name: + # _setncattr(cf_var, "standard_name", cube.standard_name) + # + # if cube.long_name: + # _setncattr(cf_var, "long_name", cube.long_name) + # + # if cube.units.is_udunits(): + # _setncattr(cf_var, "units", str(cube.units)) + # + # # Add the CF-netCDF calendar attribute. + # if cube.units.calendar: + # _setncattr(cf_var, "calendar", cube.units.calendar) + + # Set attributes: NB this part is cube-specific (not the same for components) + # - therefore 'set_cf_var_attributes' doesn't set attributes if element is a Cube if iris.FUTURE.save_split_attrs: attr_names = cube.attributes.locals.keys() else: diff --git a/lib/iris/tests/integration/netcdf/test_chararrays.py b/lib/iris/tests/integration/netcdf/test_chararrays.py index feb93047dd..a3ce9f9128 100644 --- a/lib/iris/tests/integration/netcdf/test_chararrays.py +++ b/lib/iris/tests/integration/netcdf/test_chararrays.py @@ -101,6 +101,7 @@ def show_result(filepath): @pytest.mark.parametrize("encoding", tsts) def test_encodings(encoding): + # small change print(f"\n=========\nTesting encoding: {encoding}") filepath = f"tmp_{str(encoding)}.nc" do_as = encoding From d75a7a79831977de341a17f8a2a11d4ee276c902 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Tue, 28 Oct 2025 21:11:15 +0000 Subject: [PATCH 03/43] Reinstate decode on load, now in-Iris coded. 
--- .../fileformats/_nc_load_rules/helpers.py | 10 ++- lib/iris/fileformats/cf.py | 18 +++++- .../fileformats/netcdf/_thread_safe_nc.py | 45 +++++++++++-- lib/iris/fileformats/netcdf/loader.py | 38 ++++++++++- lib/iris/fileformats/netcdf/saver.py | 4 +- .../integration/netcdf/test_chararrays.py | 64 ++++++++++++++++++- lib/iris/util.py | 21 ++++++ 7 files changed, 184 insertions(+), 16 deletions(-) diff --git a/lib/iris/fileformats/_nc_load_rules/helpers.py b/lib/iris/fileformats/_nc_load_rules/helpers.py index 50e282db5f..fa63002f09 100644 --- a/lib/iris/fileformats/_nc_load_rules/helpers.py +++ b/lib/iris/fileformats/_nc_load_rules/helpers.py @@ -1643,9 +1643,13 @@ def _add_auxiliary_coordinate( # Determine the name of the dimension/s shared between the CF-netCDF data variable # and the coordinate being built. - common_dims = [ - dim for dim in cf_coord_var.dimensions if dim in engine.cf_var.dimensions - ] + coord_dims = cf_coord_var.dimensions + if cf._is_str_dtype(cf_coord_var): + coord_dims = coord_dims[:-1] + datavar_dims = engine.cf_var.dimensions + if cf._is_str_dtype(engine.cf_var): + datavar_dims = datavar_dims[:-1] + common_dims = [dim for dim in coord_dims if dim in datavar_dims] data_dims = None if common_dims: # Calculate the offset of each common dimension. diff --git a/lib/iris/fileformats/cf.py b/lib/iris/fileformats/cf.py index b65ab70792..5abc525109 100644 --- a/lib/iris/fileformats/cf.py +++ b/lib/iris/fileformats/cf.py @@ -790,15 +790,27 @@ def cf_label_data(self, cf_data_var): # Determine the name of the label string (or length) dimension by # finding the dimension name that doesn't exist within the data dimensions. - str_dim_name = list(set(self.dimensions) - set(cf_data_var.dimensions)) + str_dim_names = list(set(self.dimensions) - set(cf_data_var.dimensions)) + n_nondata_dims = len(str_dim_names) + + if n_nondata_dims == 0: + # *All* dims are shared with the data-variable. + # This is only ok if the data-var is *also* a string type. 
+ dim_ok = _is_str_dtype(cf_data_var) + # In this case, we must just *assume* that the last dimension is "the" + # string dimension + str_dim_name = self.dimensions[-1] + else: + # If there is exactly one non-data dim, that is the one we want + dim_ok = len(str_dim_names) == 1 + (str_dim_name,) = str_dim_names - if len(str_dim_name) != 1: + if not dim_ok: raise ValueError( "Invalid string dimensions for CF-netCDF label variable %r" % self.cf_name ) - str_dim_name = str_dim_name[0] label_data = self[:] if ma.isMaskedArray(label_data): diff --git a/lib/iris/fileformats/netcdf/_thread_safe_nc.py b/lib/iris/fileformats/netcdf/_thread_safe_nc.py index 33183ef0fa..4b3dc10620 100644 --- a/lib/iris/fileformats/netcdf/_thread_safe_nc.py +++ b/lib/iris/fileformats/netcdf/_thread_safe_nc.py @@ -311,14 +311,39 @@ def fromcdl(cls, *args, **kwargs): class NetCDFDataProxy: """A reference to the data payload of a single NetCDF file variable.""" - __slots__ = ("shape", "dtype", "path", "variable_name", "fill_value") - - def __init__(self, shape, dtype, path, variable_name, fill_value): + __slots__ = ( + "shape", + "dtype", + "path", + "variable_name", + "fill_value", + "is_bytes", + "encoding", + "string_length", + ) + + def __init__( + self, + shape, + dtype, + path, + variable_name, + fill_value, + encoding: str | None = None, + string_length: int = 0, + ): self.shape = shape self.dtype = dtype self.path = path self.variable_name = variable_name self.fill_value = fill_value + self.is_bytes = dtype.kind == "S" and dtype.itemsize == 1 + if self.is_bytes: + # We will be returning a different shape : the last dim is the byte-length + self.shape = self.shape[:-1] + self.dtype = np.dtype(f"U{string_length}") + self.encoding = encoding + self.string_length = string_length @property def ndim(self): @@ -338,10 +363,20 @@ def __getitem__(self, keys): try: variable = dataset.variables[self.variable_name] # Get the NetCDF variable data and slice. 
- var = variable[keys] + data = variable[keys] + + # If bytes, decode to strings + if self.is_bytes: + from iris.util import convert_bytesarray_to_strings + + data = convert_bytesarray_to_strings( + data, + encoding=self.encoding, + string_length=self.string_length, + ) finally: dataset.close() - return np.asanyarray(var) + return np.asanyarray(data) def __repr__(self): fmt = ( diff --git a/lib/iris/fileformats/netcdf/loader.py b/lib/iris/fileformats/netcdf/loader.py index 219f681e67..d27c3b64b8 100644 --- a/lib/iris/fileformats/netcdf/loader.py +++ b/lib/iris/fileformats/netcdf/loader.py @@ -11,6 +11,7 @@ """ +import codecs from collections.abc import Iterable, Iterator, Mapping from contextlib import contextmanager from copy import deepcopy @@ -269,10 +270,36 @@ def _get_cf_var_data(cf_var): # Normal NCVariable type: total_bytes = cf_var.size * cf_var.dtype.itemsize + default_encoding = "utf-8" + encoding = getattr(cf_var, "_Encoding", None) + if encoding is None: + # utf-8 is a reasonable "safe" default, equivalent to 'ascii' for ascii data + encoding = default_encoding + else: + try: + # Accept + normalise naming of encodings + encoding = codecs.lookup(encoding).name + # NOTE: if encoding does not suit data, errors can occur. + # For example, _Encoding = "ascii", with non-ascii content. + except LookupError: + # Replace some invalid setting with "safe"(ish) fallback. + encoding = default_encoding + + string_length = getattr(cf_var, "iris_string_length", None) + if total_bytes < _LAZYVAR_MIN_BYTES: # Don't make a lazy array, as it will cost more memory AND more time to access. result = cf_var[:] + if result.dtype.kind == "S": + from iris.util import convert_bytesarray_to_strings + + result = convert_bytesarray_to_strings( + result, + encoding=encoding, + string_length=string_length, + ) + # Special handling of masked scalar value; this will be returned as # an `np.ma.masked` instance which will lose the original dtype. 
# Workaround for this it return a 1-element masked array of the @@ -295,8 +322,17 @@ def _get_cf_var_data(cf_var): "_FillValue", _thread_safe_nc.default_fillvals[fill_dtype], ) + + # NOTE: if the data is bytes which need to be converted to strings on read, + # the data-proxy will do that (and it modifies its shape + dtype). proxy = NetCDFDataProxy( - cf_var.shape, dtype, cf_var.filename, cf_var.cf_name, fill_value + cf_var.shape, + dtype, + cf_var.filename, + cf_var.cf_name, + fill_value, + encoding=encoding, + string_length=string_length, ) # Get the chunking specified for the variable : this is either a shape, or # maybe the string "contiguous". diff --git a/lib/iris/fileformats/netcdf/saver.py b/lib/iris/fileformats/netcdf/saver.py index bd4e87471f..d885387a7f 100644 --- a/lib/iris/fileformats/netcdf/saver.py +++ b/lib/iris/fileformats/netcdf/saver.py @@ -1578,8 +1578,8 @@ def _get_element_variable_name(self, cube_or_mesh, element): name = element.standard_name or element.long_name if not name or set(name).intersection(string.whitespace): # We need to invent a name, based on its associated dimensions. - if cube is not None and cube.elements(element): - # It is a regular cube elementinate. + if cube is not None and cube.coords(element): + # It is a regular cube coordinate. # Auto-generate a name based on the dims. 
name = "" for dim in cube.coord_dims(element): diff --git a/lib/iris/tests/integration/netcdf/test_chararrays.py b/lib/iris/tests/integration/netcdf/test_chararrays.py index a3ce9f9128..8f29fcdcd5 100644 --- a/lib/iris/tests/integration/netcdf/test_chararrays.py +++ b/lib/iris/tests/integration/netcdf/test_chararrays.py @@ -4,10 +4,18 @@ import iris +iris.FUTURE.save_split_attrs = True + + NX, N_STRLEN = 3, 64 TEST_STRINGS = ["Münster", "London", "Amsterdam"] TEST_COORD_VALS = ["bun", "éclair", "sandwich"] +# VARS_COORDS_SHARE_STRING_DIM = True +VARS_COORDS_SHARE_STRING_DIM = False +if VARS_COORDS_SHARE_STRING_DIM: + TEST_COORD_VALS[-1] = "Xsandwich" # makes the max coord strlen same as data one + def convert_chararray(string_array_1d, maxlen, encoding="utf-8"): bbytes = [text.encode(encoding) for text in string_array_1d] @@ -17,9 +25,33 @@ def convert_chararray(string_array_1d, maxlen, encoding="utf-8"): return chararray +def convert_bytesarray_to_strings( + byte_array, encoding="utf-8", string_length: int | None = None +): + """Convert bytes to strings. + + N.B. for now at least, we assume the string dim is **always the last one**. 
+ """ + bytes_shape = byte_array.shape + var_shape = bytes_shape[:-1] + if string_length is None: + string_length = bytes_shape[-1] + string_dtype = f"U{string_length}" + result = np.empty(var_shape, dtype=string_dtype) + for ndindex in np.ndindex(var_shape): + element_bytes = byte_array[ndindex] + bytes = b"".join([b if b else b"\0" for b in element_bytes]) + string = bytes.decode(encoding) + result[ndindex] = string + return result + + INCLUDE_COORD = True # INCLUDE_COORD = False +INCLUDE_NUMERIC_AUXCOORD = True +# INCLUDE_NUMERIC_AUXCOORD = False + def make_testfile(filepath, chararray, coordarray, encoding_str=None): with nc.Dataset(filepath, "w") as ds: @@ -40,6 +72,13 @@ def make_testfile(filepath, chararray, coordarray, encoding_str=None): v_co[:] = coordarray if encoding_str is not None: v_co._Encoding = encoding_str + if INCLUDE_NUMERIC_AUXCOORD: + v_num = ds.createVariable( + "v_num", + float, + dimensions=("x",), + ) + v_num[:] = np.arange(NX) v = ds.createVariable( "v", "S1", @@ -52,7 +91,10 @@ def make_testfile(filepath, chararray, coordarray, encoding_str=None): if encoding_str is not None: v._Encoding = encoding_str if INCLUDE_COORD: - v.coordinates = "v_co" + coords_str = "v_co" + if INCLUDE_NUMERIC_AUXCOORD: + coords_str += " v_num" + v.coordinates = coords_str def show_result(filepath): @@ -82,8 +124,10 @@ def show_result(filepath): ) print("-data-") print(repr(cube.data)) + print("-numeric auxcoord data-") + print(repr(cube.coord("x").points)) if INCLUDE_COORD: - print("-coord data-") + print("-string auxcoord data-") try: print(repr(cube.coord("v_co").points)) except Exception as err2: @@ -111,3 +155,19 @@ def test_encodings(encoding): TEST_COORDARRAY = convert_chararray(TEST_COORD_VALS, N_STRLEN, encoding=do_as) make_testfile(filepath, TEST_CHARARRAY, TEST_COORDARRAY, encoding_str=encoding) show_result(filepath) + + +# @pytest.mark.parametrize("ndim", [1, 2]) +# def test_convert_bytes_to_strings(ndim: int): +# if ndim == 1: +# source = 
convert_strings_to_chararray(TEST_STRINGS, 16) +# elif ndim == 2: +# source = np.stack([ +# convert_strings_to_chararray(TEST_STRINGS, 16), +# convert_strings_to_chararray(TEST_COORD_VALS, 16), +# ]) +# else: +# raise ValueError(f"Unexpected param ndim={ndim}.") +# # convert the strings to bytes +# result = convert_bytesarray_to_strings(source) +# print(result) diff --git a/lib/iris/util.py b/lib/iris/util.py index 2c413d2822..193a95c8ce 100644 --- a/lib/iris/util.py +++ b/lib/iris/util.py @@ -3183,3 +3183,24 @@ def set( # Global CML settings object for use as context manager CML_SETTINGS: CMLSettings = CMLSettings() + + +def convert_bytesarray_to_strings( + byte_array, encoding="utf-8", string_length: int | None = None +): + """Convert bytes to strings. + + N.B. for now at least, we assume the string dim is **always the last one**. + """ + bytes_shape = byte_array.shape + var_shape = bytes_shape[:-1] + if string_length is None: + string_length = bytes_shape[-1] + string_dtype = f"U{string_length}" + result = np.empty(var_shape, dtype=string_dtype) + for ndindex in np.ndindex(var_shape): + element_bytes = byte_array[ndindex] + bytes = b"".join([b if b else b"\0" for b in element_bytes]) + string = bytes.decode(encoding) + result[ndindex] = string + return result From 07efc0634bc699b7d8777d6b343b15e199c02c29 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Sun, 7 Dec 2025 00:34:53 +0000 Subject: [PATCH 04/43] Revert and amend. 
--- .../fileformats/netcdf/_thread_safe_nc.py | 45 +++---------------- lib/iris/fileformats/netcdf/loader.py | 38 +--------------- lib/iris/fileformats/netcdf/saver.py | 4 +- lib/iris/util.py | 21 --------- 4 files changed, 8 insertions(+), 100 deletions(-) diff --git a/lib/iris/fileformats/netcdf/_thread_safe_nc.py b/lib/iris/fileformats/netcdf/_thread_safe_nc.py index 4b3dc10620..33183ef0fa 100644 --- a/lib/iris/fileformats/netcdf/_thread_safe_nc.py +++ b/lib/iris/fileformats/netcdf/_thread_safe_nc.py @@ -311,39 +311,14 @@ def fromcdl(cls, *args, **kwargs): class NetCDFDataProxy: """A reference to the data payload of a single NetCDF file variable.""" - __slots__ = ( - "shape", - "dtype", - "path", - "variable_name", - "fill_value", - "is_bytes", - "encoding", - "string_length", - ) - - def __init__( - self, - shape, - dtype, - path, - variable_name, - fill_value, - encoding: str | None = None, - string_length: int = 0, - ): + __slots__ = ("shape", "dtype", "path", "variable_name", "fill_value") + + def __init__(self, shape, dtype, path, variable_name, fill_value): self.shape = shape self.dtype = dtype self.path = path self.variable_name = variable_name self.fill_value = fill_value - self.is_bytes = dtype.kind == "S" and dtype.itemsize == 1 - if self.is_bytes: - # We will be returning a different shape : the last dim is the byte-length - self.shape = self.shape[:-1] - self.dtype = np.dtype(f"U{string_length}") - self.encoding = encoding - self.string_length = string_length @property def ndim(self): @@ -363,20 +338,10 @@ def __getitem__(self, keys): try: variable = dataset.variables[self.variable_name] # Get the NetCDF variable data and slice. 
- data = variable[keys] - - # If bytes, decode to strings - if self.is_bytes: - from iris.util import convert_bytesarray_to_strings - - data = convert_bytesarray_to_strings( - data, - encoding=self.encoding, - string_length=self.string_length, - ) + var = variable[keys] finally: dataset.close() - return np.asanyarray(data) + return np.asanyarray(var) def __repr__(self): fmt = ( diff --git a/lib/iris/fileformats/netcdf/loader.py b/lib/iris/fileformats/netcdf/loader.py index d27c3b64b8..219f681e67 100644 --- a/lib/iris/fileformats/netcdf/loader.py +++ b/lib/iris/fileformats/netcdf/loader.py @@ -11,7 +11,6 @@ """ -import codecs from collections.abc import Iterable, Iterator, Mapping from contextlib import contextmanager from copy import deepcopy @@ -270,36 +269,10 @@ def _get_cf_var_data(cf_var): # Normal NCVariable type: total_bytes = cf_var.size * cf_var.dtype.itemsize - default_encoding = "utf-8" - encoding = getattr(cf_var, "_Encoding", None) - if encoding is None: - # utf-8 is a reasonable "safe" default, equivalent to 'ascii' for ascii data - encoding = default_encoding - else: - try: - # Accept + normalise naming of encodings - encoding = codecs.lookup(encoding).name - # NOTE: if encoding does not suit data, errors can occur. - # For example, _Encoding = "ascii", with non-ascii content. - except LookupError: - # Replace some invalid setting with "safe"(ish) fallback. - encoding = default_encoding - - string_length = getattr(cf_var, "iris_string_length", None) - if total_bytes < _LAZYVAR_MIN_BYTES: # Don't make a lazy array, as it will cost more memory AND more time to access. result = cf_var[:] - if result.dtype.kind == "S": - from iris.util import convert_bytesarray_to_strings - - result = convert_bytesarray_to_strings( - result, - encoding=encoding, - string_length=string_length, - ) - # Special handling of masked scalar value; this will be returned as # an `np.ma.masked` instance which will lose the original dtype. 
# Workaround for this it return a 1-element masked array of the @@ -322,17 +295,8 @@ def _get_cf_var_data(cf_var): "_FillValue", _thread_safe_nc.default_fillvals[fill_dtype], ) - - # NOTE: if the data is bytes which need to be converted to strings on read, - # the data-proxy will do that (and it modifies its shape + dtype). proxy = NetCDFDataProxy( - cf_var.shape, - dtype, - cf_var.filename, - cf_var.cf_name, - fill_value, - encoding=encoding, - string_length=string_length, + cf_var.shape, dtype, cf_var.filename, cf_var.cf_name, fill_value ) # Get the chunking specified for the variable : this is either a shape, or # maybe the string "contiguous". diff --git a/lib/iris/fileformats/netcdf/saver.py b/lib/iris/fileformats/netcdf/saver.py index d885387a7f..bd4e87471f 100644 --- a/lib/iris/fileformats/netcdf/saver.py +++ b/lib/iris/fileformats/netcdf/saver.py @@ -1578,8 +1578,8 @@ def _get_element_variable_name(self, cube_or_mesh, element): name = element.standard_name or element.long_name if not name or set(name).intersection(string.whitespace): # We need to invent a name, based on its associated dimensions. - if cube is not None and cube.coords(element): - # It is a regular cube coordinate. + if cube is not None and cube.elements(element): + # It is a regular cube elementinate. # Auto-generate a name based on the dims. name = "" for dim in cube.coord_dims(element): diff --git a/lib/iris/util.py b/lib/iris/util.py index 193a95c8ce..2c413d2822 100644 --- a/lib/iris/util.py +++ b/lib/iris/util.py @@ -3183,24 +3183,3 @@ def set( # Global CML settings object for use as context manager CML_SETTINGS: CMLSettings = CMLSettings() - - -def convert_bytesarray_to_strings( - byte_array, encoding="utf-8", string_length: int | None = None -): - """Convert bytes to strings. - - N.B. for now at least, we assume the string dim is **always the last one**. 
- """ - bytes_shape = byte_array.shape - var_shape = bytes_shape[:-1] - if string_length is None: - string_length = bytes_shape[-1] - string_dtype = f"U{string_length}" - result = np.empty(var_shape, dtype=string_dtype) - for ndindex in np.ndindex(var_shape): - element_bytes = byte_array[ndindex] - bytes = b"".join([b if b else b"\0" for b in element_bytes]) - string = bytes.decode(encoding) - result[ndindex] = string - return result From 232107775c424b30233ad336d8fbfd81913a57c2 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Wed, 29 Oct 2025 12:23:07 +0000 Subject: [PATCH 05/43] Hack to preserve the existing order of attributes on saved Coords and Cubes. --- lib/iris/fileformats/netcdf/saver.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/lib/iris/fileformats/netcdf/saver.py b/lib/iris/fileformats/netcdf/saver.py index bd4e87471f..8fb0fec377 100644 --- a/lib/iris/fileformats/netcdf/saver.py +++ b/lib/iris/fileformats/netcdf/saver.py @@ -1682,12 +1682,26 @@ def _set_cf_var_attributes(self, cf_var, element): else: units_str = str(element.units) - if cf_units.as_unit(units_str).is_udunits(): - _setncattr(cf_var, "units", units_str) + # NB this bit is a nasty hack to preserve existing behaviour through a refactor: + # The attributes for Coords are created in the order units, standard_name, + # whereas for data-variables (aka Cubes) it is the other way around. + # Needed now that this routine is also called from _create_cf_data_variable. + # TODO: when we can break things, rationalise these to be the same. 
+ def add_units(): + if cf_units.as_unit(units_str).is_udunits(): + _setncattr(cf_var, "units", units_str) + + def add_stdname(): + standard_name = element.standard_name + if standard_name is not None: + _setncattr(cf_var, "standard_name", standard_name) - standard_name = element.standard_name - if standard_name is not None: - _setncattr(cf_var, "standard_name", standard_name) + if isinstance(element, Cube): + add_stdname() + add_units() + else: + add_units() + add_stdname() long_name = element.long_name if long_name is not None: From 0174e53a443167ab0ec47ae826b140d2abd57116 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Wed, 29 Oct 2025 14:54:33 +0000 Subject: [PATCH 06/43] Fix for dataless; avoid FUTURE global state change from temporary tests. --- lib/iris/fileformats/netcdf/saver.py | 30 ++++---- .../integration/netcdf/test_chararrays.py | 72 ++++++++++++++++--- 2 files changed, 75 insertions(+), 27 deletions(-) diff --git a/lib/iris/fileformats/netcdf/saver.py b/lib/iris/fileformats/netcdf/saver.py index 8fb0fec377..c2522d8867 100644 --- a/lib/iris/fileformats/netcdf/saver.py +++ b/lib/iris/fileformats/netcdf/saver.py @@ -1812,7 +1812,7 @@ def _create_generic_cf_array_var( if cube is not None and data is not None and cube.shape != data.shape: compression_kwargs = {} - if np.issubdtype(data.dtype, np.str_): + if not is_dataless and np.issubdtype(data.dtype, np.str_): # Deal with string-type variables. # Typically CF label variables, but also possibly ancil-vars ? string_dimension_depth = data.dtype.itemsize @@ -1858,8 +1858,13 @@ def _create_generic_cf_array_var( else: # A normal (numeric) variable. # ensure a valid datatype for the file format. 
- element_type = type(element).__name__ - data = self._ensure_valid_dtype(data, element_type, element) + if is_dataless: + dtype = self._DATALESS_DTYPE + fill_value = self._DATALESS_FILLVALUE + else: + element_type = type(element).__name__ + data = self._ensure_valid_dtype(data, element_type, element) + dtype = data.dtype.newbyteorder("=") # Check if this is a dim-coord. is_dimcoord = cube is not None and element in cube.dim_coords @@ -1873,7 +1878,7 @@ def _create_generic_cf_array_var( # Create the CF-netCDF variable. cf_var = self._dataset.createVariable( cf_name, - data.dtype.newbyteorder("="), + dtype, element_dims, fill_value=fill_value, **compression_kwargs, @@ -2325,19 +2330,12 @@ def _create_cf_data_variable( # be removed. # Get the values in a form which is valid for the file format. is_dataless = cube.is_dataless() - if is_dataless: - data = None - else: - data = self._ensure_valid_dtype(cube.core_data(), "cube", cube) - if is_dataless: - # The variable must have *some* dtype, and it must be maskable - dtype = self._DATALESS_DTYPE - fill_value = self._DATALESS_FILLVALUE - elif not packing: - dtype = data.dtype.newbyteorder("=") - else: - if isinstance(packing, dict): + if not is_dataless: + data = self._ensure_valid_dtype(cube.core_data(), "cube", cube) + if not packing: + dtype = data.dtype.newbyteorder("=") + elif isinstance(packing, dict): if "dtype" not in packing: msg = "The dtype attribute is required for packing." 
raise ValueError(msg) diff --git a/lib/iris/tests/integration/netcdf/test_chararrays.py b/lib/iris/tests/integration/netcdf/test_chararrays.py index 8f29fcdcd5..c8bba94671 100644 --- a/lib/iris/tests/integration/netcdf/test_chararrays.py +++ b/lib/iris/tests/integration/netcdf/test_chararrays.py @@ -3,9 +3,8 @@ import pytest import iris - -iris.FUTURE.save_split_attrs = True - +from iris.coords import AuxCoord, DimCoord +from iris.cube import Cube NX, N_STRLEN = 3, 64 TEST_STRINGS = ["Münster", "London", "Amsterdam"] @@ -17,7 +16,13 @@ TEST_COORD_VALS[-1] = "Xsandwich" # makes the max coord strlen same as data one -def convert_chararray(string_array_1d, maxlen, encoding="utf-8"): +@pytest.fixture(scope="module", autouse=True) +def enable_split_attrs(): + with iris.FUTURE.context(save_split_attrs=True): + yield + + +def convert_strings_to_chararray(string_array_1d, maxlen, encoding="utf-8"): bbytes = [text.encode(encoding) for text in string_array_1d] pad = b"\0" * maxlen bbytes = [(x + pad)[:maxlen] for x in bbytes] @@ -97,6 +102,23 @@ def make_testfile(filepath, chararray, coordarray, encoding_str=None): v.coordinates = coords_str +def make_testcube( + dataarray, + coordarray, # for now, these are always *string* arrays + encoding_str: str | None = None, +): + cube = Cube(dataarray, var_name="v") + cube.add_dim_coord(DimCoord(np.arange(NX), var_name="x"), 0) + if encoding_str is not None: + cube.attributes["_Encoding"] = encoding_str + if INCLUDE_COORD: + co_x = AuxCoord(coordarray, var_name="v_co") + if encoding_str is not None: + co_x.attributes["_Encoding"] = encoding_str + cube.add_aux_coord(co_x, 0) + return cube + + def show_result(filepath): from pp_utils import ncdump @@ -115,12 +137,13 @@ def show_result(filepath): # print(repr(v[:])) print("\nAs iris cube..") try: + iris.loading.LOAD_PROBLEMS.reset() cube = iris.load_cube(filepath) print(cube) - if iris.loading.LOAD_PROBLEMS._problems: + if iris.loading.LOAD_PROBLEMS.problems: 
print(iris.loading.LOAD_PROBLEMS) print( - "\n".join(iris.loading.LOAD_PROBLEMS._problems[0].stack_trace.format()) + "\n".join(iris.loading.LOAD_PROBLEMS.problems[0].stack_trace.format()) ) print("-data-") print(repr(cube.data)) @@ -136,27 +159,54 @@ def show_result(filepath): print(repr(err)) -# tsts = (None, "ascii", "utf-8", "utf-32",) +tsts = ( + None, + "ascii", + "utf-8", + "utf-32", +) # tsts = ("utf-8",) # tsts = ("utf-8", "utf-32",) # tsts = ("utf-32",) -tsts = ("utf-8", "ascii", "utf-8") +# tsts = ("utf-8", "ascii", "utf-8") @pytest.mark.parametrize("encoding", tsts) -def test_encodings(encoding): +def test_load_encodings(encoding): # small change print(f"\n=========\nTesting encoding: {encoding}") filepath = f"tmp_{str(encoding)}.nc" do_as = encoding if encoding != "utf-32": do_as = "utf-8" - TEST_CHARARRAY = convert_chararray(TEST_STRINGS, N_STRLEN, encoding=do_as) - TEST_COORDARRAY = convert_chararray(TEST_COORD_VALS, N_STRLEN, encoding=do_as) + TEST_CHARARRAY = convert_strings_to_chararray( + TEST_STRINGS, N_STRLEN, encoding=do_as + ) + TEST_COORDARRAY = convert_strings_to_chararray( + TEST_COORD_VALS, N_STRLEN, encoding=do_as + ) make_testfile(filepath, TEST_CHARARRAY, TEST_COORDARRAY, encoding_str=encoding) show_result(filepath) +@pytest.mark.parametrize("encoding", tsts) +def test_save_encodings(encoding): + cube = make_testcube( + dataarray=TEST_STRINGS, coordarray=TEST_COORD_VALS, encoding_str=encoding + ) + print(cube) + filepath = f"tmp_save_{str(encoding)}.nc" + if encoding == "ascii": + with pytest.raises( + UnicodeEncodeError, + match="'ascii' codec can't encode character.*not in range", + ): + iris.save(cube, filepath) + else: + iris.save(cube, filepath) + show_result(filepath) + + # @pytest.mark.parametrize("ndim", [1, 2]) # def test_convert_bytes_to_strings(ndim: int): # if ndim == 1: From 035e28b9785c99dc3ae0df7fda171a37cbc62121 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Wed, 29 Oct 2025 15:21:31 +0000 Subject: [PATCH 07/43] 
Further fix to attribute ordering. --- lib/iris/fileformats/netcdf/saver.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/lib/iris/fileformats/netcdf/saver.py b/lib/iris/fileformats/netcdf/saver.py index c2522d8867..f80cf154c3 100644 --- a/lib/iris/fileformats/netcdf/saver.py +++ b/lib/iris/fileformats/netcdf/saver.py @@ -1687,25 +1687,25 @@ def _set_cf_var_attributes(self, cf_var, element): # whereas for data-variables (aka Cubes) it is the other way around. # Needed now that this routine is also called from _create_cf_data_variable. # TODO: when we can break things, rationalise these to be the same. - def add_units(): + def add_units_attr(): if cf_units.as_unit(units_str).is_udunits(): _setncattr(cf_var, "units", units_str) - def add_stdname(): + def add_names_attrs(): standard_name = element.standard_name if standard_name is not None: _setncattr(cf_var, "standard_name", standard_name) + long_name = element.long_name + if long_name is not None: + _setncattr(cf_var, "long_name", long_name) + if isinstance(element, Cube): - add_stdname() - add_units() + add_names_attrs() + add_units_attr() else: - add_units() - add_stdname() - - long_name = element.long_name - if long_name is not None: - _setncattr(cf_var, "long_name", long_name) + add_units_attr() + add_names_attrs() # Add the CF-netCDF calendar attribute. if element.units.calendar: From 80c4776b0c5f6cbfd8163d757e16e28124a9b199 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Wed, 29 Oct 2025 18:08:35 +0000 Subject: [PATCH 08/43] Fixes for data packing. 
--- lib/iris/fileformats/netcdf/saver.py | 64 ++++++++++------------------ 1 file changed, 22 insertions(+), 42 deletions(-) diff --git a/lib/iris/fileformats/netcdf/saver.py b/lib/iris/fileformats/netcdf/saver.py index f80cf154c3..8d66557cab 100644 --- a/lib/iris/fileformats/netcdf/saver.py +++ b/lib/iris/fileformats/netcdf/saver.py @@ -1737,6 +1737,7 @@ def _create_generic_cf_array_var( element_dims=None, fill_value=None, compression_kwargs=None, + packing_controls: dict | None = None, is_dataless=False, ): """Create theCF-netCDF variable given dimensional_metadata. @@ -1864,7 +1865,10 @@ def _create_generic_cf_array_var( else: element_type = type(element).__name__ data = self._ensure_valid_dtype(data, element_type, element) - dtype = data.dtype.newbyteorder("=") + if not packing_controls: + dtype = data.dtype.newbyteorder("=") + else: + dtype = packing_controls["dtype"] # Check if this is a dim-coord. is_dimcoord = cube is not None and element in cube.dim_coords @@ -1897,6 +1901,10 @@ def _create_generic_cf_array_var( # Add the data to the CF-netCDF variable. if not is_dataless: + if packing_controls: + # We must set packing attributes (if any), before assigning values. + for key, value in packing_controls["attributes"]: + _setncattr(cf_var, key, value) self._lazy_stream_data(data=data, cf_var=cf_var) # Add names + units @@ -2331,11 +2339,10 @@ def _create_cf_data_variable( # Get the values in a form which is valid for the file format. is_dataless = cube.is_dataless() - if not is_dataless: + packing_controls = None + if packing and not is_dataless: data = self._ensure_valid_dtype(cube.core_data(), "cube", cube) - if not packing: - dtype = data.dtype.newbyteorder("=") - elif isinstance(packing, dict): + if isinstance(packing, dict): if "dtype" not in packing: msg = "The dtype attribute is required for packing." 
raise ValueError(msg) @@ -2373,26 +2380,14 @@ def _create_cf_data_variable( else: add_offset = cmin + 2 ** (n - 1) * scale_factor - def set_packing_ncattrs(cfvar): - """Set netCDF packing attributes. - - NOTE: cfvar needs to be a _thread_safe_nc._ThreadSafeWrapper subclass. + packing_controls = { + "dtype": dtype, + "attributes": [ + ("scale_factor", scale_factor), + ("add_offset", add_offset), + ], + } - """ - assert hasattr(cfvar, "THREAD_SAFE_FLAG") - if packing: - if scale_factor: - _setncattr(cfvar, "scale_factor", scale_factor) - if add_offset: - _setncattr(cfvar, "add_offset", add_offset) - - # cf_name = self._get_element_variable_name(cube_or_mesh=None, element=cube) - # while cf_name in self._dataset.variables: - # cf_name = self._increment_name(cf_name) - # - # cf_var = self._dataset.createVariable( - # cf_name, dtype, dimension_names, fill_value=fill_value, **kwargs - # ) # Create the cube CF-netCDF data variable with data payload. cf_name = self._create_generic_cf_array_var( cube, @@ -2401,28 +2396,13 @@ def set_packing_ncattrs(cfvar): element_dims=dimension_names, fill_value=fill_value, compression_kwargs=kwargs, + packing_controls=packing_controls, is_dataless=is_dataless, ) cf_var = self._dataset.variables[cf_name] - if not is_dataless: - set_packing_ncattrs(cf_var) - - # if cube.standard_name: - # _setncattr(cf_var, "standard_name", cube.standard_name) - # - # if cube.long_name: - # _setncattr(cf_var, "long_name", cube.long_name) - # - # if cube.units.is_udunits(): - # _setncattr(cf_var, "units", str(cube.units)) - # - # # Add the CF-netCDF calendar attribute. 
- # if cube.units.calendar: - # _setncattr(cf_var, "calendar", cube.units.calendar) - - # Set attributes: NB this part is cube-specific (not the same for components) - # - therefore 'set_cf_var_attributes' doesn't set attributes if element is a Cube + # Set general attrs: NB this part is cube-specific (not the same for components) + # - so 'set_cf_var_attributes' *doesn't* set these, if element is a Cube if iris.FUTURE.save_split_attrs: attr_names = cube.attributes.locals.keys() else: From d4d3ebd2ac7e6414a3f2c57912138f4c02cf1ed9 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Sun, 7 Dec 2025 00:42:34 +0000 Subject: [PATCH 09/43] Latest test-chararrays. --- .../integration/netcdf/test_chararrays.py | 61 +++++++++++-------- 1 file changed, 36 insertions(+), 25 deletions(-) diff --git a/lib/iris/tests/integration/netcdf/test_chararrays.py b/lib/iris/tests/integration/netcdf/test_chararrays.py index c8bba94671..0eb211c8b0 100644 --- a/lib/iris/tests/integration/netcdf/test_chararrays.py +++ b/lib/iris/tests/integration/netcdf/test_chararrays.py @@ -1,10 +1,19 @@ -import netCDF4 as nc +# Copyright Iris contributors +# +# This file is part of Iris and is released under the BSD license. +# See LICENSE in the root of the repository for full licensing details. +"""Integration tests for string data handling.""" + +import subprocess + import numpy as np import pytest import iris from iris.coords import AuxCoord, DimCoord from iris.cube import Cube +from iris.fileformats.netcdf import _thread_safe_nc +from iris.tests import env_bin_path NX, N_STRLEN = 3, 64 TEST_STRINGS = ["Münster", "London", "Amsterdam"] @@ -16,6 +25,7 @@ TEST_COORD_VALS[-1] = "Xsandwich" # makes the max coord strlen same as data one +# Ensure all tests run with "split attrs" turned on. 
@pytest.fixture(scope="module", autouse=True) def enable_split_attrs(): with iris.FUTURE.context(save_split_attrs=True): @@ -59,7 +69,8 @@ def convert_bytesarray_to_strings( def make_testfile(filepath, chararray, coordarray, encoding_str=None): - with nc.Dataset(filepath, "w") as ds: + ds = _thread_safe_nc.DatasetWrapper(filepath, "w") + try: ds.createDimension("x", NX) ds.createDimension("nstr", N_STRLEN) vx = ds.createVariable("x", int, dimensions=("x")) @@ -100,6 +111,8 @@ def make_testfile(filepath, chararray, coordarray, encoding_str=None): if INCLUDE_NUMERIC_AUXCOORD: coords_str += " v_num" v.coordinates = coords_str + finally: + ds.close() def make_testcube( @@ -119,12 +132,19 @@ def make_testcube( return cube -def show_result(filepath): - from pp_utils import ncdump +NCDUMP_PATHSTR = str(env_bin_path("ncdump")) + + +def ncdump(nc_path: str, *args): + """Call ncdump to print a dump of a file.""" + call_args = [NCDUMP_PATHSTR, nc_path] + list(*args) + subprocess.run(call_args, check=True) + +def show_result(filepath): print(f"File {filepath}") print("NCDUMP:") - ncdump(filepath, "") + ncdump(filepath) # with nc.Dataset(filepath, "r") as ds: # v = ds.variables["v"] # print("\n----\nNetcdf data readback (basic)") @@ -159,6 +179,13 @@ def show_result(filepath): print(repr(err)) +@pytest.fixture(scope="session") +def save_dir(tmp_path_factory): + return tmp_path_factory.mktemp("save_files") + + +# TODO: the tests don't test things properly yet, they just exercise the code and print +# things for manual debugging. 
tsts = ( None, "ascii", @@ -172,10 +199,10 @@ def show_result(filepath): @pytest.mark.parametrize("encoding", tsts) -def test_load_encodings(encoding): +def test_load_encodings(encoding, save_dir): # small change print(f"\n=========\nTesting encoding: {encoding}") - filepath = f"tmp_{str(encoding)}.nc" + filepath = save_dir / f"tmp_load_{str(encoding)}.nc" do_as = encoding if encoding != "utf-32": do_as = "utf-8" @@ -190,12 +217,12 @@ def test_load_encodings(encoding): @pytest.mark.parametrize("encoding", tsts) -def test_save_encodings(encoding): +def test_save_encodings(encoding, save_dir): cube = make_testcube( dataarray=TEST_STRINGS, coordarray=TEST_COORD_VALS, encoding_str=encoding ) print(cube) - filepath = f"tmp_save_{str(encoding)}.nc" + filepath = save_dir / f"tmp_save_{str(encoding)}.nc" if encoding == "ascii": with pytest.raises( UnicodeEncodeError, @@ -205,19 +232,3 @@ def test_save_encodings(encoding): else: iris.save(cube, filepath) show_result(filepath) - - -# @pytest.mark.parametrize("ndim", [1, 2]) -# def test_convert_bytes_to_strings(ndim: int): -# if ndim == 1: -# source = convert_strings_to_chararray(TEST_STRINGS, 16) -# elif ndim == 2: -# source = np.stack([ -# convert_strings_to_chararray(TEST_STRINGS, 16), -# convert_strings_to_chararray(TEST_COORD_VALS, 16), -# ]) -# else: -# raise ValueError(f"Unexpected param ndim={ndim}.") -# # convert the strings to bytes -# result = convert_bytesarray_to_strings(source) -# print(result) From 3f10cc1c49d207422f6ce4c2e64b2b44d9f1513c Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Sun, 7 Dec 2025 10:43:18 +0000 Subject: [PATCH 10/43] Fix search+replace error. 
--- lib/iris/fileformats/netcdf/saver.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/iris/fileformats/netcdf/saver.py b/lib/iris/fileformats/netcdf/saver.py index 8d66557cab..4766054142 100644 --- a/lib/iris/fileformats/netcdf/saver.py +++ b/lib/iris/fileformats/netcdf/saver.py @@ -1578,8 +1578,8 @@ def _get_element_variable_name(self, cube_or_mesh, element): name = element.standard_name or element.long_name if not name or set(name).intersection(string.whitespace): # We need to invent a name, based on its associated dimensions. - if cube is not None and cube.elements(element): - # It is a regular cube elementinate. + if cube is not None and cube.coords(element): + # It is a regular cube coordinate. # Auto-generate a name based on the dims. name = "" for dim in cube.coord_dims(element): From ee2fe4ccac13c0968c0ff017cb8ef5da8498f852 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Wed, 14 Jan 2026 13:18:07 +0000 Subject: [PATCH 11/43] Tiny fix in crucial place! (merge error?). --- lib/iris/fileformats/netcdf/saver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/iris/fileformats/netcdf/saver.py b/lib/iris/fileformats/netcdf/saver.py index 4766054142..3d7c1dee19 100644 --- a/lib/iris/fileformats/netcdf/saver.py +++ b/lib/iris/fileformats/netcdf/saver.py @@ -1855,7 +1855,7 @@ def _create_generic_cf_array_var( new_data[index_slice] = list( "%- *s" % (string_dimension_depth, data[index]) ) - data = new_data + data = new_data else: # A normal (numeric) variable. # ensure a valid datatype for the file format. From 744826da3a41a2e727b05cff5ede3b8186dbc62c Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Wed, 14 Jan 2026 15:33:04 +0000 Subject: [PATCH 12/43] Extra mock property prevents weird test crashes. 
--- .../helpers/test_build_and_add_auxiliary_coordinate.py | 8 +++++--- .../helpers/test_build_and_add_dimension_coordinate.py | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/lib/iris/tests/unit/fileformats/nc_load_rules/helpers/test_build_and_add_auxiliary_coordinate.py b/lib/iris/tests/unit/fileformats/nc_load_rules/helpers/test_build_and_add_auxiliary_coordinate.py index a44986ec98..5ed3413409 100644 --- a/lib/iris/tests/unit/fileformats/nc_load_rules/helpers/test_build_and_add_auxiliary_coordinate.py +++ b/lib/iris/tests/unit/fileformats/nc_load_rules/helpers/test_build_and_add_auxiliary_coordinate.py @@ -44,7 +44,9 @@ def setUp(self): self.engine = mock.Mock( cube=mock.Mock(), - cf_var=mock.Mock(dimensions=("foo", "bar"), cf_data=cf_data), + cf_var=mock.Mock( + dimensions=("foo", "bar"), cf_data=cf_data, dtype=np.int32 + ), filename="DUMMY", cube_parts=dict(coordinates=[]), ) @@ -174,7 +176,7 @@ def setUp(self): self.engine = mock.Mock( cube=mock.Mock(), - cf_var=mock.Mock(dimensions=("foo", "bar")), + cf_var=mock.Mock(dimensions=("foo", "bar"), dtype=np.int32), filename="DUMMY", cube_parts=dict(coordinates=[]), ) @@ -244,7 +246,7 @@ def setUp(self): # Create dummy pyke engine. self.engine = mock.Mock( cube=mock.Mock(), - cf_var=mock.Mock(dimensions=("foo", "bar")), + cf_var=mock.Mock(dimensions=("foo", "bar"), dtype=np.float32), filename="DUMMY", cube_parts=dict(coordinates=[]), ) diff --git a/lib/iris/tests/unit/fileformats/nc_load_rules/helpers/test_build_and_add_dimension_coordinate.py b/lib/iris/tests/unit/fileformats/nc_load_rules/helpers/test_build_and_add_dimension_coordinate.py index a871c967ab..26e25a6d95 100644 --- a/lib/iris/tests/unit/fileformats/nc_load_rules/helpers/test_build_and_add_dimension_coordinate.py +++ b/lib/iris/tests/unit/fileformats/nc_load_rules/helpers/test_build_and_add_dimension_coordinate.py @@ -50,7 +50,7 @@ def setUp(self): # Create dummy pyke engine. 
self.engine = mock.Mock( cube=mock.Mock(), - cf_var=mock.Mock(dimensions=("foo", "bar")), + cf_var=mock.Mock(dimensions=("foo", "bar"), dtype=np.int32), filename="DUMMY", cube_parts=dict(coordinates=[]), ) From a3e1217345f52f5e882b31ebdf65e76f7fe406b9 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Wed, 14 Jan 2026 18:00:24 +0000 Subject: [PATCH 13/43] Fix another mock problem. --- .../fileformats/netcdf/saver/test_Saver.py | 40 +++++++++++-------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/lib/iris/tests/unit/fileformats/netcdf/saver/test_Saver.py b/lib/iris/tests/unit/fileformats/netcdf/saver/test_Saver.py index 0905c3d2a9..0eb12d794c 100644 --- a/lib/iris/tests/unit/fileformats/netcdf/saver/test_Saver.py +++ b/lib/iris/tests/unit/fileformats/netcdf/saver/test_Saver.py @@ -261,9 +261,6 @@ def test_compression(self): ) cube.add_ancillary_variable(anc_coord, data_dims=data_dims) - patch = self.patch( - "iris.fileformats.netcdf.saver._thread_safe_nc.DatasetWrapper.createVariable" - ) compression_kwargs = { "complevel": 9, "fletcher32": True, @@ -273,10 +270,16 @@ def test_compression(self): with self.temp_filename(suffix=".nc") as nc_path: with Saver(nc_path, "NETCDF4", compute=False) as saver: + createvar_spy = self.patch( + "iris.fileformats.netcdf.saver._thread_safe_nc.DatasetWrapper.createVariable", + # Use 'wraps' to allow the patched methods to function as normal + # - the patch object just acts as a 'spy' on its calls. 
+ wraps=saver._dataset.createVariable, + ) saver.write(cube, **compression_kwargs) - self.assertEqual(5, patch.call_count) - result = self._filter_compression_calls(patch, compression_kwargs) + self.assertEqual(5, createvar_spy.call_count) + result = self._filter_compression_calls(createvar_spy, compression_kwargs) self.assertEqual(3, len(result)) self.assertEqual({cube.name(), aux_coord.name(), anc_coord.name()}, set(result)) @@ -294,9 +297,6 @@ def test_non_compression__shape(self): ) cube.add_ancillary_variable(anc_coord, data_dims=data_dims[1]) - patch = self.patch( - "iris.fileformats.netcdf.saver._thread_safe_nc.DatasetWrapper.createVariable" - ) compression_kwargs = { "complevel": 9, "fletcher32": True, @@ -306,11 +306,17 @@ def test_non_compression__shape(self): with self.temp_filename(suffix=".nc") as nc_path: with Saver(nc_path, "NETCDF4", compute=False) as saver: + createvar_spy = self.patch( + "iris.fileformats.netcdf.saver._thread_safe_nc.DatasetWrapper.createVariable", + # Use 'wraps' to allow the patched methods to function as normal + # - the patch object just acts as a 'spy' on its calls. 
+ wraps=saver._dataset.createVariable, + ) saver.write(cube, **compression_kwargs) - self.assertEqual(5, patch.call_count) + self.assertEqual(5, createvar_spy.call_count) result = self._filter_compression_calls( - patch, compression_kwargs, mismatch=True + createvar_spy, compression_kwargs, mismatch=True ) self.assertEqual(4, len(result)) # the aux coord and ancil variable are not compressed due to shape, and @@ -327,10 +333,6 @@ def test_non_compression__dtype(self): aux_coord = AuxCoord(data, var_name="non_compress_aux", units="1") cube.add_aux_coord(aux_coord, data_dims=data_dims) - patch = self.patch( - "iris.fileformats.netcdf.saver._thread_safe_nc.DatasetWrapper.createVariable" - ) - patch.return_value = mock.MagicMock(dtype=np.dtype("S1")) compression_kwargs = { "complevel": 9, "fletcher32": True, @@ -340,11 +342,17 @@ def test_non_compression__dtype(self): with self.temp_filename(suffix=".nc") as nc_path: with Saver(nc_path, "NETCDF4", compute=False) as saver: + createvar_spy = self.patch( + "iris.fileformats.netcdf.saver._thread_safe_nc.DatasetWrapper.createVariable", + # Use 'wraps' to allow the patched methods to function as normal + # - the patch object just acts as a 'spy' on its calls. + wraps=saver._dataset.createVariable, + ) saver.write(cube, **compression_kwargs) - self.assertEqual(4, patch.call_count) + self.assertEqual(4, createvar_spy.call_count) result = self._filter_compression_calls( - patch, compression_kwargs, mismatch=True + createvar_spy, compression_kwargs, mismatch=True ) self.assertEqual(3, len(result)) # the aux coord is not compressed due to its string dtype, and From 1a4f2f29de1f69f56317d204d5388ff692ef103a Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Fri, 31 Oct 2025 15:38:04 +0000 Subject: [PATCH 14/43] Initial dataset wrappers. Rename; addin parts of old investigation; add temporary notes. 
--- .../netcdf/_bytecoding_datasets.py | 182 ++++++++++++++ .../fileformats/netcdf/_thread_safe_nc.py | 15 +- .../integration/netcdf/test_chararrays.py | 234 ++++++++++++++++++ .../fileformats/netcdf/encoding_tests.txt | 18 ++ .../netcdf/test_bytecoding_datasets.py | 14 ++ 5 files changed, 457 insertions(+), 6 deletions(-) create mode 100644 lib/iris/fileformats/netcdf/_bytecoding_datasets.py create mode 100644 lib/iris/tests/integration/netcdf/test_chararrays.py create mode 100644 lib/iris/tests/unit/fileformats/netcdf/encoding_tests.txt create mode 100644 lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py diff --git a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py new file mode 100644 index 0000000000..41e801d103 --- /dev/null +++ b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py @@ -0,0 +1,182 @@ +# Copyright Iris contributors +# +# This file is part of Iris and is released under the BSD license. +# See LICENSE in the root of the repository for full licensing details. +"""Module providing to netcdf datasets with automatic character encoding. + +The requirement is to convert numpy fixed-width unicode arrays on writing to a variable +which is declared as a byte (character) array with a fixed-length string dimension. + +Numpy unicode string arrays are ones with dtypes of the form "U". +Numpy character variables have the dtype "S1", and map to a fixed-length "string +dimension". + +In principle, netCDF4 already performs these translations, but in practice current +releases are not functional for anything other than "ascii" encoding -- including UTF-8, +which is the most obvious and desirable "general" solution. + +There is also the question of whether we should like to implement UTF-8 as our default. 
+Current discussions on this are inconclusive and neither CF conventions nor the NetCDF +User Guide are definite on what possible values of "_Encoding" are, or what the effective +default is, even though they do both mention the "_Encoding" attribute as a potential +way to handle the issue. + +Because of this, we interpret as follows: + * when reading bytes : in the absence of an "_Encoding" attribute, we will attempt to + decode bytes as UTF-8 + * when writing strings : in the absence of an "_Encoding" attribute (on the Iris + cube or coord object), we will attempt to encode data with "ascii" : If this fails, + it raise an error prompting the user to supply an "_Encoding" attribute. + +Where an "_Encoding" attribute is provided to Iris, we will honour it where possible, +identifying with "codecs.lookup" : This means we support the encodings in the Python +Standard Library, and the name aliases which it recognises. + +See: + +* known problems https://github.com/Unidata/netcdf4-python/issues/1440 +* suggestions for how this "ought" to work, discussed in the netcdf-c library + * https://github.com/Unidata/netcdf-c/issues/402 + +""" + +import codecs +import warnings + +import numpy as np + +from iris.fileformats.netcdf._thread_safe_nc import DatasetWrapper, VariableWrapper + + +def decode_bytesarray_to_stringarray( + byte_array: np.ndarray, encoding="utf-8", string_width: int | None = None +) -> np.ndarray: + """Convert an array of bytes to an array of strings, with one less dimension. + + N.B. for now at least, we assume the string dim is **always the last one**. + If 'string_width' is not given, it is set to the final dimension of 'byte_array'. 
+ """ + bytes_shape = byte_array.shape + var_shape = bytes_shape[:-1] + if string_width is None: + string_width = bytes_shape[-1] + string_dtype = f"U{string_width}" + result = np.empty(var_shape, dtype=string_dtype) + for ndindex in np.ndindex(var_shape): + element_bytes = byte_array[ndindex] + bytes = b"".join([b if b else b"\0" for b in element_bytes]) + string = bytes.decode(encoding) + result[ndindex] = string + return result + + +def encode_stringarray_as_bytearray( + data: np.ndarray, encoding=None, string_dimension_length: int | None = None +) -> np.ndarray: + """Encode strings as bytearray. + + Note: if 'string_dimension_length' is not given (None), it is set to the longest + encoded bytes element. If 'string_dimension_length' is specified, the last array + dimension is set to this and content strings are truncated or extended as required. + """ + element_shape = data.shape + max_length = 1 # this is a MINIMUM - i.e. not zero! + data_elements = np.zeros(element_shape, dtype=object) + for index in np.ndindex(element_shape): + data_element = data[index].encode(encoding=encoding) + element_length = len(data_element) + data_elements[index] = data_element + if element_length > max_length: + max_length = element_length + + if string_dimension_length is None: + string_dimension_length = max_length + + # We already encoded all the strings, but stored them in an object-array as + # we didn't yet know the fixed byte-length to convert to. 
+ # Now convert to a fixed-width byte array with an extra string-length dimension + result = np.zeros(element_shape + (string_dimension_length,), dtype="S1") + right_pad = b"\0" * string_dimension_length + for index in np.ndindex(element_shape): + bytes = data_elements[index] + bytes = (bytes + right_pad)[:string_dimension_length] + result[index] = [bytes[i : i + 1] for i in range(string_dimension_length)] + + return result + + +DEFAULT_ENCODING = "utf-8" + + +class EncodedVariable(VariableWrapper): + """A variable wrapper that translates variable data according to byte encodings.""" + + def __getitem__(self, keys): + if self.is_chardata(): + super().set_auto_chartostring(False) + + data = super().__getitem__(keys) + + if self.is_chardata(): + encoding = self.get_byte_encoding() + strlen = self.get_string_length() + data = decode_bytesarray_to_stringarray(data, encoding, strlen) + + return data + + def __setitem__(self, keys, data): + if self.is_chardata(): + encoding = self.get_byte_encoding() + strlen = self.get_string_length() + if encoding is not None: + data = encode_stringarray_as_bytearray(data, encoding, strlen) + else: + try: + # Check if all characters are valid ascii + data = encode_stringarray_as_bytearray(data, "ascii", strlen) + except UnicodeEncodeError: + data = encode_stringarray_as_bytearray( + data, DEFAULT_ENCODING, strlen + ) + # As this was necessary, record the new encoding on the variable + self.set_ncattr("_Encoding", DEFAULT_ENCODING) + msg = ( + f"Non-ascii data written to label variable {self.name}. " + f"Applied {DEFAULT_ENCODING!r} encoding, " + f"and set attribute _Encoding={DEFAULT_ENCODING!r}." 
+ ) + warnings.warn(msg, UserWarning) + + super().set_auto_chartostring(False) + + super().__setitem__(keys, data) + + def is_chardata(self): + return np.issubdtype(self.dtype, np.bytes_) + + def get_encoding(self) -> str | None: + """Get the effective byte encoding to be used for this variable.""" + # utf-8 is a reasonable "safe" default, equivalent to 'ascii' for ascii data + result = getattr(self, "_Encoding", None) + if result is not None: + try: + # Accept + normalise naming of encodings + result = codecs.lookup(result).name + # NOTE: if encoding does not suit data, errors can occur. + # For example, _Encoding = "ascii", with non-ascii content. + except LookupError: + # Replace some invalid setting with "safe"(ish) fallback. + msg = f"Unknown encoding for variable {self.name!r}: {result!r}" + warnings.warn(msg, UserWarning) + + return result + + def get_string_length(self): + """Return the string-length defined for this variable (or None).""" + return getattr(self, "iris_string_length", None) + + +class EncodedDataset(DatasetWrapper): + """A specialised DatasetWrapper whose variables perform byte encoding.""" + + VAR_WRAPPER_CLS = EncodedVariable diff --git a/lib/iris/fileformats/netcdf/_thread_safe_nc.py b/lib/iris/fileformats/netcdf/_thread_safe_nc.py index 33183ef0fa..46b8609bb7 100644 --- a/lib/iris/fileformats/netcdf/_thread_safe_nc.py +++ b/lib/iris/fileformats/netcdf/_thread_safe_nc.py @@ -159,6 +159,9 @@ class GroupWrapper(_ThreadSafeWrapper): CONTAINED_CLASS = netCDF4.Group # Note: will also accept a whole Dataset object, but that is OK. _DUCKTYPE_CHECK_PROPERTIES = ["createVariable"] + # Class to use when creating variable wrappers (default=VariableWrapper). + # - needed to support _byte_encoded_data.EncodedDataset. + VAR_WRAPPER_CLS = VariableWrapper # All Group API that returns Dimension(s) is wrapped to instead return # DimensionWrapper(s). 
@@ -203,7 +206,7 @@ def variables(self) -> typing.Dict[str, VariableWrapper]: """ with _GLOBAL_NETCDF4_LOCK: variables_ = self._contained_instance.variables - return {k: VariableWrapper.from_existing(v) for k, v in variables_.items()} + return {k: self.VAR_WRAPPER_CLS.from_existing(v) for k, v in variables_.items()} def createVariable(self, *args, **kwargs) -> VariableWrapper: """Call createVariable() from netCDF4.Group/Dataset within _GLOBAL_NETCDF4_LOCK. @@ -216,7 +219,7 @@ def createVariable(self, *args, **kwargs) -> VariableWrapper: """ with _GLOBAL_NETCDF4_LOCK: new_variable = self._contained_instance.createVariable(*args, **kwargs) - return VariableWrapper.from_existing(new_variable) + return self.VAR_WRAPPER_CLS.from_existing(new_variable) def get_variables_by_attributes( self, *args, **kwargs @@ -234,7 +237,7 @@ def get_variables_by_attributes( variables_ = list( self._contained_instance.get_variables_by_attributes(*args, **kwargs) ) - return [VariableWrapper.from_existing(v) for v in variables_] + return [self.VAR_WRAPPER_CLS.from_existing(v) for v in variables_] # All Group API that returns Group(s) is wrapped to instead return # GroupWrapper(s). @@ -252,7 +255,7 @@ def groups(self): """ with _GLOBAL_NETCDF4_LOCK: groups_ = self._contained_instance.groups - return {k: GroupWrapper.from_existing(v) for k, v in groups_.items()} + return {k: self.__class__.from_existing(v) for k, v in groups_.items()} @property def parent(self): @@ -268,7 +271,7 @@ def parent(self): """ with _GLOBAL_NETCDF4_LOCK: parent_ = self._contained_instance.parent - return GroupWrapper.from_existing(parent_) + return self.__class__.from_existing(parent_) def createGroup(self, *args, **kwargs): """Call createGroup() from netCDF4.Group/Dataset. 
@@ -281,7 +284,7 @@ def createGroup(self, *args, **kwargs): """ with _GLOBAL_NETCDF4_LOCK: new_group = self._contained_instance.createGroup(*args, **kwargs) - return GroupWrapper.from_existing(new_group) + return self.__class__.from_existing(new_group) class DatasetWrapper(GroupWrapper): diff --git a/lib/iris/tests/integration/netcdf/test_chararrays.py b/lib/iris/tests/integration/netcdf/test_chararrays.py new file mode 100644 index 0000000000..0eb211c8b0 --- /dev/null +++ b/lib/iris/tests/integration/netcdf/test_chararrays.py @@ -0,0 +1,234 @@ +# Copyright Iris contributors +# +# This file is part of Iris and is released under the BSD license. +# See LICENSE in the root of the repository for full licensing details. +"""Integration tests for string data handling.""" + +import subprocess + +import numpy as np +import pytest + +import iris +from iris.coords import AuxCoord, DimCoord +from iris.cube import Cube +from iris.fileformats.netcdf import _thread_safe_nc +from iris.tests import env_bin_path + +NX, N_STRLEN = 3, 64 +TEST_STRINGS = ["Münster", "London", "Amsterdam"] +TEST_COORD_VALS = ["bun", "éclair", "sandwich"] + +# VARS_COORDS_SHARE_STRING_DIM = True +VARS_COORDS_SHARE_STRING_DIM = False +if VARS_COORDS_SHARE_STRING_DIM: + TEST_COORD_VALS[-1] = "Xsandwich" # makes the max coord strlen same as data one + + +# Ensure all tests run with "split attrs" turned on. +@pytest.fixture(scope="module", autouse=True) +def enable_split_attrs(): + with iris.FUTURE.context(save_split_attrs=True): + yield + + +def convert_strings_to_chararray(string_array_1d, maxlen, encoding="utf-8"): + bbytes = [text.encode(encoding) for text in string_array_1d] + pad = b"\0" * maxlen + bbytes = [(x + pad)[:maxlen] for x in bbytes] + chararray = np.array([[bb[i : i + 1] for i in range(maxlen)] for bb in bbytes]) + return chararray + + +def convert_bytesarray_to_strings( + byte_array, encoding="utf-8", string_length: int | None = None +): + """Convert bytes to strings. + + N.B. 
for now at least, we assume the string dim is **always the last one**. + """ + bytes_shape = byte_array.shape + var_shape = bytes_shape[:-1] + if string_length is None: + string_length = bytes_shape[-1] + string_dtype = f"U{string_length}" + result = np.empty(var_shape, dtype=string_dtype) + for ndindex in np.ndindex(var_shape): + element_bytes = byte_array[ndindex] + bytes = b"".join([b if b else b"\0" for b in element_bytes]) + string = bytes.decode(encoding) + result[ndindex] = string + return result + + +INCLUDE_COORD = True +# INCLUDE_COORD = False + +INCLUDE_NUMERIC_AUXCOORD = True +# INCLUDE_NUMERIC_AUXCOORD = False + + +def make_testfile(filepath, chararray, coordarray, encoding_str=None): + ds = _thread_safe_nc.DatasetWrapper(filepath, "w") + try: + ds.createDimension("x", NX) + ds.createDimension("nstr", N_STRLEN) + vx = ds.createVariable("x", int, dimensions=("x")) + vx[:] = np.arange(NX) + if INCLUDE_COORD: + ds.createDimension("nstr2", N_STRLEN) + v_co = ds.createVariable( + "v_co", + "S1", + dimensions=( + "x", + "nstr2", + ), + ) + v_co[:] = coordarray + if encoding_str is not None: + v_co._Encoding = encoding_str + if INCLUDE_NUMERIC_AUXCOORD: + v_num = ds.createVariable( + "v_num", + float, + dimensions=("x",), + ) + v_num[:] = np.arange(NX) + v = ds.createVariable( + "v", + "S1", + dimensions=( + "x", + "nstr", + ), + ) + v[:] = chararray + if encoding_str is not None: + v._Encoding = encoding_str + if INCLUDE_COORD: + coords_str = "v_co" + if INCLUDE_NUMERIC_AUXCOORD: + coords_str += " v_num" + v.coordinates = coords_str + finally: + ds.close() + + +def make_testcube( + dataarray, + coordarray, # for now, these are always *string* arrays + encoding_str: str | None = None, +): + cube = Cube(dataarray, var_name="v") + cube.add_dim_coord(DimCoord(np.arange(NX), var_name="x"), 0) + if encoding_str is not None: + cube.attributes["_Encoding"] = encoding_str + if INCLUDE_COORD: + co_x = AuxCoord(coordarray, var_name="v_co") + if encoding_str is not 
None: + co_x.attributes["_Encoding"] = encoding_str + cube.add_aux_coord(co_x, 0) + return cube + + +NCDUMP_PATHSTR = str(env_bin_path("ncdump")) + + +def ncdump(nc_path: str, *args): + """Call ncdump to print a dump of a file.""" + call_args = [NCDUMP_PATHSTR, nc_path] + list(*args) + subprocess.run(call_args, check=True) + + +def show_result(filepath): + print(f"File {filepath}") + print("NCDUMP:") + ncdump(filepath) + # with nc.Dataset(filepath, "r") as ds: + # v = ds.variables["v"] + # print("\n----\nNetcdf data readback (basic)") + # try: + # print(repr(v[:])) + # except UnicodeDecodeError as err: + # print(repr(err)) + # print("..raw:") + # v.set_auto_chartostring(False) + # print(repr(v[:])) + print("\nAs iris cube..") + try: + iris.loading.LOAD_PROBLEMS.reset() + cube = iris.load_cube(filepath) + print(cube) + if iris.loading.LOAD_PROBLEMS.problems: + print(iris.loading.LOAD_PROBLEMS) + print( + "\n".join(iris.loading.LOAD_PROBLEMS.problems[0].stack_trace.format()) + ) + print("-data-") + print(repr(cube.data)) + print("-numeric auxcoord data-") + print(repr(cube.coord("x").points)) + if INCLUDE_COORD: + print("-string auxcoord data-") + try: + print(repr(cube.coord("v_co").points)) + except Exception as err2: + print(repr(err2)) + except UnicodeDecodeError as err: + print(repr(err)) + + +@pytest.fixture(scope="session") +def save_dir(tmp_path_factory): + return tmp_path_factory.mktemp("save_files") + + +# TODO: the tests don't test things properly yet, they just exercise the code and print +# things for manual debugging. 
+tsts = ( + None, + "ascii", + "utf-8", + "utf-32", +) +# tsts = ("utf-8",) +# tsts = ("utf-8", "utf-32",) +# tsts = ("utf-32",) +# tsts = ("utf-8", "ascii", "utf-8") + + +@pytest.mark.parametrize("encoding", tsts) +def test_load_encodings(encoding, save_dir): + # small change + print(f"\n=========\nTesting encoding: {encoding}") + filepath = save_dir / f"tmp_load_{str(encoding)}.nc" + do_as = encoding + if encoding != "utf-32": + do_as = "utf-8" + TEST_CHARARRAY = convert_strings_to_chararray( + TEST_STRINGS, N_STRLEN, encoding=do_as + ) + TEST_COORDARRAY = convert_strings_to_chararray( + TEST_COORD_VALS, N_STRLEN, encoding=do_as + ) + make_testfile(filepath, TEST_CHARARRAY, TEST_COORDARRAY, encoding_str=encoding) + show_result(filepath) + + +@pytest.mark.parametrize("encoding", tsts) +def test_save_encodings(encoding, save_dir): + cube = make_testcube( + dataarray=TEST_STRINGS, coordarray=TEST_COORD_VALS, encoding_str=encoding + ) + print(cube) + filepath = save_dir / f"tmp_save_{str(encoding)}.nc" + if encoding == "ascii": + with pytest.raises( + UnicodeEncodeError, + match="'ascii' codec can't encode character.*not in range", + ): + iris.save(cube, filepath) + else: + iris.save(cube, filepath) + show_result(filepath) diff --git a/lib/iris/tests/unit/fileformats/netcdf/encoding_tests.txt b/lib/iris/tests/unit/fileformats/netcdf/encoding_tests.txt new file mode 100644 index 0000000000..bab04aa0c4 --- /dev/null +++ b/lib/iris/tests/unit/fileformats/netcdf/encoding_tests.txt @@ -0,0 +1,18 @@ + +forms in files: + * char chardata(dim1, dim2, strlen_xx); # char data + * string data(dim1, dim2); + +forms in numpy: + * np.ndarray(dtype="S1") # char data + * np.ndarray(dtype="Snn") # char data + * np.ndarray(dtype="Unn") # strings + * np.ndarray(dtype="") + +possibilities in createVariable: +""" + The datatype can be a numpy datatype object, or a string that describes a numpy dtype object ... 
+ datatype can also be a CompoundType instance (for a structured, or compound array), a VLType instance (for a variable-length array), +** or the python str builtin (for a variable-length string array). +** Numpy string and unicode datatypes with length greater than one are aliases for str. +""" diff --git a/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py b/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py new file mode 100644 index 0000000000..8b449c5912 --- /dev/null +++ b/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py @@ -0,0 +1,14 @@ +# Copyright Iris contributors +# +# This file is part of Iris and is released under the BSD license. +# See LICENSE in the root of the repository for full licensing details. +"""Unit tests for :class:`iris.fileformats.netcdf._bytecoding_datasets` module.""" + +# import numpy as np +# import pytest +# +# from iris.fileformats.netcdf._bytecoding_datasets import EncodedDataset + + +class TestEncodedDataset: + """Test how GRIB_PARAM attributes convert to strings for storage in netcdf files.""" From 0148f437f1872cdda9f3d41ccb37291ce4cba893 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Wed, 3 Dec 2025 18:59:43 +0000 Subject: [PATCH 15/43] Various notes, choices + changes: Beginnings of encoded-dataset testing. 
--- .../netcdf/_bytecoding_datasets.py | 155 ++++++++---- .../integration/netcdf/test_chararrays.py | 7 +- .../fileformats/netcdf/encoding_tests.txt | 164 +++++++++++++ .../netcdf/test_bytecoding_datasets.py | 223 +++++++++++++++++- .../unit/fileformats/netcdf/test_nc_dtypes.py | 96 ++++++++ 5 files changed, 595 insertions(+), 50 deletions(-) create mode 100644 lib/iris/tests/unit/fileformats/netcdf/test_nc_dtypes.py diff --git a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py index 41e801d103..353f14d538 100644 --- a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py +++ b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py @@ -41,6 +41,8 @@ """ import codecs +import contextlib +import threading import warnings import numpy as np @@ -49,17 +51,18 @@ def decode_bytesarray_to_stringarray( - byte_array: np.ndarray, encoding="utf-8", string_width: int | None = None + byte_array: np.ndarray, encoding: str, string_width: int ) -> np.ndarray: """Convert an array of bytes to an array of strings, with one less dimension. N.B. for now at least, we assume the string dim is **always the last one**. If 'string_width' is not given, it is set to the final dimension of 'byte_array'. """ + if np.ma.isMaskedArray(byte_array): + # netCDF4-python sees zeros as "missing" -- we don't need or want that + byte_array = byte_array.data bytes_shape = byte_array.shape var_shape = bytes_shape[:-1] - if string_width is None: - string_width = bytes_shape[-1] string_dtype = f"U{string_width}" result = np.empty(var_shape, dtype=string_dtype) for ndindex in np.ndindex(var_shape): @@ -70,16 +73,25 @@ def decode_bytesarray_to_stringarray( return result -def encode_stringarray_as_bytearray( +# +# TODO: remove? +# this older version is "overly flexible", less efficient and not needed here. 
+# +def flexi_encode_stringarray_as_bytearray( data: np.ndarray, encoding=None, string_dimension_length: int | None = None ) -> np.ndarray: """Encode strings as bytearray. Note: if 'string_dimension_length' is not given (None), it is set to the longest - encoded bytes element. If 'string_dimension_length' is specified, the last array + encoded bytes element, **OR** the dtype size, if that is greater. + If 'string_dimension_length' is specified, the last array dimension is set to this and content strings are truncated or extended as required. """ + if np.ma.isMaskedArray(data): + # netCDF4-python sees zeros as "missing" -- we don't need or want that + data = data.data element_shape = data.shape + # Encode all the strings + see which is longest max_length = 1 # this is a MINIMUM - i.e. not zero! data_elements = np.zeros(element_shape, dtype=object) for index in np.ndindex(element_shape): @@ -90,10 +102,15 @@ def encode_stringarray_as_bytearray( max_length = element_length if string_dimension_length is None: + # If the string length was not specified, it is the maximum encoded length + # (n-bytes), **or** the dtype string-length, if greater. string_dimension_length = max_length + array_string_length = int(str(data.dtype)[2:]) # Yuck. No better public way? + if array_string_length > string_dimension_length: + string_dimension_length = array_string_length - # We already encoded all the strings, but stored them in an object-array as - # we didn't yet know the fixed byte-length to convert to. + # We maybe *already* encoded all the strings above, but stored them in an + # object-array as we didn't yet know the fixed byte-length to convert to. 
# Now convert to a fixed-width byte array with an extra string-length dimension result = np.zeros(element_shape + (string_dimension_length,), dtype="S1") right_pad = b"\0" * string_dimension_length @@ -105,58 +122,98 @@ def encode_stringarray_as_bytearray( return result -DEFAULT_ENCODING = "utf-8" +def encode_stringarray_as_bytearray( + data: np.ndarray, encoding: str, string_dimension_length: int +) -> np.ndarray: + """Encode strings as a bytes array.""" + element_shape = data.shape + result = np.zeros(element_shape + (string_dimension_length,), dtype="S1") + right_pad = b"\0" * string_dimension_length + for index in np.ndindex(element_shape): + bytes = data[index].encode(encoding=encoding) + # It's all a bit nasty ... + bytes = (bytes + right_pad)[:string_dimension_length] + result[index] = [bytes[i : i + 1] for i in range(string_dimension_length)] + + return result + + +class NetcdfStringDecodeSetting(threading.local): + def __init__(self, perform_encoding: bool = True): + self.set(perform_encoding) + + def set(self, perform_encoding: bool): + self.perform_encoding = perform_encoding + + def __bool__(self): + return self.perform_encoding + + @contextlib.contextmanager + def context(self, perform_encoding: bool): + old_setting = self.perform_encoding + self.perform_encoding = perform_encoding + yield + self.perform_encoding = old_setting + + +DECODE_TO_STRINGS_ON_READ = NetcdfStringDecodeSetting() +DEFAULT_READ_ENCODING = "utf-8" +DEFAULT_WRITE_ENCODING = "ascii" class EncodedVariable(VariableWrapper): """A variable wrapper that translates variable data according to byte encodings.""" def __getitem__(self, keys): - if self.is_chardata(): - super().set_auto_chartostring(False) + if self._is_chardata(): + # N.B. 
we never need to UNset this, as we totally control it + self._contained_instance.set_auto_chartostring(False) data = super().__getitem__(keys) - if self.is_chardata(): - encoding = self.get_byte_encoding() - strlen = self.get_string_length() - data = decode_bytesarray_to_stringarray(data, encoding, strlen) + if DECODE_TO_STRINGS_ON_READ and self._is_chardata(): + encoding = self._get_encoding() or DEFAULT_READ_ENCODING + # N.B. typically, read encoding default is UTF-8 --> a "usually safe" choice + strlen = self._get_string_length() + try: + data = decode_bytesarray_to_stringarray(data, encoding, strlen) + except UnicodeDecodeError as err: + msg = ( + f"Character data in variable {self.name!r} could not be decoded" + f"with the {encoding!r} encoding. This can be fixed by setting the " + "variable '_Encoding' attribute to suit the content." + ) + raise ValueError(msg) from err return data def __setitem__(self, keys, data): - if self.is_chardata(): - encoding = self.get_byte_encoding() - strlen = self.get_string_length() - if encoding is not None: - data = encode_stringarray_as_bytearray(data, encoding, strlen) - else: + if self._is_chardata(): + # N.B. we never need to UNset this, as we totally control it + self._contained_instance.set_auto_chartostring(False) + + encoding = self._get_encoding() or DEFAULT_WRITE_ENCODING + # N.B. typically, write encoding default is "ascii" --> fails bad content + if data.dtype.kind == "U": try: - # Check if all characters are valid ascii - data = encode_stringarray_as_bytearray(data, "ascii", strlen) - except UnicodeEncodeError: - data = encode_stringarray_as_bytearray( - data, DEFAULT_ENCODING, strlen - ) - # As this was necessary, record the new encoding on the variable - self.set_ncattr("_Encoding", DEFAULT_ENCODING) + strlen = self._get_string_length() + data = encode_stringarray_as_bytearray(data, encoding, strlen) + except UnicodeEncodeError as err: msg = ( - f"Non-ascii data written to label variable {self.name}. 
" - f"Applied {DEFAULT_ENCODING!r} encoding, " - f"and set attribute _Encoding={DEFAULT_ENCODING!r}." + f"String data written to netcdf character variable {self.name!r} " + f"could not be represented in encoding {encoding!r}. This can be " + "fixed by setting a suitable variable '_Encoding' attribute, " + 'e.g. ._Encoding="UTF-8".' ) - warnings.warn(msg, UserWarning) - - super().set_auto_chartostring(False) + raise ValueError(msg) from err super().__setitem__(keys, data) - def is_chardata(self): + def _is_chardata(self): return np.issubdtype(self.dtype, np.bytes_) - def get_encoding(self) -> str | None: - """Get the effective byte encoding to be used for this variable.""" - # utf-8 is a reasonable "safe" default, equivalent to 'ascii' for ascii data + def _get_encoding(self) -> str | None: + """Get the byte encoding defined for this variable (or None).""" result = getattr(self, "_Encoding", None) if result is not None: try: @@ -165,18 +222,32 @@ def get_encoding(self) -> str | None: # NOTE: if encoding does not suit data, errors can occur. # For example, _Encoding = "ascii", with non-ascii content. except LookupError: - # Replace some invalid setting with "safe"(ish) fallback. + # Unrecognised encoding name : handle this as just a warning msg = f"Unknown encoding for variable {self.name!r}: {result!r}" warnings.warn(msg, UserWarning) return result - def get_string_length(self): - """Return the string-length defined for this variable (or None).""" - return getattr(self, "iris_string_length", None) + def _get_string_length(self): + """Return the string-length defined for this variable.""" + if not hasattr(self, "_strlen"): + # Work out the string length from the parent dataset dimensions. + strlen = self.group().dimensions[self.dimensions[-1]].size + # Cache this on the variable -- but not as a netcdf attribute (!) 
+ self.__dict__["_strlen"] = strlen + + return self._strlen + + def set_auto_chartostring(self, onoff: bool): + msg = "auto_chartostring is not supported by Iris 'EncodedVariable' type." + raise TypeError(msg) class EncodedDataset(DatasetWrapper): """A specialised DatasetWrapper whose variables perform byte encoding.""" VAR_WRAPPER_CLS = EncodedVariable + + def set_auto_chartostring(self, onoff: bool): + msg = "auto_chartostring is not supported by Iris 'EncodedDataset' type." + raise TypeError(msg) diff --git a/lib/iris/tests/integration/netcdf/test_chararrays.py b/lib/iris/tests/integration/netcdf/test_chararrays.py index 0eb211c8b0..4414444733 100644 --- a/lib/iris/tests/integration/netcdf/test_chararrays.py +++ b/lib/iris/tests/integration/netcdf/test_chararrays.py @@ -137,8 +137,11 @@ def make_testcube( def ncdump(nc_path: str, *args): """Call ncdump to print a dump of a file.""" - call_args = [NCDUMP_PATHSTR, nc_path] + list(*args) - subprocess.run(call_args, check=True) + call_args = [NCDUMP_PATHSTR, nc_path] + list(args) + bytes = subprocess.check_output(call_args) + text = bytes.decode("utf-8") + print(text) + return text def show_result(filepath): diff --git a/lib/iris/tests/unit/fileformats/netcdf/encoding_tests.txt b/lib/iris/tests/unit/fileformats/netcdf/encoding_tests.txt index bab04aa0c4..e77427cd63 100644 --- a/lib/iris/tests/unit/fileformats/netcdf/encoding_tests.txt +++ b/lib/iris/tests/unit/fileformats/netcdf/encoding_tests.txt @@ -1,8 +1,95 @@ +=========== +Outstanding Qs +* What would we like to do with all this IN IRIS?? + - generally present as string arrays (Uxx) + - existing scheme of naming dims for length + re-using is quite cunning! + - choice of seeing actual character arrays as alternative to string conversions? 
+ +* string length handling for load/save/roundtrip + - on SAVE, we need some control so we can create files which are compatible, + irrespective of the data (which currently we are not doing) + - ALSO this is wanted to ensure that multiple vars (e.g. string cubes or string coords) + will share the string dim -- instead of creating arbitrary different ones + - presumably, if encoding blows the max-len, we must get a warning/error + + - on LOAD, we may want to *capture* the actual original string dim length, so it can be + re-created on save (by some scheme, as per previous) -- i.e. enable roundtripping. + I don't really want to preserve the name of the string dim, but this could be a + slightly tender point. To consider also : the impact of this on the non-equivalence + of loaded cubes, if we use actual *attributes* to carry this info (see below). + - **if not** : just load data + convert to string arrays as seems best + - this will also lead to incompatible cubes. + + - on SAVE, in the absence of strlen-controls, what is a reasonable default choice? + - take longest encoded + - set nbytes = NEXPAND(encoding) * nchars + - sensible values would depend on the encoding... + : ascii -> 1 + : utf-8 -> 1 or 4 ??? + : utf-16 -> 2 or 4 ??? + : utf-32 -> 4 + + - on LOAD, in absence of strlen controls, how do we choose the result DTYPE (i.e. character length)? + - again, may depend on the encoding: + : ascii = "U" + : UTF-8 = "U" + : UTF-16 = "U" + : UTF-32 = "U" + - N.B. these are ll at least "safe" - i.e. won't lose characters + + +separately from these, there is the question of how the controls affect "normal" +cube operations. + - the easiest approach is to define a "special" attribute, + which can be set on any cube/component + - using the dtype-length of the data would be *possible*, in conjunction with the + above-proposed "default rules" for choosing strlen from the dtype. + But this might not round-trip in all cases. 
+ +within the actual data arrays + - we can't really expect any different to what numpy does + - that is, the dtype-length of any element <= that of the array (and not ==) + this may be tricky, but we can't easily prevent it. + >>> a = np.array(['', 'a', 'bb']) + >>> a + array(['', 'a', 'bb'], dtype='>> a[0].dtype + dtype('>> a[1].dtype + dtype('>> a[2].dtype + dtype('>> a.dtype + dtype('>> + - likewise, we can't assign without possible truncation. + If you **want** to expand the supported width, can use ".astype()" first ? + + +======================== +========================= forms in files: * char chardata(dim1, dim2, strlen_xx); # char data * string data(dim1, dim2); +netcdf types: +(netcdf docs terms) + NC_BYTE 8-bit signed integer + NC_UBYTE 8-bit unsigned integer + NC_CHAR 8-bit character + NC_STRING variable length character string + +***NOTE*** there is no NC_UCHAR or "unsigned char" type + + +relevant numpy base types (scalar dtypes): + * "S" bytes : np.bytes_ == np.int8 + * "B" unsigned bytes : np.ubyte == np.uint8 + * 'i' ints : np.int_ + * 'u' unsigned ints : np.int_ + * "U" unicode string : np.str_ + forms in numpy: * np.ndarray(dtype="S1") # char data * np.ndarray(dtype="Snn") # char data @@ -16,3 +103,80 @@ possibilities in createVariable: ** or the python str builtin (for a variable-length string array). ** Numpy string and unicode datatypes with length greater than one are aliases for str. """ + +test types: + "i1" : np.int8 + "u1" : np.uint8 + "S1" : np.byte_ + "U1" : np.str_ + "S" : + "U" : with/without non-ascii content + +save all these to files... +outputs from "test_nc_dtypes.py" test run: + SPEC:i1 SAVED-AS:int8 byte RELOAD-AS:int8 + SPEC:u1 SAVED-AS:uint8 ubyte RELOAD-AS:uint8 + SPEC:S1 SAVED-AS:|S1 char RELOAD-AS: () + SPEC:U1 SAVED-AS: EncodedDataset: + """Create a test EncodedDataset linked to an actual file. + + * strlen becomes the string dimension (i.e. 
a number of *bytes*) + * a variable "vxs" is created + * If 'encoding' is given, the "vxs::_Encoding" attribute is created with this value + """ + ds = EncodedDataset(path, "w") + ds.createDimension("x", 3) + ds.createDimension("strlen", strlen) + v = ds.createVariable("vxs", "S1", ("x", "strlen")) + if encoding is not None: + v.setncattr("_Encoding", encoding) + return ds + + +def fetch_undecoded_var(path, varname): + # Open a path as a "normal" dataset, and return a given variable. + ds_normal = DatasetWrapper(path) + ds_normal._contained_instance.set_auto_chartostring(False) + v = ds_normal.variables[varname] + # Return a variable, rather than its data, so we can check attributes etc. + return v + + +class TestWriteStrings: + """Test how string data is saved to a file.""" + + def test_write_strings(self, encoding, tempdir): + # Create a dataset with the variable + path = tempdir / f"test_writestrings_encoding_{encoding!s}.nc" + + if encoding in [None, "ascii"]: + writedata = samples_3_ascii + write_encoding = "ascii" + else: + writedata = samples_3_nonascii + write_encoding = encoding + + writedata = writedata.copy() # just for safety? + strlen = strings_maxbytes(writedata, write_encoding) + + ds_encoded = make_encoded_dataset(path, strlen, encoding) + v = ds_encoded.variables["vxs"] + + # Effectively, checks that we *can* write strings + v[:] = writedata + + # Close, re-open as an "ordinary" dataset, and check the raw content. 
+ ds_encoded.close() + v = fetch_undecoded_var(path, "vxs") + + # Check that the raw result is as expected + bytes_result = v[:] + expected = encode_stringarray_as_bytearray(writedata, write_encoding, strlen) + assert ( + bytes_result.shape == expected.shape + and bytes_result.dtype == expected.dtype + and np.all(bytes_result == expected) + ) + + # Check that the "_Encoding" property is also as expected + result_attr = v.getncattr("_Encoding") if "_Encoding" in v.ncattrs() else None + assert result_attr == encoding + + def test_scalar(self, tempdir): + # Like 'test_write_strings', but the variable has *only* the string dimension. + path = tempdir / "test_writestrings_scalar.nc" + + ds_encoded = make_encoded_dataset(path, strlen=5) + v = ds_encoded.createVariable("v0_scalar", "S1", ("strlen",)) + + # Checks that we *can* write a string + v[:] = np.array("stuff", dtype=str) + + # Close, re-open as an "ordinary" dataset, and check the raw content. + ds_encoded.close() + v = fetch_undecoded_var(path, "v0_scalar") + result = v[:] + + # Check that the raw result is as expected + assert ( + result.shape == (5,) + and result.dtype == " Date: Fri, 5 Dec 2025 12:51:04 +0000 Subject: [PATCH 16/43] Replace use of encoding functions with test-specific function: Test for overlength writes. 
--- .../netcdf/_bytecoding_datasets.py | 6 +- .../fileformats/netcdf/encoding_tests.txt | 15 +- .../netcdf/test_bytecoding_datasets.py | 194 ++++++++++++------ 3 files changed, 147 insertions(+), 68 deletions(-) diff --git a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py index 353f14d538..62e1dd2ab7 100644 --- a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py +++ b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py @@ -123,9 +123,10 @@ def flexi_encode_stringarray_as_bytearray( def encode_stringarray_as_bytearray( - data: np.ndarray, encoding: str, string_dimension_length: int + data: np.typing.ArrayLike, encoding: str, string_dimension_length: int ) -> np.ndarray: """Encode strings as a bytes array.""" + data = np.asanyarray(data) element_shape = data.shape result = np.zeros(element_shape + (string_dimension_length,), dtype="S1") right_pad = b"\0" * string_dimension_length @@ -179,7 +180,7 @@ def __getitem__(self, keys): data = decode_bytesarray_to_stringarray(data, encoding, strlen) except UnicodeDecodeError as err: msg = ( - f"Character data in variable {self.name!r} could not be decoded" + f"Character data in variable {self.name!r} could not be decoded " f"with the {encoding!r} encoding. This can be fixed by setting the " "variable '_Encoding' attribute to suit the content." ) @@ -188,6 +189,7 @@ def __getitem__(self, keys): return data def __setitem__(self, keys, data): + data = np.asanyarray(data) if self._is_chardata(): # N.B. we never need to UNset this, as we totally control it self._contained_instance.set_auto_chartostring(False) diff --git a/lib/iris/tests/unit/fileformats/netcdf/encoding_tests.txt b/lib/iris/tests/unit/fileformats/netcdf/encoding_tests.txt index e77427cd63..5fa021ccdd 100644 --- a/lib/iris/tests/unit/fileformats/netcdf/encoding_tests.txt +++ b/lib/iris/tests/unit/fileformats/netcdf/encoding_tests.txt @@ -146,12 +146,17 @@ Then, as regards the _Encoding .. TO TEST... 
========== -create a dataset + write char data - - X assign different encodings: makes no difference +NOTE on length control: + - not an API thing, it's implicit from when you create a variable + - this also applies to how it loads back + - BUT here there may be scope for a control attribute : -create a dataset + write STRING data - - X encoding=(ascii, utf-8, utf-32, None) - - X withnonascii=(T, F) ++++ create a dataset + write char data ++++ - X assign different encodings: makes no difference + ++++ create a dataset + write STRING data ++++ - X encoding=(ascii, utf-8, utf-32, None) ++++ - X withnonascii=(T, F) - X length=(long, short, none) read string data diff --git a/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py b/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py index 092da19a00..411212b973 100644 --- a/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py +++ b/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py @@ -9,11 +9,7 @@ import numpy as np import pytest -from iris.fileformats.netcdf._bytecoding_datasets import ( - EncodedDataset, - encode_stringarray_as_bytearray, - flexi_encode_stringarray_as_bytearray, -) +from iris.fileformats.netcdf._bytecoding_datasets import EncodedDataset from iris.fileformats.netcdf._thread_safe_nc import DatasetWrapper encoding_options = [None, "ascii", "utf-8", "utf-32"] @@ -66,8 +62,92 @@ def fetch_undecoded_var(path, varname): return v +def check_raw_content(path, varname, expected_byte_array): + v = fetch_undecoded_var(path, varname) + bytes_result = v[:] + assert ( + bytes_result.shape == expected_byte_array.shape + and bytes_result.dtype == expected_byte_array.dtype + and np.all(bytes_result == expected_byte_array) + ) + + +def _make_bytearray_inner(data, encoding): + # Convert to a (list of [lists of..]) strings or bytes to a + # (list of [lists of..]) length-1 bytes with an extra dimension. 
+ if isinstance(data, str): + # Convert input strings to bytes + data = data.encode(encoding) + if isinstance(data, bytes): + # iterate over bytes to get a sequence of length-1 bytes (what np.array wants) + result = [data[i : i + 1] for i in range(len(data))] + else: + # If not string/bytes, expect the input to be a list. + # N.B. the recursion is inefficient, but we don't care about that here + result = [_make_bytearray_inner(part, encoding) for part in data] + return result + + +def make_bytearray(data, encoding="ascii"): + """Convert bytes or lists of bytes into a numpy byte array. + + This is largely to avoid using "encode_stringarray_as_bytearray", since we don't + want to depend on that when we should be testing it. + So, it mostly replicates the function of that, but it does also support bytes in the + input, and it automatically finds + applies the maximum bytes-lengths in the input. + """ + # First, Convert to a (list of [lists of]..) length-1 bytes objects + data = _make_bytearray_inner(data, encoding) + + # Numbers of bytes in the inner dimension are the lengths of bytes/strings input, + # so they aren't all the same. + # To enable array conversion, we fix that by expanding all to the max length + + def get_maxlen(data): + # Find the maximum number of bytes in the inner dimension. + if not isinstance(data, list): + # Inner bytes object + assert isinstance(data, bytes) + longest = len(data) + else: + # We have a list: either a list of bytes, or a list of lists. 
+ if len(data) == 0 or not isinstance(data[0], list): + # inner-most list, should contain bytes if anything + assert len(data) == 0 or isinstance(data[0], bytes) + # return n-bytes + longest = len(data) + else: + # list of lists: return max length of sub-lists + longest = max(get_maxlen(part) for part in data) + return longest + + maxlen = get_maxlen(data) + + def extend_all_to_maxlen(data, length, filler=b"\0"): + # Extend each "innermost" list (of single bytes) to the required length + if isinstance(data, list): + if len(data) == 0 or not isinstance(data[0], list): + # Pad all the inner-most lists to the required number of elements + n_extra = length - len(data) + if n_extra > 0: + data = data + [filler] * n_extra + else: + data = [extend_all_to_maxlen(part, length, filler) for part in data] + return data + + data = extend_all_to_maxlen(data, maxlen) + # We should now be able to create an array of single bytes. + result = np.array(data) + assert result.dtype == " Date: Fri, 5 Dec 2025 14:47:54 +0000 Subject: [PATCH 17/43] Radically simplify 'make_bytesarray', by using a known specified bytewidth. --- .../netcdf/test_bytecoding_datasets.py | 76 ++++++------------- 1 file changed, 22 insertions(+), 54 deletions(-) diff --git a/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py b/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py index 411212b973..9ef354f850 100644 --- a/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py +++ b/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py @@ -72,7 +72,7 @@ def check_raw_content(path, varname, expected_byte_array): ) -def _make_bytearray_inner(data, encoding): +def _make_bytearray_inner(data, bytewidth, encoding): # Convert to a (list of [lists of..]) strings or bytes to a # (list of [lists of..]) length-1 bytes with an extra dimension. 
if isinstance(data, str): @@ -81,61 +81,25 @@ def _make_bytearray_inner(data, encoding): if isinstance(data, bytes): # iterate over bytes to get a sequence of length-1 bytes (what np.array wants) result = [data[i : i + 1] for i in range(len(data))] + # pad or truncate everything to the required bytewidth + result = (result + [b"\0"] * bytewidth)[:bytewidth] else: # If not string/bytes, expect the input to be a list. # N.B. the recursion is inefficient, but we don't care about that here - result = [_make_bytearray_inner(part, encoding) for part in data] + result = [_make_bytearray_inner(part, bytewidth, encoding) for part in data] return result -def make_bytearray(data, encoding="ascii"): +def make_bytearray(data, bytewidth, encoding="ascii"): """Convert bytes or lists of bytes into a numpy byte array. This is largely to avoid using "encode_stringarray_as_bytearray", since we don't want to depend on that when we should be testing it. So, it mostly replicates the function of that, but it does also support bytes in the - input, and it automatically finds + applies the maximum bytes-lengths in the input. + input. """ # First, Convert to a (list of [lists of]..) length-1 bytes objects - data = _make_bytearray_inner(data, encoding) - - # Numbers of bytes in the inner dimension are the lengths of bytes/strings input, - # so they aren't all the same. - # To enable array conversion, we fix that by expanding all to the max length - - def get_maxlen(data): - # Find the maximum number of bytes in the inner dimension. - if not isinstance(data, list): - # Inner bytes object - assert isinstance(data, bytes) - longest = len(data) - else: - # We have a list: either a list of bytes, or a list of lists. 
- if len(data) == 0 or not isinstance(data[0], list): - # inner-most list, should contain bytes if anything - assert len(data) == 0 or isinstance(data[0], bytes) - # return n-bytes - longest = len(data) - else: - # list of lists: return max length of sub-lists - longest = max(get_maxlen(part) for part in data) - return longest - - maxlen = get_maxlen(data) - - def extend_all_to_maxlen(data, length, filler=b"\0"): - # Extend each "innermost" list (of single bytes) to the required length - if isinstance(data, list): - if len(data) == 0 or not isinstance(data[0], list): - # Pad all the inner-most lists to the required number of elements - n_extra = length - len(data) - if n_extra > 0: - data = data + [filler] * n_extra - else: - data = [extend_all_to_maxlen(part, length, filler) for part in data] - return data - - data = extend_all_to_maxlen(data, maxlen) + data = _make_bytearray_inner(data, bytewidth, encoding) # We should now be able to create an array of single bytes. result = np.array(data) assert result.dtype == " Date: Fri, 5 Dec 2025 16:23:55 +0000 Subject: [PATCH 18/43] Add read tests. --- .../netcdf/_bytecoding_datasets.py | 38 +++- .../netcdf/test_bytecoding_datasets.py | 165 ++++++++++++++++-- 2 files changed, 184 insertions(+), 19 deletions(-) diff --git a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py index 62e1dd2ab7..3bdc799d7f 100644 --- a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py +++ b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py @@ -175,7 +175,7 @@ def __getitem__(self, keys): if DECODE_TO_STRINGS_ON_READ and self._is_chardata(): encoding = self._get_encoding() or DEFAULT_READ_ENCODING # N.B. 
typically, read encoding default is UTF-8 --> a "usually safe" choice - strlen = self._get_string_length() + strlen = self._get_string_width() try: data = decode_bytesarray_to_stringarray(data, encoding, strlen) except UnicodeDecodeError as err: @@ -194,11 +194,11 @@ def __setitem__(self, keys, data): # N.B. we never need to UNset this, as we totally control it self._contained_instance.set_auto_chartostring(False) - encoding = self._get_encoding() or DEFAULT_WRITE_ENCODING # N.B. typically, write encoding default is "ascii" --> fails bad content if data.dtype.kind == "U": try: - strlen = self._get_string_length() + encoding = self._get_encoding() or DEFAULT_WRITE_ENCODING + strlen = self._get_byte_width() data = encode_stringarray_as_bytearray(data, encoding, strlen) except UnicodeEncodeError as err: msg = ( @@ -230,12 +230,36 @@ def _get_encoding(self) -> str | None: return result - def _get_string_length(self): + def _get_byte_width(self) -> int | None: + if not hasattr(self, "_bytewidth"): + n_bytes = self.group().dimensions[self.dimensions[-1]].size + # Cache this length control on the variable -- but not as a netcdf attribute + self.__dict__["_bytewidth"] = n_bytes + + return self.__dict__["_bytewidth"] + + def _get_string_width(self): """Return the string-length defined for this variable.""" if not hasattr(self, "_strlen"): - # Work out the string length from the parent dataset dimensions. - strlen = self.group().dimensions[self.dimensions[-1]].size - # Cache this on the variable -- but not as a netcdf attribute (!) + if hasattr(self, "iris_string_width"): + strlen = self.get_ncattr("iris_string_width") + else: + # Work out the actual byte width from the parent dataset dimensions. + strlen = self._get_byte_width() + # Convert the string dimension length (i.e. bytes) to a sufficiently-long + # string width, depending on the encoding used. 
+ encoding = self._get_encoding() or DEFAULT_READ_ENCODING + # regularise the name for comparison with recognised ones + encoding = codecs.lookup(encoding).name + if "utf-16" in encoding: + # Each char needs at least 2 bytes -- including a terminator char + strlen = (strlen // 2) - 1 + elif "utf-32" in encoding: + # Each char needs exactly 4 bytes -- including a terminator char + strlen = (strlen // 4) - 1 + # "ELSE": assume there can be (at most) as many chars as bytes + + # Cache this length control on the variable -- but not as a netcdf attribute self.__dict__["_strlen"] = strlen return self._strlen diff --git a/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py b/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py index 9ef354f850..5df511103f 100644 --- a/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py +++ b/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py @@ -9,7 +9,10 @@ import numpy as np import pytest -from iris.fileformats.netcdf._bytecoding_datasets import EncodedDataset +from iris.fileformats.netcdf._bytecoding_datasets import ( + DECODE_TO_STRINGS_ON_READ, + EncodedDataset, +) from iris.fileformats.netcdf._thread_safe_nc import DatasetWrapper encoding_options = [None, "ascii", "utf-8", "utf-32"] @@ -62,14 +65,17 @@ def fetch_undecoded_var(path, varname): return v +def check_array_matching(arr1, arr2): + """Check for arrays matching shape, dtype and content.""" + assert ( + arr1.shape == arr2.shape and arr1.dtype == arr2.dtype and np.all(arr1 == arr2) + ) + + def check_raw_content(path, varname, expected_byte_array): v = fetch_undecoded_var(path, varname) bytes_result = v[:] - assert ( - bytes_result.shape == expected_byte_array.shape - and bytes_result.dtype == expected_byte_array.dtype - and np.all(bytes_result == expected_byte_array) - ) + check_array_matching(bytes_result, expected_byte_array) def _make_bytearray_inner(data, bytewidth, encoding): @@ -102,7 +108,7 @@ def 
make_bytearray(data, bytewidth, encoding="ascii"): data = _make_bytearray_inner(data, bytewidth, encoding) # We should now be able to create an array of single bytes. result = np.array(data) - assert result.dtype == " string array + result = v[:] + expected = write_strings + if encoding == "utf-8": + # In this case, with the given non-ascii sample data, the + # "default minimum string length" is overestimated. + assert strlen == 7 and result.dtype == "U7" + # correct the result dtype to pass the write_strings comparison below + truncated_result = result.astype("U4") + # Also check that content is the same (i.e. not actually truncated) + assert np.all(truncated_result == result) + result = truncated_result + else: + # Test "raw" read --> byte array + with DECODE_TO_STRINGS_ON_READ.context(False): + result = v[:] + expected = write_bytes + + check_array_matching(result, expected) + + def test_scalar(self, tempdir, readmode): + # Like 'test_write_strings', but the variable has *only* the string dimension. + path = tempdir / f"test_read_scalar_{readmode}.nc" + + strlen = 5 + ds_encoded = make_encoded_dataset(path, strlen=strlen) + v = ds_encoded.createVariable("v0_scalar", "S1", ("strlen",)) + + data_string = "stuff" + data_bytes = make_bytearray(data_string, 5) + + # Checks that we *can* write a string + v[:] = data_bytes + + if readmode == "strings": + # Test "normal" read --> string array + result = v[:] + expected = np.array(data_string) + else: + # Test "raw" read --> byte array + with DECODE_TO_STRINGS_ON_READ.context(False): + result = v[:] + expected = data_bytes + + check_array_matching(result, expected) + + def test_multidim(self, tempdir, readmode): + # Like 'test_write_strings', but the variable has additional dimensions. 
+ path = tempdir / f"test_read_multidim_{readmode}.nc" + + strlen = 5 + ds_encoded = make_encoded_dataset(path, strlen=strlen) + ds_encoded.createDimension("y", 2) + v = ds_encoded.createVariable( + "vyxn", + "S1", + ( + "y", + "x", + "strlen", + ), + ) + + # Check that we *can* write a multidimensional string array + test_strings = [ + ["one", "n", ""], + ["two", "xxxxx", "four"], + ] + test_bytes = make_bytearray(test_strings, strlen) + v[:] = test_bytes + + if readmode == "strings": + # Test "normal" read --> string array + result = v[:] + expected = np.array(test_strings) + else: + # Test "raw" read --> byte array + with DECODE_TO_STRINGS_ON_READ.context(False): + result = v[:] + expected = test_bytes + + check_array_matching(result, expected) + + def test_read_encoding_failure(self, tempdir, readmode): + path = tempdir / f"test_read_encoding_failure_{readmode}.nc" + strlen = 10 + ds = make_encoded_dataset(path, strlen=strlen, encoding="ascii") + v = ds.variables["vxs"] + test_utf8_bytes = make_bytearray( + samples_3_nonascii, bytewidth=strlen, encoding="utf-8" + ) + v[:] = test_utf8_bytes + + if readmode == "strings": + msg = ( + "Character data in variable 'vxs' could not be decoded " + "with the 'ascii' encoding." + ) + with pytest.raises(ValueError, match=msg): + v[:] + else: + with DECODE_TO_STRINGS_ON_READ.context(False): + result = v[:] # this ought to be ok! - def test_encodings(self, encoding): - pass + assert np.all(result == test_utf8_bytes) From cf048b242fe89ceadee2cdd144354b2a17bb33fb Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Fri, 5 Dec 2025 16:26:13 +0000 Subject: [PATCH 19/43] Remove iris width control (not in this layer). 
--- .../netcdf/_bytecoding_datasets.py | 31 +++++++++---------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py index 3bdc799d7f..5ed156f3ee 100644 --- a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py +++ b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py @@ -241,23 +241,20 @@ def _get_byte_width(self) -> int | None: def _get_string_width(self): """Return the string-length defined for this variable.""" if not hasattr(self, "_strlen"): - if hasattr(self, "iris_string_width"): - strlen = self.get_ncattr("iris_string_width") - else: - # Work out the actual byte width from the parent dataset dimensions. - strlen = self._get_byte_width() - # Convert the string dimension length (i.e. bytes) to a sufficiently-long - # string width, depending on the encoding used. - encoding = self._get_encoding() or DEFAULT_READ_ENCODING - # regularise the name for comparison with recognised ones - encoding = codecs.lookup(encoding).name - if "utf-16" in encoding: - # Each char needs at least 2 bytes -- including a terminator char - strlen = (strlen // 2) - 1 - elif "utf-32" in encoding: - # Each char needs exactly 4 bytes -- including a terminator char - strlen = (strlen // 4) - 1 - # "ELSE": assume there can be (at most) as many chars as bytes + # Work out the actual byte width from the parent dataset dimensions. + strlen = self._get_byte_width() + # Convert the string dimension length (i.e. bytes) to a sufficiently-long + # string width, depending on the encoding used. 
+ encoding = self._get_encoding() or DEFAULT_READ_ENCODING + # regularise the name for comparison with recognised ones + encoding = codecs.lookup(encoding).name + if "utf-16" in encoding: + # Each char needs at least 2 bytes -- including a terminator char + strlen = (strlen // 2) - 1 + elif "utf-32" in encoding: + # Each char needs exactly 4 bytes -- including a terminator char + strlen = (strlen // 4) - 1 + # "ELSE": assume there can be (at most) as many chars as bytes # Cache this length control on the variable -- but not as a netcdf attribute self.__dict__["_strlen"] = strlen From e684d1d8b197ef9ce5d1862ea42b3cae41ec8100 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Fri, 5 Dec 2025 17:55:12 +0000 Subject: [PATCH 20/43] more notes --- .../fileformats/netcdf/encoding_tests.txt | 26 +++++++++++-------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/lib/iris/tests/unit/fileformats/netcdf/encoding_tests.txt b/lib/iris/tests/unit/fileformats/netcdf/encoding_tests.txt index 5fa021ccdd..07a0bc3bcd 100644 --- a/lib/iris/tests/unit/fileformats/netcdf/encoding_tests.txt +++ b/lib/iris/tests/unit/fileformats/netcdf/encoding_tests.txt @@ -146,25 +146,21 @@ Then, as regards the _Encoding .. TO TEST... 
========== -NOTE on length control: - - not an API thing, it's implicit from when you create a variable - - this also applies to how it loads back - - BUT here there may be scope for a control attribute : - +++ create a dataset + write char data +++ - X assign different encodings: makes no difference +++ create a dataset + write STRING data +++ - X encoding=(ascii, utf-8, utf-32, None) +++ - X withnonascii=(T, F) - - X length=(long, short, none) +XXXX - X length=(long, short, none) + ***deferred*** to layer above only -read string data - - X encoding=(ascii, utf-8, utf-32, None) - - X withnonascii=(T, F) ++++ read string data ++++ - X encoding=(ascii, utf-8, utf-32, None) ++++ - X withnonascii=(T, F) -read char data (with control) - - X different encodings: make no difference ++++ read char data (with control) ++++ - X different encodings: make no difference ==rethought== write strings @@ -185,3 +181,11 @@ write char data read char data - X encodings: don't matter +--- +NOTEs on length control: +not an API thing, it's implicit from when you create a variable +this also applies to how it loads back +BUT here there may be scope for a control attribute : + "iris_string_dim" - controls width on creation + reading back + + From a20cc45d8a0cea200d7f6fb3531b8383dfa74c10 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Mon, 19 Jan 2026 14:56:33 +0000 Subject: [PATCH 21/43] Remove temporary test code. --- .../unit/fileformats/netcdf/test_nc_dtypes.py | 96 ------------------- 1 file changed, 96 deletions(-) delete mode 100644 lib/iris/tests/unit/fileformats/netcdf/test_nc_dtypes.py diff --git a/lib/iris/tests/unit/fileformats/netcdf/test_nc_dtypes.py b/lib/iris/tests/unit/fileformats/netcdf/test_nc_dtypes.py deleted file mode 100644 index 0c5d2b279e..0000000000 --- a/lib/iris/tests/unit/fileformats/netcdf/test_nc_dtypes.py +++ /dev/null @@ -1,96 +0,0 @@ -# Copyright Iris contributors -# -# This file is part of Iris and is released under the BSD license. 
-# See LICENSE in the root of the repository for full licensing details. -"""Temporary code to confirm how various numpy dtypes are stored in a netcdf file.""" - -import netCDF4 as nc -import numpy as np -import pytest - -from iris.tests.integration.netcdf.test_chararrays import ncdump - -# types = [ -# "i1", # np.int8 -# "u1", # np.uint8 -# "S1", # np.byte_ -# "U1", # np.str_ -# "S", # multibytes -# "U", # unicode strings, with/without non-ascii content -# ] - -samples = { - "i1": [-5, 7, 35], # np.int8 - "u1": [65, 67, 90], # np.uint8 - "S1": [b"A", b"B", b"Z"], # np.byte_ - "U1": ["A", "B", "C"], # np.str_ - "S": [b"one21", b"three", b""], # multibyte - "U": ["one", "éclair", "nine"], # unicode strings -} -sample_arrays = { - type_code: np.array(values, dtype=type_code) - for type_code, values in samples.items() -} - - -@pytest.fixture(scope="module") -def tmpdir(tmp_path_factory): - return tmp_path_factory.mktemp("netcdf") - - -def create_file(array: np.ndarray, path): - with nc.Dataset(str(path), "w") as ds: - ds.createDimension("x", 3) - v = ds.createVariable("vx", array.dtype, ("x",)) - # v.set_auto_chartostring(False) - v._Encoding = "UTF-8" if array.dtype.kind == "U" else "ascii" - v[:] = array - - -def get_loadback_array(path): - with nc.Dataset(str(path), "r") as ds: - v = ds.variables["vx"] - v.set_auto_chartostring(False) - result = v[:] - return result - - -@pytest.mark.parametrize("dtype", list(samples.keys())) -def test(tmpdir, dtype): - arr = sample_arrays[dtype] - print("\n---") - print(dtype) - path = tmpdir / f"tmp_{dtype}.nc" - create_file(arr, path) - ncdump(path, "-s") - loadback_array = get_loadback_array(path) - print(f" SPEC:{dtype} SAVED-AS:{arr.dtype} RELOAD-AS:{loadback_array.dtype}") - - -# from iris.tests import env_bin_path -# NCGEN_PATHSTR = str(env_bin_path("ncgen")) -# -# -# def ncgen(cdl_path, nc_path, *args): -# """Call ncdump to print a dump of a file.""" -# args = list(args) -# if not any(arg.startswith('-k') for arg in args): 
-# args[:0] = ["-k", "nc4"] # force netcdf4 -# call_args = [NCGEN_PATHSTR] + list(args) + [str(cdl_path), '-o', str(nc_path)] -# subprocess.check_call(call_args) -# -# -# def test_uchar(tmpdir): -# arr = sample_arrays["S1"] -# path = tmpdir / f"tmp_ichar.nc" -# create_file(arr, path) -# text = ncdump(path, "-s") -# text_u = text.replace("\t", " ") -# text_u = text_u.replace(" char ", " unsigned char ") -# cdl_path = tmpdir / f"tmp_uchar.cdl" -# with open(cdl_path, "w") as f_out: -# f_out.write(text_u) -# nc_path_2 = tmpdir / f"tmp_uchar.nc" -# ncgen(cdl_path, nc_path_2) -# loadback_array = get_loadback_array(nc_path_2) -# print(f" netcdf type 'uchar' LOADS-AS:{loadback_array.dtype}") From c995a8df4bfb59b44f1dba41ea6e6a62410ec1a4 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Mon, 19 Jan 2026 15:40:26 +0000 Subject: [PATCH 22/43] Use iris categorised warnings for unknown encodings. --- .../netcdf/_bytecoding_datasets.py | 19 +++++++--- .../netcdf/test_bytecoding_datasets.py | 36 ++++++++++++++++--- 2 files changed, 46 insertions(+), 9 deletions(-) diff --git a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py index 5ed156f3ee..f1fe184729 100644 --- a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py +++ b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py @@ -48,6 +48,8 @@ import numpy as np from iris.fileformats.netcdf._thread_safe_nc import DatasetWrapper, VariableWrapper +import iris.warnings +from iris.warnings import IrisCfLoadWarning, IrisCfSaveWarning def decode_bytesarray_to_stringarray( @@ -197,7 +199,9 @@ def __setitem__(self, keys, data): # N.B. 
typically, write encoding default is "ascii" --> fails bad content if data.dtype.kind == "U": try: - encoding = self._get_encoding() or DEFAULT_WRITE_ENCODING + encoding = ( + self._get_encoding(writing=True) or DEFAULT_WRITE_ENCODING + ) strlen = self._get_byte_width() data = encode_stringarray_as_bytearray(data, encoding, strlen) except UnicodeEncodeError as err: @@ -214,7 +218,7 @@ def __setitem__(self, keys, data): def _is_chardata(self): return np.issubdtype(self.dtype, np.bytes_) - def _get_encoding(self) -> str | None: + def _get_encoding(self, writing=False) -> str | None: """Get the byte encoding defined for this variable (or None).""" result = getattr(self, "_Encoding", None) if result is not None: @@ -225,9 +229,14 @@ def _get_encoding(self) -> str | None: # For example, _Encoding = "ascii", with non-ascii content. except LookupError: # Unrecognised encoding name : handle this as just a warning - msg = f"Unknown encoding for variable {self.name!r}: {result!r}" - warnings.warn(msg, UserWarning) - + msg = ( + f"Ignoring unknown encoding for variable {self.name!r}: " + f"_Encoding = {result!r}." 
+ ) + warntype = IrisCfSaveWarning if writing else IrisCfLoadWarning + warnings.warn(msg, warntype) + # Proceed as if there is no specified encoding + result = None return result def _get_byte_width(self) -> int | None: diff --git a/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py b/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py index 5df511103f..861ec2c516 100644 --- a/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py +++ b/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py @@ -14,6 +14,7 @@ EncodedDataset, ) from iris.fileformats.netcdf._thread_safe_nc import DatasetWrapper +from iris.warnings import IrisCfLoadWarning, IrisCfSaveWarning encoding_options = [None, "ascii", "utf-8", "utf-32"] @@ -194,17 +195,29 @@ def test_multidim(self, tempdir): expected_bytes = make_bytearray(test_data, strlen) check_raw_content(path, "vyxn", expected_bytes) - def test_write_encoding_failure(self, tempdir): - path = tempdir / "test_writestrings_encoding_failure.nc" - ds = make_encoded_dataset(path, strlen=5, encoding="ascii") + @pytest.mark.parametrize("encoding", [None, "ascii"]) + def test_write_encoding_failure(self, tempdir, encoding): + path = tempdir / f"test_writestrings_encoding_{encoding}_fail.nc" + ds = make_encoded_dataset(path, strlen=5, encoding=encoding) v = ds.variables["vxs"] + encoding_name = encoding + if encoding_name == None: + encoding_name = "ascii" msg = ( "String data written to netcdf character variable 'vxs'.*" - " could not be represented in encoding 'ascii'. " + f" could not be represented in encoding '{encoding_name}'. " ) with pytest.raises(ValueError, match=msg): v[:] = samples_3_nonascii + def test_write_badencoding_ignore(self, tempdir): + path = tempdir / "test_writestrings_badencoding_ignore.nc" + ds = make_encoded_dataset(path, strlen=5, encoding="unknown") + v = ds.variables["vxs"] + msg = r"Ignoring unknown encoding for variable 'vxs': _Encoding = 'unknown'\." 
+ with pytest.warns(IrisCfSaveWarning, match=msg): + v[:] = samples_3_ascii # will work OK + def test_overlength(self, tempdir): # Check expected behaviour with over-length data path = tempdir / "test_writestrings_overlength.nc" @@ -404,3 +417,18 @@ def test_read_encoding_failure(self, tempdir, readmode): result = v[:] # this ought to be ok! assert np.all(result == test_utf8_bytes) + + def test_read_badencoding_ignore(self, tempdir): + path = tempdir / f"test_read_badencoding_ignore.nc" + strlen = 10 + ds = make_encoded_dataset(path, strlen=strlen, encoding="unknown") + v = ds.variables["vxs"] + test_utf8_bytes = make_bytearray( + samples_3_nonascii, bytewidth=strlen, encoding="utf-8" + ) + v[:] = test_utf8_bytes + + msg = r"Ignoring unknown encoding for variable 'vxs': _Encoding = 'unknown'\." + with pytest.warns(IrisCfLoadWarning, match=msg): + # raises warning but succeeds, due to default read encoding of 'utf-8' + v[:] From f118c18117c1d02c1beac968d45fda75bff88103 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Mon, 19 Jan 2026 15:54:46 +0000 Subject: [PATCH 23/43] Clarify the temporary load/save exercising tests (a bit). --- .../tests/integration/netcdf/test_chararrays.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/lib/iris/tests/integration/netcdf/test_chararrays.py b/lib/iris/tests/integration/netcdf/test_chararrays.py index 4414444733..3a4a3e1879 100644 --- a/lib/iris/tests/integration/netcdf/test_chararrays.py +++ b/lib/iris/tests/integration/netcdf/test_chararrays.py @@ -189,7 +189,7 @@ def save_dir(tmp_path_factory): # TODO: the tests don't test things properly yet, they just exercise the code and print # things for manual debugging. 
-tsts = ( +test_encodings = ( None, "ascii", "utf-8", @@ -201,8 +201,13 @@ def save_dir(tmp_path_factory): # tsts = ("utf-8", "ascii", "utf-8") -@pytest.mark.parametrize("encoding", tsts) +@pytest.mark.parametrize("encoding", test_encodings) def test_load_encodings(encoding, save_dir): + """Load exercise. + + Make a testfile with utf-8 content, variously labelled. + Load with Iris + show result (error or cubes). + """ # small change print(f"\n=========\nTesting encoding: {encoding}") filepath = save_dir / f"tmp_load_{str(encoding)}.nc" @@ -219,8 +224,13 @@ def test_load_encodings(encoding, save_dir): show_result(filepath) -@pytest.mark.parametrize("encoding", tsts) +@pytest.mark.parametrize("encoding", test_encodings) def test_save_encodings(encoding, save_dir): + """Save exercise. + + Make test-cube with non-ascii content, and various '_Encoding' labels. + Save with Iris + show result (error or ncdump). + """ cube = make_testcube( dataarray=TEST_STRINGS, coordarray=TEST_COORD_VALS, encoding_str=encoding ) From c8a27df7e2f38640d9f963d5d3fcad626f85c509 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Sat, 17 Jan 2026 18:11:44 +0000 Subject: [PATCH 24/43] Use bytecoded_datasets in nc load+save, begin fixes. 
--- lib/iris/fileformats/cf.py | 6 +- .../netcdf/_bytecoding_datasets.py | 29 +++++++- .../fileformats/netcdf/_thread_safe_nc.py | 7 +- lib/iris/fileformats/netcdf/loader.py | 4 +- lib/iris/fileformats/netcdf/saver.py | 67 +++++++++++-------- .../integration/netcdf/test_chararrays.py | 13 +++- 6 files changed, 87 insertions(+), 39 deletions(-) diff --git a/lib/iris/fileformats/cf.py b/lib/iris/fileformats/cf.py index 5abc525109..6e4b8f99e1 100644 --- a/lib/iris/fileformats/cf.py +++ b/lib/iris/fileformats/cf.py @@ -26,7 +26,7 @@ import iris.exceptions import iris.fileformats._nc_load_rules.helpers as hh -from iris.fileformats.netcdf import _thread_safe_nc +from iris.fileformats.netcdf import _bytecoding_datasets from iris.mesh.components import Connectivity import iris.util import iris.warnings @@ -1373,7 +1373,9 @@ def __init__(self, file_source, warn=False, monotonic=False): if isinstance(file_source, str): # Create from filepath : open it + own it (=close when we die). self._filename = os.path.expanduser(file_source) - self._dataset = _thread_safe_nc.DatasetWrapper(self._filename, mode="r") + self._dataset = _bytecoding_datasets.EncodedDataset( + self._filename, mode="r" + ) self._own_file = True else: # We have been passed an open dataset. 
diff --git a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py index f1fe184729..a8dfca2b21 100644 --- a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py +++ b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py @@ -47,7 +47,12 @@ import numpy as np -from iris.fileformats.netcdf._thread_safe_nc import DatasetWrapper, VariableWrapper +from iris.fileformats.netcdf._thread_safe_nc import ( + DatasetWrapper, + NetCDFDataProxy, + NetCDFWriteProxy, + VariableWrapper, +) import iris.warnings from iris.warnings import IrisCfLoadWarning, IrisCfSaveWarning @@ -133,7 +138,19 @@ def encode_stringarray_as_bytearray( result = np.zeros(element_shape + (string_dimension_length,), dtype="S1") right_pad = b"\0" * string_dimension_length for index in np.ndindex(element_shape): - bytes = data[index].encode(encoding=encoding) + string = data[index] + bytes = string.encode(encoding=encoding) + n_bytes = len(bytes) + # TODO: may want to issue warning or error if we overflow the length? + if n_bytes > string_dimension_length: + from iris.exceptions import TranslationError + + msg = ( + f"Non-ascii string {string!r} written to netcdf exceeds string " + f"dimension : {n_bytes} > {string_dimension_length}." + ) + raise TranslationError(msg) + # It's all a bit nasty ... bytes = (bytes + right_pad)[:string_dimension_length] result[index] = [bytes[i : i + 1] for i in range(string_dimension_length)] @@ -283,3 +300,11 @@ class EncodedDataset(DatasetWrapper): def set_auto_chartostring(self, onoff: bool): msg = "auto_chartostring is not supported by Iris 'EncodedDataset' type." 
raise TypeError(msg) + + +class EncodedNetCDFDataProxy(NetCDFDataProxy): + DATASET_CLASS = EncodedDataset + + +class EncodedNetCDFWriteProxy(NetCDFWriteProxy): + DATASET_CLASS = EncodedDataset diff --git a/lib/iris/fileformats/netcdf/_thread_safe_nc.py b/lib/iris/fileformats/netcdf/_thread_safe_nc.py index 46b8609bb7..cd97452dac 100644 --- a/lib/iris/fileformats/netcdf/_thread_safe_nc.py +++ b/lib/iris/fileformats/netcdf/_thread_safe_nc.py @@ -315,6 +315,7 @@ class NetCDFDataProxy: """A reference to the data payload of a single NetCDF file variable.""" __slots__ = ("shape", "dtype", "path", "variable_name", "fill_value") + DATASET_CLASS = netCDF4.Dataset def __init__(self, shape, dtype, path, variable_name, fill_value): self.shape = shape @@ -337,7 +338,7 @@ def __getitem__(self, keys): # netCDF4 library, presumably because __getitem__ gets called so many # times by Dask. Use _GLOBAL_NETCDF4_LOCK directly instead. with _GLOBAL_NETCDF4_LOCK: - dataset = netCDF4.Dataset(self.path) + dataset = self.DATASET_CLASS(self.path) try: variable = dataset.variables[self.variable_name] # Get the NetCDF variable data and slice. @@ -374,6 +375,8 @@ class NetCDFWriteProxy: TODO: could be improved with a caching scheme, but this just about works. """ + DATASET_CLASS = netCDF4.Dataset + def __init__(self, filepath, cf_var, file_write_lock): self.path = filepath self.varname = cf_var.name @@ -401,7 +404,7 @@ def __setitem__(self, keys, array_data): # investigation needed. 
for attempt in range(5): try: - dataset = netCDF4.Dataset(self.path, "r+") + dataset = self.DATASET_CLASS(self.path, "r+") break except OSError: if attempt < 4: diff --git a/lib/iris/fileformats/netcdf/loader.py b/lib/iris/fileformats/netcdf/loader.py index 219f681e67..d363e29738 100644 --- a/lib/iris/fileformats/netcdf/loader.py +++ b/lib/iris/fileformats/netcdf/loader.py @@ -36,7 +36,7 @@ import iris.coord_systems import iris.coords import iris.fileformats.cf -from iris.fileformats.netcdf import _thread_safe_nc +from iris.fileformats.netcdf import _bytecoding_datasets, _thread_safe_nc from iris.fileformats.netcdf.saver import _CF_ATTRS import iris.io import iris.util @@ -50,7 +50,7 @@ # An expected part of the public loader API, but includes thread safety # concerns so is housed in _thread_safe_nc. -NetCDFDataProxy = _thread_safe_nc.NetCDFDataProxy +NetCDFDataProxy = _bytecoding_datasets.EncodedNetCDFDataProxy class _WarnComboIgnoringBoundsLoad( diff --git a/lib/iris/fileformats/netcdf/saver.py b/lib/iris/fileformats/netcdf/saver.py index 3d7c1dee19..f832ad1e8d 100644 --- a/lib/iris/fileformats/netcdf/saver.py +++ b/lib/iris/fileformats/netcdf/saver.py @@ -14,6 +14,7 @@ """ +import codecs import collections from itertools import repeat, zip_longest import os @@ -48,7 +49,8 @@ from iris.coords import AncillaryVariable, AuxCoord, CellMeasure, DimCoord import iris.exceptions import iris.fileformats.cf -from iris.fileformats.netcdf import _dask_locks, _thread_safe_nc +from iris.fileformats.netcdf import _bytecoding_datasets as bytecoding_datasets +from iris.fileformats.netcdf import _dask_locks from iris.fileformats.netcdf._attribute_handlers import ATTRIBUTE_HANDLERS import iris.io import iris.util @@ -300,7 +302,7 @@ class VariableEmulator(typing.Protocol): shape: tuple[int, ...] 
-CFVariable = typing.Union[_thread_safe_nc.VariableWrapper, VariableEmulator] +CFVariable = typing.Union[bytecoding_datasets.VariableWrapper, VariableEmulator] class Saver: @@ -403,7 +405,7 @@ def __init__(self, filename, netcdf_format, compute=True): # Put it inside a _thread_safe_nc wrapper to ensure thread-safety. # Except if it already is one, since they forbid "re-wrapping". if not hasattr(self._dataset, "THREAD_SAFE_FLAG"): - self._dataset = _thread_safe_nc.DatasetWrapper.from_existing( + self._dataset = bytecoding_datasets.DatasetWrapper.from_existing( self._dataset ) @@ -414,7 +416,7 @@ def __init__(self, filename, netcdf_format, compute=True): # Given a filepath string/path : create a dataset from that try: self.filepath = os.path.abspath(filename) - self._dataset = _thread_safe_nc.DatasetWrapper( + self._dataset = bytecoding_datasets.EncodedDataset( self.filepath, mode="w", format=netcdf_format ) except RuntimeError: @@ -1818,7 +1820,15 @@ def _create_generic_cf_array_var( # Typically CF label variables, but also possibly ancil-vars ? string_dimension_depth = data.dtype.itemsize if data.dtype.kind == "U": - string_dimension_depth //= 4 + encoding = element.attributes.get("_Encoding", "ascii") + # TODO: this can fail -- use a sensible warning + default? + encoding = codecs.lookup(encoding).name + if encoding == "utf-32": + # UTF-32 is a special case -- always 4 exactly bytes per char, plus 4 + string_dimension_depth += 4 + else: + # generally, 4 bytes per char in numpy --> make bytewidth = string-width + string_dimension_depth //= 4 string_dimension_name = "string%d" % string_dimension_depth # Determine whether to create the string length dimension. @@ -1837,25 +1847,25 @@ def _create_generic_cf_array_var( # Create the label coordinate variable. cf_var = self._dataset.createVariable(cf_name, "|S1", element_dims) - # Convert data from an array of strings into a character array - # with an extra string-length dimension. 
- if len(element_dims) == 1: - # Scalar variable (only has string dimension). - data_first = data[0] - if is_lazy_data(data_first): - data_first = dask.compute(data_first) - data = list("%- *s" % (string_dimension_depth, data_first)) - else: - # NOTE: at present, can't do this lazily?? - orig_shape = data.shape - new_shape = orig_shape + (string_dimension_depth,) - new_data = np.zeros(new_shape, cf_var.dtype) - for index in np.ndindex(orig_shape): - index_slice = tuple(list(index) + [slice(None, None)]) - new_data[index_slice] = list( - "%- *s" % (string_dimension_depth, data[index]) - ) - data = new_data + # # Convert data from an array of strings into a character array + # # with an extra string-length dimension. + # if len(element_dims) == 1: + # # Scalar variable (only has string dimension). + # data_first = data[0] + # if is_lazy_data(data_first): + # data_first = dask.compute(data_first) + # data = list("%- *s" % (string_dimension_depth, data_first)) + # else: + # # NOTE: at present, can't do this lazily?? + # orig_shape = data.shape + # new_shape = orig_shape + (string_dimension_depth,) + # new_data = np.zeros(new_shape, cf_var.dtype) + # for index in np.ndindex(orig_shape): + # index_slice = tuple(list(index) + [slice(None, None)]) + # new_data[index_slice] = list( + # "%- *s" % (string_dimension_depth, data[index]) + # ) + # data = new_data else: # A normal (numeric) variable. # ensure a valid datatype for the file format. @@ -1899,6 +1909,10 @@ def _create_generic_cf_array_var( element, cf_var, cf_name, compression_kwargs=compression_kwargs ) + # Add names + units + # NOTE: *must* now do first, as we may need '_Encoding' set to write it ! + self._set_cf_var_attributes(cf_var, element) + # Add the data to the CF-netCDF variable. 
if not is_dataless: if packing_controls: @@ -1907,9 +1921,6 @@ def _create_generic_cf_array_var( _setncattr(cf_var, key, value) self._lazy_stream_data(data=data, cf_var=cf_var) - # Add names + units - self._set_cf_var_attributes(cf_var, element) - return cf_name def _create_cf_cell_methods(self, cube, dimension_names): @@ -2529,7 +2540,7 @@ def store( ) -> None: # Create a data-writeable object that we can stream into, which # encapsulates the file to be opened + variable to be written. - write_wrapper = _thread_safe_nc.NetCDFWriteProxy( + write_wrapper = bytecoding_datasets.EncodedNetCDFWriteProxy( self.filepath, cf_var, self.file_write_lock ) # Add to the list of delayed writes, used in delayed_completion(). diff --git a/lib/iris/tests/integration/netcdf/test_chararrays.py b/lib/iris/tests/integration/netcdf/test_chararrays.py index 3a4a3e1879..f3bba81c70 100644 --- a/lib/iris/tests/integration/netcdf/test_chararrays.py +++ b/lib/iris/tests/integration/netcdf/test_chararrays.py @@ -12,7 +12,9 @@ import iris from iris.coords import AuxCoord, DimCoord from iris.cube import Cube -from iris.fileformats.netcdf import _thread_safe_nc +from iris.fileformats.netcdf import _bytecoding_datasets + +# from iris.fileformats.netcdf import _thread_safe_nc from iris.tests import env_bin_path NX, N_STRLEN = 3, 64 @@ -22,7 +24,8 @@ # VARS_COORDS_SHARE_STRING_DIM = True VARS_COORDS_SHARE_STRING_DIM = False if VARS_COORDS_SHARE_STRING_DIM: - TEST_COORD_VALS[-1] = "Xsandwich" # makes the max coord strlen same as data one + # Fix length so that the max coord strlen will be same as data one + TEST_COORD_VALS[-1] = "Xsandwich" # Ensure all tests run with "split attrs" turned on. 
@@ -68,8 +71,12 @@ def convert_bytesarray_to_strings( # INCLUDE_NUMERIC_AUXCOORD = False +# DATASET_CLASS = _thread_safe_nc.DatasetWrapper +DATASET_CLASS = _bytecoding_datasets.EncodedDataset + + def make_testfile(filepath, chararray, coordarray, encoding_str=None): - ds = _thread_safe_nc.DatasetWrapper(filepath, "w") + ds = DATASET_CLASS(filepath, "w") try: ds.createDimension("x", NX) ds.createDimension("nstr", N_STRLEN) From c4a31a48c45da667c003aad02fc3caeead58474f Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Mon, 19 Jan 2026 16:18:29 +0000 Subject: [PATCH 25/43] Further attempt to satisfy warning cateogry checker. --- lib/iris/fileformats/netcdf/_bytecoding_datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py index a8dfca2b21..52e2fe2aa5 100644 --- a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py +++ b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py @@ -251,7 +251,7 @@ def _get_encoding(self, writing=False) -> str | None: f"_Encoding = {result!r}." ) warntype = IrisCfSaveWarning if writing else IrisCfLoadWarning - warnings.warn(msg, warntype) + warnings.warn(msg, category=warntype) # Proceed as if there is no specified encoding result = None return result From 10831d77b1c8743caa2eb8fb0baaa3294e6c4842 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Mon, 19 Jan 2026 16:41:46 +0000 Subject: [PATCH 26/43] Fix overlength error tests. 
--- .../netcdf/_bytecoding_datasets.py | 4 ++-- .../netcdf/test_bytecoding_datasets.py | 20 +++++++++++++++---- 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py index 52e2fe2aa5..a3a13f86f5 100644 --- a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py +++ b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py @@ -146,8 +146,8 @@ def encode_stringarray_as_bytearray( from iris.exceptions import TranslationError msg = ( - f"Non-ascii string {string!r} written to netcdf exceeds string " - f"dimension : {n_bytes} > {string_dimension_length}." + f"String {string!r} written to netcdf exceeds string dimension after " + f"encoding : {n_bytes} > {string_dimension_length}." ) raise TranslationError(msg) diff --git a/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py b/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py index 861ec2c516..4909d976de 100644 --- a/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py +++ b/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py @@ -9,6 +9,7 @@ import numpy as np import pytest +from iris.exceptions import TranslationError from iris.fileformats.netcdf._bytecoding_datasets import ( DECODE_TO_STRINGS_ON_READ, EncodedDataset, @@ -224,9 +225,9 @@ def test_overlength(self, tempdir): strlen = 5 ds = make_encoded_dataset(path, strlen=strlen, encoding="ascii") v = ds.variables["vxs"] - v[:] = ["1", "123456789", "two"] - expected_bytes = make_bytearray(["1", "12345", "two"], strlen) - check_raw_content(path, "vxs", expected_bytes) + msg = r"String .* written to netcdf exceeds string dimension .* : [0-9]* > 5\." 
+ with pytest.raises(TranslationError, match=msg): + v[:] = ["1", "123456789", "two"] def test_overlength_splitcoding(self, tempdir): # Check expected behaviour when non-ascii multibyte coding gets truncated @@ -234,7 +235,18 @@ def test_overlength_splitcoding(self, tempdir): strlen = 5 ds = make_encoded_dataset(path, strlen=strlen, encoding="utf-8") v = ds.variables["vxs"] - v[:] = ["1", "1234ü", "two"] + # Note: we must do the assignment as a single byte array, to avoid hitting the + # safety check for this exact problem : see previous check. + byte_arrays = [ + string.encode("utf-8")[:strlen] for string in ("1", "1234ü", "two") + ] + nd_bytes_array = np.array( + [ + [bytes[i : i + 1] if i < len(bytes) else b"\0" for i in range(strlen)] + for bytes in byte_arrays + ] + ) + v[:] = nd_bytes_array # This creates a problem: it won't read back msg = ( "Character data in variable 'vxs' could not be decoded " From 042028e481c4b5a01073c9aca8d296ad51c56895 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Mon, 19 Jan 2026 17:10:04 +0000 Subject: [PATCH 27/43] Get temporary iris load/save exercises working (todo: proper tests). --- lib/iris/fileformats/netcdf/saver.py | 6 ++++++ .../integration/netcdf/test_chararrays.py | 20 +++++++++++++++---- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/lib/iris/fileformats/netcdf/saver.py b/lib/iris/fileformats/netcdf/saver.py index f832ad1e8d..d43df538c2 100644 --- a/lib/iris/fileformats/netcdf/saver.py +++ b/lib/iris/fileformats/netcdf/saver.py @@ -1713,6 +1713,12 @@ def add_names_attrs(): if element.units.calendar: _setncattr(cf_var, "calendar", str(element.units.calendar)) + # Most attributes are dealt with later. 
+ # But _Encoding need to be defined before we can write to a character variable + if element.dtype.kind in "SU" and "_Encoding" in element.attributes: + encoding = element.attributes.pop("_Encoding") + _setncattr(cf_var, "_Encoding", encoding) + if not isinstance(element, Cube): # Add any other custom coordinate attributes. # N.B. not Cube, which has specific handling in _create_cf_data_variable diff --git a/lib/iris/tests/integration/netcdf/test_chararrays.py b/lib/iris/tests/integration/netcdf/test_chararrays.py index f3bba81c70..496867ee8a 100644 --- a/lib/iris/tests/integration/netcdf/test_chararrays.py +++ b/lib/iris/tests/integration/netcdf/test_chararrays.py @@ -218,6 +218,7 @@ def test_load_encodings(encoding, save_dir): # small change print(f"\n=========\nTesting encoding: {encoding}") filepath = save_dir / f"tmp_load_{str(encoding)}.nc" + # Actual content is always either utf-8 or utf-32 do_as = encoding if encoding != "utf-32": do_as = "utf-8" @@ -228,7 +229,14 @@ def test_load_encodings(encoding, save_dir): TEST_COORD_VALS, N_STRLEN, encoding=do_as ) make_testfile(filepath, TEST_CHARARRAY, TEST_COORDARRAY, encoding_str=encoding) - show_result(filepath) + if encoding == "ascii": + # If explicitly labelled as ascii, 'utf-8' data will fail to load back ... + msg = r"Character data .* could not be decoded with the 'ascii' encoding\." + with pytest.raises(ValueError, match=msg): + show_result(filepath) + else: + # ... 
otherwise, utf-8 data loads even without a label, as 'utf-8' default used + show_result(filepath) @pytest.mark.parametrize("encoding", test_encodings) @@ -243,10 +251,14 @@ def test_save_encodings(encoding, save_dir): ) print(cube) filepath = save_dir / f"tmp_save_{str(encoding)}.nc" - if encoding == "ascii": + if encoding in ("ascii", None): + msg = ( + "String data written to netcdf character variable 'v' " + "could not be represented in encoding 'ascii'" + ) with pytest.raises( - UnicodeEncodeError, - match="'ascii' codec can't encode character.*not in range", + ValueError, + match=msg, ): iris.save(cube, filepath) else: From 94b2b217281f085f906c441d59e335fe7b768875 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Wed, 21 Jan 2026 16:19:27 +0000 Subject: [PATCH 28/43] Put encoding information into separate converter class, for use in proxies. --- .../netcdf/_bytecoding_datasets.py | 290 +++++++++--------- .../fileformats/netcdf/_thread_safe_nc.py | 27 +- 2 files changed, 161 insertions(+), 156 deletions(-) diff --git a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py index a3a13f86f5..4559f4b78b 100644 --- a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py +++ b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py @@ -42,6 +42,7 @@ import codecs import contextlib +import dataclasses import threading import warnings @@ -80,55 +81,6 @@ def decode_bytesarray_to_stringarray( return result -# -# TODO: remove? -# this older version is "overly flexible", less efficient and not needed here. -# -def flexi_encode_stringarray_as_bytearray( - data: np.ndarray, encoding=None, string_dimension_length: int | None = None -) -> np.ndarray: - """Encode strings as bytearray. - - Note: if 'string_dimension_length' is not given (None), it is set to the longest - encoded bytes element, **OR** the dtype size, if that is greater. 
- If 'string_dimension_length' is specified, the last array - dimension is set to this and content strings are truncated or extended as required. - """ - if np.ma.isMaskedArray(data): - # netCDF4-python sees zeros as "missing" -- we don't need or want that - data = data.data - element_shape = data.shape - # Encode all the strings + see which is longest - max_length = 1 # this is a MINIMUM - i.e. not zero! - data_elements = np.zeros(element_shape, dtype=object) - for index in np.ndindex(element_shape): - data_element = data[index].encode(encoding=encoding) - element_length = len(data_element) - data_elements[index] = data_element - if element_length > max_length: - max_length = element_length - - if string_dimension_length is None: - # If the string length was not specified, it is the maximum encoded length - # (n-bytes), **or** the dtype string-length, if greater. - string_dimension_length = max_length - array_string_length = int(str(data.dtype)[2:]) # Yuck. No better public way? - if array_string_length > string_dimension_length: - string_dimension_length = array_string_length - - # We maybe *already* encoded all the strings above, but stored them in an - # object-array as we didn't yet know the fixed byte-length to convert to. 
- # Now convert to a fixed-width byte array with an extra string-length dimension - result = np.zeros(element_shape + (string_dimension_length,), dtype="S1") - right_pad = b"\0" * string_dimension_length - for index in np.ndindex(element_shape): - bytes = data_elements[index] - bytes = (bytes + right_pad)[:string_dimension_length] - result[index] = [bytes[i : i + 1] for i in range(string_dimension_length)] - - return result - - def encode_stringarray_as_bytearray( data: np.typing.ArrayLike, encoding: str, string_dimension_length: int ) -> np.ndarray: @@ -158,6 +110,114 @@ def encode_stringarray_as_bytearray( return result +@dataclasses.dataclass +class VariableEncoder: + """A record of encoding details which can apply them to variable data.""" + + varname: str # just for the error messages + dtype: np.dtype + is_chardata: bool # just a shortcut for the dtype test + read_encoding: str # *always* a valid encoding from the codecs package + write_encoding: str # *always* a valid encoding from the codecs package + n_chars_dim: int # length of associated character dimension + string_width: int # string lengths when viewing as strings (i.e. "Uxx") + + def __init__(self, cf_var): + """Get all the info from an netCDF4 variable (or similar wrapper object). + + Most importantly, we do *not* store 'cf_var' : instead we extract the + necessary information and store it in this object. + So, this object has static state + is serialisable. 
+ """ + self.varname = cf_var.name + self.dtype = cf_var.dtype + self.is_chardata = np.issubdtype(self.dtype, np.bytes_) + self.read_encoding = self._get_encoding(cf_var, writing=False) + self.write_encoding = self._get_encoding(cf_var, writing=True) + self.n_chars_dim = cf_var.group().dimensions[cf_var.dimensions[-1]].size + self.string_width = self._get_string_width(cf_var) + + @staticmethod + def _get_encoding(cf_var, writing=False) -> str: + """Get the byte encoding defined for this variable (or None).""" + result = getattr(cf_var, "_Encoding", None) + if result is not None: + try: + # Accept + normalise naming of encodings + result = codecs.lookup(result).name + # NOTE: if encoding does not suit data, errors can occur. + # For example, _Encoding = "ascii", with non-ascii content. + except LookupError: + # Unrecognised encoding name : handle this as just a warning + msg = ( + f"Ignoring unknown encoding for variable {cf_var.name!r}: " + f"_Encoding = {result!r}." + ) + warntype = IrisCfSaveWarning if writing else IrisCfLoadWarning + warnings.warn(msg, category=warntype) + # Proceed as if there is no specified encoding + result = None + + if result is None: + if writing: + result = DEFAULT_WRITE_ENCODING + else: + result = DEFAULT_READ_ENCODING + return result + + def _get_string_width(self, cf_var) -> int: + """Return the string-length defined for this variable.""" + # Work out the actual byte width from the parent dataset dimensions. + strlen = self.n_chars_dim + # Convert the string dimension length (i.e. bytes) to a sufficiently-long + # string width, depending on the (read) encoding used. 
+ encoding = self.read_encoding + if "utf-16" in encoding: + # Each char needs at least 2 bytes -- including a terminator char + strlen = (strlen // 2) - 1 + elif "utf-32" in encoding: + # Each char needs exactly 4 bytes -- including a terminator char + strlen = (strlen // 4) - 1 + # "ELSE": assume there can be (at most) as many chars as bytes + return strlen + + def decode_bytes_to_stringarray(self, data: np.ndarray) -> np.ndarray: + if self.is_chardata and DECODE_TO_STRINGS_ON_READ: + # N.B. read encoding default is UTF-8 --> a "usually safe" choice + encoding = self.read_encoding + strlen = self.string_width + try: + data = decode_bytesarray_to_stringarray(data, encoding, strlen) + except UnicodeDecodeError as err: + msg = ( + f"Character data in variable {self.varname!r} could not be decoded " + f"with the {encoding!r} encoding. This can be fixed by setting the " + "variable '_Encoding' attribute to suit the content." + ) + raise ValueError(msg) from err + + return data + + def encode_strings_as_bytearray(self, data: np.ndarray) -> np.ndarray: + if data.dtype.kind == "U": + # N.B. it is also possible to pass a byte array (dtype "S1"), + # to be written directly, without processing. + try: + # N.B. write encoding *default* is "ascii" --> fails bad content + encoding = self.write_encoding + strlen = self.n_chars_dim + data = encode_stringarray_as_bytearray(data, encoding, strlen) + except UnicodeEncodeError as err: + msg = ( + f"String data written to netcdf character variable {self.varname!r} " + f"could not be represented in encoding {self.write_encoding!r}. " + "This can be fixed by setting a suitable variable '_Encoding' " + 'attribute, e.g. ._Encoding="UTF-8".' 
+ ) + raise ValueError(msg) from err + return data + + class NetcdfStringDecodeSetting(threading.local): def __init__(self, perform_encoding: bool = True): self.set(perform_encoding) @@ -184,109 +244,24 @@ def context(self, perform_encoding: bool): class EncodedVariable(VariableWrapper): """A variable wrapper that translates variable data according to byte encodings.""" - def __getitem__(self, keys): - if self._is_chardata(): - # N.B. we never need to UNset this, as we totally control it - self._contained_instance.set_auto_chartostring(False) + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + def __getitem__(self, keys): + self._contained_instance.set_auto_chartostring(False) data = super().__getitem__(keys) - - if DECODE_TO_STRINGS_ON_READ and self._is_chardata(): - encoding = self._get_encoding() or DEFAULT_READ_ENCODING - # N.B. typically, read encoding default is UTF-8 --> a "usually safe" choice - strlen = self._get_string_width() - try: - data = decode_bytesarray_to_stringarray(data, encoding, strlen) - except UnicodeDecodeError as err: - msg = ( - f"Character data in variable {self.name!r} could not be decoded " - f"with the {encoding!r} encoding. This can be fixed by setting the " - "variable '_Encoding' attribute to suit the content." - ) - raise ValueError(msg) from err - + # Create a coding spec : redo every time in case "_Encoding" has changed + encoding_spec = VariableEncoder(self._contained_instance) + data = encoding_spec.decode_bytes_to_stringarray(data) return data def __setitem__(self, keys, data): data = np.asanyarray(data) - if self._is_chardata(): - # N.B. we never need to UNset this, as we totally control it - self._contained_instance.set_auto_chartostring(False) - - # N.B. 
typically, write encoding default is "ascii" --> fails bad content - if data.dtype.kind == "U": - try: - encoding = ( - self._get_encoding(writing=True) or DEFAULT_WRITE_ENCODING - ) - strlen = self._get_byte_width() - data = encode_stringarray_as_bytearray(data, encoding, strlen) - except UnicodeEncodeError as err: - msg = ( - f"String data written to netcdf character variable {self.name!r} " - f"could not be represented in encoding {encoding!r}. This can be " - "fixed by setting a suitable variable '_Encoding' attribute, " - 'e.g. ._Encoding="UTF-8".' - ) - raise ValueError(msg) from err - + # Create a coding spec : redo every time in case "_Encoding" has changed + encoding_spec = VariableEncoder(self._contained_instance) + data = encoding_spec.encode_strings_as_bytearray(data) super().__setitem__(keys, data) - def _is_chardata(self): - return np.issubdtype(self.dtype, np.bytes_) - - def _get_encoding(self, writing=False) -> str | None: - """Get the byte encoding defined for this variable (or None).""" - result = getattr(self, "_Encoding", None) - if result is not None: - try: - # Accept + normalise naming of encodings - result = codecs.lookup(result).name - # NOTE: if encoding does not suit data, errors can occur. - # For example, _Encoding = "ascii", with non-ascii content. - except LookupError: - # Unrecognised encoding name : handle this as just a warning - msg = ( - f"Ignoring unknown encoding for variable {self.name!r}: " - f"_Encoding = {result!r}." 
- ) - warntype = IrisCfSaveWarning if writing else IrisCfLoadWarning - warnings.warn(msg, category=warntype) - # Proceed as if there is no specified encoding - result = None - return result - - def _get_byte_width(self) -> int | None: - if not hasattr(self, "_bytewidth"): - n_bytes = self.group().dimensions[self.dimensions[-1]].size - # Cache this length control on the variable -- but not as a netcdf attribute - self.__dict__["_bytewidth"] = n_bytes - - return self.__dict__["_bytewidth"] - - def _get_string_width(self): - """Return the string-length defined for this variable.""" - if not hasattr(self, "_strlen"): - # Work out the actual byte width from the parent dataset dimensions. - strlen = self._get_byte_width() - # Convert the string dimension length (i.e. bytes) to a sufficiently-long - # string width, depending on the encoding used. - encoding = self._get_encoding() or DEFAULT_READ_ENCODING - # regularise the name for comparison with recognised ones - encoding = codecs.lookup(encoding).name - if "utf-16" in encoding: - # Each char needs at least 2 bytes -- including a terminator char - strlen = (strlen // 2) - 1 - elif "utf-32" in encoding: - # Each char needs exactly 4 bytes -- including a terminator char - strlen = (strlen // 4) - 1 - # "ELSE": assume there can be (at most) as many chars as bytes - - # Cache this length control on the variable -- but not as a netcdf attribute - self.__dict__["_strlen"] = strlen - - return self._strlen - def set_auto_chartostring(self, onoff: bool): msg = "auto_chartostring is not supported by Iris 'EncodedVariable' type." raise TypeError(msg) @@ -297,14 +272,37 @@ class EncodedDataset(DatasetWrapper): VAR_WRAPPER_CLS = EncodedVariable + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + def set_auto_chartostring(self, onoff: bool): msg = "auto_chartostring is not supported by Iris 'EncodedDataset' type." 
         raise TypeError(msg)
 
 
 class EncodedNetCDFDataProxy(NetCDFDataProxy):
-    DATASET_CLASS = EncodedDataset
+    __slots__ = NetCDFDataProxy.__slots__ + ("encoding_details",)
+
+    def __init__(self, cf_var, *args, **kwargs):
+        # When creating, also capture + record the encoding to be performed.
+        kwargs["use_byte_data"] = True
+        super().__init__(cf_var, *args, **kwargs)
+        self.encoding_details = VariableEncoder(cf_var)
+
+    def __getitem__(self, keys):
+        data = super().__getitem__(keys)
+        # Apply the optional bytes-to-strings conversion
+        data = self.encoding_details.decode_bytes_to_stringarray(data)
+        return data
 
 
 class EncodedNetCDFWriteProxy(NetCDFWriteProxy):
-    DATASET_CLASS = EncodedDataset
+    def __init__(self, filepath, cf_var, file_write_lock):
+        super().__init__(filepath, cf_var, file_write_lock)
+        self.encoding_details = VariableEncoder(cf_var)
+
+    def __setitem__(self, key, data):
+        data = np.asanyarray(data)
+        # Apply the optional strings-to-bytes conversion
+        data = self.encoding_details.encode_strings_as_bytearray(data)
+        super().__setitem__(key, data)
diff --git a/lib/iris/fileformats/netcdf/_thread_safe_nc.py b/lib/iris/fileformats/netcdf/_thread_safe_nc.py
index cd97452dac..96cee458f7 100644
--- a/lib/iris/fileformats/netcdf/_thread_safe_nc.py
+++ b/lib/iris/fileformats/netcdf/_thread_safe_nc.py
@@ -314,15 +314,22 @@ def fromcdl(cls, *args, **kwargs):
 class NetCDFDataProxy:
     """A reference to the data payload of a single NetCDF file variable."""
 
-    __slots__ = ("shape", "dtype", "path", "variable_name", "fill_value")
-    DATASET_CLASS = netCDF4.Dataset
-
-    def __init__(self, shape, dtype, path, variable_name, fill_value):
-        self.shape = shape
+    __slots__ = (
+        "shape",
+        "dtype",
+        "path",
+        "variable_name",
+        "fill_value",
+        "use_byte_data",
+    )
+
+    def __init__(self, cf_var, dtype, path, fill_value, *, use_byte_data=False):
+        self.shape = cf_var.shape
+        self.variable_name = cf_var.name
         self.dtype = dtype
         self.path = path
-        self.variable_name = variable_name
self.fill_value = fill_value + self.use_byte_data = use_byte_data @property def ndim(self): @@ -338,9 +345,11 @@ def __getitem__(self, keys): # netCDF4 library, presumably because __getitem__ gets called so many # times by Dask. Use _GLOBAL_NETCDF4_LOCK directly instead. with _GLOBAL_NETCDF4_LOCK: - dataset = self.DATASET_CLASS(self.path) + dataset = netCDF4.Dataset(self.path) try: variable = dataset.variables[self.variable_name] + if self.use_byte_data: + variable.set_auto_mask(False) # Get the NetCDF variable data and slice. var = variable[keys] finally: @@ -375,8 +384,6 @@ class NetCDFWriteProxy: TODO: could be improved with a caching scheme, but this just about works. """ - DATASET_CLASS = netCDF4.Dataset - def __init__(self, filepath, cf_var, file_write_lock): self.path = filepath self.varname = cf_var.name @@ -404,7 +411,7 @@ def __setitem__(self, keys, array_data): # investigation needed. for attempt in range(5): try: - dataset = self.DATASET_CLASS(self.path, "r+") + dataset = netCDF4.Dataset(self.path, "r+") break except OSError: if attempt < 4: From c4b793604d7128459a2a31e9f233252e547b973e Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Wed, 21 Jan 2026 18:41:40 +0000 Subject: [PATCH 29/43] First proper testing (reads working). --- lib/iris/fileformats/cf.py | 67 +++--- .../integration/netcdf/test_stringdata.py | 193 ++++++++++++++++++ 2 files changed, 227 insertions(+), 33 deletions(-) create mode 100644 lib/iris/tests/integration/netcdf/test_stringdata.py diff --git a/lib/iris/fileformats/cf.py b/lib/iris/fileformats/cf.py index 6e4b8f99e1..ced409f0c1 100644 --- a/lib/iris/fileformats/cf.py +++ b/lib/iris/fileformats/cf.py @@ -811,39 +811,40 @@ def cf_label_data(self, cf_data_var): % self.cf_name ) - label_data = self[:] - - if ma.isMaskedArray(label_data): - label_data = label_data.filled(b"\0") - - # Determine whether we have a string-valued scalar label - # i.e. a character variable that only has one dimension (the length of the string). 
- if self.ndim == 1: - label_string = b"".join(label_data).strip() - label_string = label_string.decode("utf8") - data = np.array([label_string]) - else: - # Determine the index of the string dimension. - str_dim = self.dimensions.index(str_dim_name) - - # Calculate new label data shape (without string dimension) and create payload array. - new_shape = tuple( - dim_len for i, dim_len in enumerate(self.shape) if i != str_dim - ) - string_basetype = "|U%d" - string_dtype = string_basetype % self.shape[str_dim] - data = np.empty(new_shape, dtype=string_dtype) - - for index in np.ndindex(new_shape): - # Create the slice for the label data. - if str_dim == 0: - label_index = (slice(None, None),) + index - else: - label_index = index + (slice(None, None),) - - label_string = b"".join(label_data[label_index]).strip() - label_string = label_string.decode("utf8") - data[index] = label_string + data = self[:] + # label_data = self[:] + # + # if ma.isMaskedArray(label_data): + # label_data = label_data.filled(b"\0") + # + # # Determine whether we have a string-valued scalar label + # # i.e. a character variable that only has one dimension (the length of the string). + # if self.ndim == 1: + # label_string = b"".join(label_data).strip() + # label_string = label_string.decode("utf8") + # data = np.array([label_string]) + # else: + # # Determine the index of the string dimension. + # str_dim = self.dimensions.index(str_dim_name) + # + # # Calculate new label data shape (without string dimension) and create payload array. + # new_shape = tuple( + # dim_len for i, dim_len in enumerate(self.shape) if i != str_dim + # ) + # string_basetype = "|U%d" + # string_dtype = string_basetype % self.shape[str_dim] + # data = np.empty(new_shape, dtype=string_dtype) + # + # for index in np.ndindex(new_shape): + # # Create the slice for the label data. 
+ # if str_dim == 0: + # label_index = (slice(None, None),) + index + # else: + # label_index = index + (slice(None, None),) + # + # label_string = b"".join(label_data[label_index]).strip() + # label_string = label_string.decode("utf8") + # data[index] = label_string return data diff --git a/lib/iris/tests/integration/netcdf/test_stringdata.py b/lib/iris/tests/integration/netcdf/test_stringdata.py new file mode 100644 index 0000000000..44c94ac2cc --- /dev/null +++ b/lib/iris/tests/integration/netcdf/test_stringdata.py @@ -0,0 +1,193 @@ +# Copyright Iris contributors +# +# This file is part of Iris and is released under the BSD license. +# See LICENSE in the root of the repository for full licensing details. +"""Integration tests for various uses of character/string arrays in netcdf file variables. + +This covers both the loading and saving of variables which are the content of +data-variables, auxiliary coordinates, ancillary variables and -possibly?- cell measures. +""" + +from pathlib import Path + +import numpy as np +import pytest + +import iris +from iris.fileformats.netcdf import _thread_safe_nc + +N_XDIM = 3 +N_CHARS_DIM = 64 +COORD_ON_SEPARATE_DIM = True +PERSIST_TESTFILES = "~/chararray_testfiles" + + +NO_ENCODING_STR = "" +TEST_ENCODINGS = [ + NO_ENCODING_STR, + "ascii", + "utf-8", + # "iso8859-1", # a common one-byte-per-char "codepage" type + # "utf-16", + "utf-32", +] + + +# +# Routines to convert between byte and string arrays. +# Independently defined here, to avoid relying on any code we are testing. +# +def convert_strings_to_chararray( + string_array_1d: np.ndarray, maxlen: int, encoding: str | None = None +): + # Note: this is limited to 1-D arrays of strings. + # Could generalise that if needed, but for now this makes it simpler. 
+ if encoding is None: + encoding = "ascii" + bbytes = [text.encode(encoding) for text in string_array_1d] + pad = b"\0" * maxlen + bbytes = [(x + pad)[:maxlen] for x in bbytes] + chararray = np.array([[bb[i : i + 1] for i in range(maxlen)] for bb in bbytes]) + return chararray + + +def convert_bytearray_to_strings( + byte_array, encoding="utf-8", string_length: int | None = None +): + """Convert bytes to strings. + + N.B. for now at least, we assume the string dim is **always the last one**. + """ + bytes_shape = byte_array.shape + var_shape = bytes_shape[:-1] + if string_length is None: + string_length = bytes_shape[-1] + string_dtype = f"U{string_length}" + result = np.empty(var_shape, dtype=string_dtype) + for ndindex in np.ndindex(var_shape): + element_bytes = byte_array[ndindex] + bytes = b"".join([b if b else b"\0" for b in element_bytes]) + string = bytes.decode(encoding) + result[ndindex] = string + return result + + +def make_testfile(testfile_path: Path, encoding_str: str): + """Create a test netcdf file. + + Also returns content strings (unicode or ascii versions). 
+ """ + if encoding_str == NO_ENCODING_STR: + encoding = None + else: + encoding = encoding_str + + data_is_ascii = encoding in (None, "ascii") + + if data_is_ascii: + coordvar_strings = ["mOnster", "London", "Amsterdam"] + datavar_strings = ["bun", "Eclair", "sandwich"] + else: + coordvar_strings = ["Münster", "London", "Amsterdam"] + datavar_strings = ["bun", "éclair", "sandwich"] + + coordvar_bytearray = convert_strings_to_chararray( + string_array_1d=coordvar_strings, maxlen=N_CHARS_DIM, encoding=encoding + ) + datavar_bytearray = convert_strings_to_chararray( + string_array_1d=datavar_strings, maxlen=N_CHARS_DIM, encoding=encoding + ) + + ds = _thread_safe_nc.DatasetWrapper(testfile_path, "w") + try: + ds.createDimension("x", N_XDIM) + ds.createDimension("nstr", N_CHARS_DIM) + if COORD_ON_SEPARATE_DIM: + ds.createDimension("nstr2", N_CHARS_DIM) + v_xdim = ds.createVariable("x", int, dimensions=("x")) + v_xdim[:] = np.arange(N_XDIM) + + v_co = ds.createVariable( + "v_co", + "S1", + dimensions=( + "x", + "nstr2" if COORD_ON_SEPARATE_DIM else "nstr", + ), + ) + v_co[:] = coordvar_bytearray + + if encoding is not None: + v_co._Encoding = encoding + + v_numeric = ds.createVariable( + "v_numeric", + float, + dimensions=("x",), + ) + v_numeric[:] = np.arange(N_XDIM) + + v_datavar = ds.createVariable( + "v", + "S1", + dimensions=( + "x", + "nstr", + ), + ) + v_datavar[:] = datavar_bytearray + + if encoding is not None: + v_datavar._Encoding = encoding + + v_datavar.coordinates = "v_co v_numeric" + finally: + ds.close() + + return testfile_path, coordvar_strings, datavar_strings + + +@pytest.fixture(params=TEST_ENCODINGS) +def encoding(request): + return request.param + + +class TestReadEncodings: + """Test loading of testfiles with encoded string data.""" + + @pytest.fixture() + def testdata(self, encoding, tmp_path): + """Create a suitable valid testfile, and return expected string content.""" + if PERSIST_TESTFILES: + tmp_path = Path(PERSIST_TESTFILES).expanduser() 
+ if encoding == "": + filetag = "noencoding" + else: + filetag = encoding + tempfile_path = tmp_path / f"sample_read_{filetag}.nc" + testdata = make_testfile(testfile_path=tempfile_path, encoding_str=encoding) + from iris.tests.integration.netcdf.test_chararrays import ncdump + + ncdump(tempfile_path) + yield testdata + + def assert_no_load_problems(self): + if len(iris.loading.LOAD_PROBLEMS.problems): + probs = "\n".join(str(prob) for prob in iris.loading.LOAD_PROBLEMS.problems) + assert probs == "" + + def test_valid_encodings(self, encoding, testdata): + testfile_path, coordvar_strings, datavar_strings = testdata + cube = iris.load_cube(testfile_path) + self.assert_no_load_problems() + assert cube.shape == (N_XDIM,) + + if encoding != "utf-32": + expected_string_width = N_CHARS_DIM + else: + expected_string_width = (N_CHARS_DIM // 4) - 1 + assert cube.dtype == f" Date: Fri, 23 Jan 2026 15:41:17 +0000 Subject: [PATCH 30/43] Encoded reading ~working; new ideas for switching (untested). --- .../fileformats/_nc_load_rules/helpers.py | 8 +-- lib/iris/fileformats/cf.py | 58 ++++++++++--------- .../netcdf/_bytecoding_datasets.py | 42 +++++++++++++- .../fileformats/netcdf/_thread_safe_nc.py | 2 +- lib/iris/fileformats/netcdf/loader.py | 36 ++++++++---- .../integration/netcdf/test_stringdata.py | 52 +++++++++++++---- 6 files changed, 142 insertions(+), 56 deletions(-) diff --git a/lib/iris/fileformats/_nc_load_rules/helpers.py b/lib/iris/fileformats/_nc_load_rules/helpers.py index fa63002f09..a2800dc91d 100644 --- a/lib/iris/fileformats/_nc_load_rules/helpers.py +++ b/lib/iris/fileformats/_nc_load_rules/helpers.py @@ -1644,11 +1644,11 @@ def _add_auxiliary_coordinate( # Determine the name of the dimension/s shared between the CF-netCDF data variable # and the coordinate being built. 
coord_dims = cf_coord_var.dimensions - if cf._is_str_dtype(cf_coord_var): - coord_dims = coord_dims[:-1] + # if cf._is_str_dtype(cf_coord_var): + # coord_dims = coord_dims[:-1] datavar_dims = engine.cf_var.dimensions - if cf._is_str_dtype(engine.cf_var): - datavar_dims = datavar_dims[:-1] + # if cf._is_str_dtype(engine.cf_var): + # datavar_dims = datavar_dims[:-1] common_dims = [dim for dim in coord_dims if dim in datavar_dims] data_dims = None if common_dims: diff --git a/lib/iris/fileformats/cf.py b/lib/iris/fileformats/cf.py index ced409f0c1..d32afaacb5 100644 --- a/lib/iris/fileformats/cf.py +++ b/lib/iris/fileformats/cf.py @@ -26,7 +26,7 @@ import iris.exceptions import iris.fileformats._nc_load_rules.helpers as hh -from iris.fileformats.netcdf import _bytecoding_datasets +from iris.fileformats.netcdf import _bytecoding_datasets, _thread_safe_nc from iris.mesh.components import Connectivity import iris.util import iris.warnings @@ -67,7 +67,9 @@ # NetCDF returns a different type for strings depending on Python version. def _is_str_dtype(var): - return np.issubdtype(var.dtype, np.bytes_) + # N.B. use 'datatype' not 'dtype', to "look inside" variable wrappers which + # represent 'S1' type data as 'U'. + return isinstance(var.datatype, np.dtype) and np.issubdtype(var.datatype, np.bytes_) ################################################################################ @@ -788,28 +790,28 @@ def cf_label_data(self, cf_data_var): % type(cf_data_var) ) - # Determine the name of the label string (or length) dimension by - # finding the dimension name that doesn't exist within the data dimensions. - str_dim_names = list(set(self.dimensions) - set(cf_data_var.dimensions)) - n_nondata_dims = len(str_dim_names) - - if n_nondata_dims == 0: - # *All* dims are shared with the data-variable. - # This is only ok if the data-var is *also* a string type. 
- dim_ok = _is_str_dtype(cf_data_var) - # In this case, we must just *assume* that the last dimension is "the" - # string dimension - str_dim_name = self.dimensions[-1] - else: - # If there is exactly one non-data dim, that is the one we want - dim_ok = len(str_dim_names) == 1 - (str_dim_name,) = str_dim_names - - if not dim_ok: - raise ValueError( - "Invalid string dimensions for CF-netCDF label variable %r" - % self.cf_name - ) + # # Determine the name of the label string (or length) dimension by + # # finding the dimension name that doesn't exist within the data dimensions. + # str_dim_names = list(set(self.dimensions) - set(cf_data_var.dimensions)) + # n_nondata_dims = len(str_dim_names) + # + # if n_nondata_dims == 0: + # # *All* dims are shared with the data-variable. + # # This is only ok if the data-var is *also* a string type. + # dim_ok = _is_str_dtype(cf_data_var) + # # In this case, we must just *assume* that the last dimension is "the" + # # string dimension + # str_dim_name = self.dimensions[-1] + # else: + # # If there is exactly one non-data dim, that is the one we want + # dim_ok = len(str_dim_names) == 1 + # (str_dim_name,) = str_dim_names + # + # if not dim_ok: + # raise ValueError( + # "Invalid string dimensions for CF-netCDF label variable %r" + # % self.cf_name + # ) data = self[:] # label_data = self[:] @@ -1374,9 +1376,11 @@ def __init__(self, file_source, warn=False, monotonic=False): if isinstance(file_source, str): # Create from filepath : open it + own it (=close when we die). self._filename = os.path.expanduser(file_source) - self._dataset = _bytecoding_datasets.EncodedDataset( - self._filename, mode="r" - ) + if _bytecoding_datasets.DECODE_TO_STRINGS_ON_READ: + ds_type = _bytecoding_datasets.EncodedDataset + else: + ds_type = _thread_safe_nc.DatasetWrapper + self._dataset = ds_type(self._filename, mode="r") self._own_file = True else: # We have been passed an open dataset. 
diff --git a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py index 4559f4b78b..fa64e570bb 100644 --- a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py +++ b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py @@ -182,7 +182,7 @@ def _get_string_width(self, cf_var) -> int: return strlen def decode_bytes_to_stringarray(self, data: np.ndarray) -> np.ndarray: - if self.is_chardata and DECODE_TO_STRINGS_ON_READ: + if self.is_chardata: # N.B. read encoding default is UTF-8 --> a "usually safe" choice encoding = self.read_encoding strlen = self.string_width @@ -247,6 +247,38 @@ class EncodedVariable(VariableWrapper): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) + # Override specific properties of the contained instance, making changes in the case + # that the variable contains char data, which is presented instead as strings + # with one less dimension. + + @property + def shape(self): + shape = self._contained_instance.shape + is_chardata = np.issubdtype(self._contained_instance.dtype, np.bytes_) + if is_chardata: + # Translated char data appears without the final dimension + shape = shape[:-1] # remove final dimension + return shape + + @property + def dimensions(self): + dimensions = self._contained_instance.dimensions + is_chardata = np.issubdtype(self._contained_instance.dtype, np.bytes_) + if is_chardata: + # Translated char data appears without the final dimension + dimensions = dimensions[:-1] # remove final dimension + return dimensions + + @property + def dtype(self): + dtype = self._contained_instance.dtype + is_chardata = np.issubdtype(self._contained_instance.dtype, np.bytes_) + if is_chardata: + # Create a coding spec : redo every time in case "_Encoding" has changed + encoding_spec = VariableEncoder(self._contained_instance) + dtype = np.dtype(f"U{encoding_spec.string_width}") + return dtype + def __getitem__(self, keys): 
self._contained_instance.set_auto_chartostring(False) data = super().__getitem__(keys) @@ -287,7 +319,13 @@ def __init__(self, cf_var, *args, **kwargs): # When creating, also capture + record the encoding to be performed. kwargs["use_byte_data"] = True super().__init__(cf_var, *args, **kwargs) - self.encoding_details = VariableEncoder(cf_var) + if not isinstance(cf_var, EncodedVariable): + msg = ( + f"Unexpected variable type : {type(cf_var)} of variable '{cf_var.name}'" + ": expected EncodedVariable." + ) + raise TypeError(msg) + self.encoding_details = VariableEncoder(cf_var._contained_instance) def __getitem__(self, keys): data = super().__getitem__(keys) diff --git a/lib/iris/fileformats/netcdf/_thread_safe_nc.py b/lib/iris/fileformats/netcdf/_thread_safe_nc.py index 96cee458f7..f96312cf79 100644 --- a/lib/iris/fileformats/netcdf/_thread_safe_nc.py +++ b/lib/iris/fileformats/netcdf/_thread_safe_nc.py @@ -349,7 +349,7 @@ def __getitem__(self, keys): try: variable = dataset.variables[self.variable_name] if self.use_byte_data: - variable.set_auto_mask(False) + variable.set_auto_chartostring(False) # Get the NetCDF variable data and slice. var = variable[keys] finally: diff --git a/lib/iris/fileformats/netcdf/loader.py b/lib/iris/fileformats/netcdf/loader.py index d363e29738..9607b393d9 100644 --- a/lib/iris/fileformats/netcdf/loader.py +++ b/lib/iris/fileformats/netcdf/loader.py @@ -50,6 +50,10 @@ # An expected part of the public loader API, but includes thread safety # concerns so is housed in _thread_safe_nc. +# NOTE: this is the *default*, as required for public legacy api +# - in practice, when creating our proxies we dynamically choose between this and +# :class:`_thread_safe_nc.DatasetWrapper`, depending on +# :data:`_bytecoding_datasets.DECODE_TO_STRINGS_ON_READ` NetCDFDataProxy = _bytecoding_datasets.EncodedNetCDFDataProxy @@ -279,7 +283,7 @@ def _get_cf_var_data(cf_var): # correct dtype. 
Note: this is not an issue for masked arrays, # only masked scalar values. if result is np.ma.masked: - result = np.ma.masked_all(1, dtype=cf_var.datatype) + result = np.ma.masked_all(1, dtype=cf_var.dtype) else: # Get lazy chunked data out of a cf variable. # Creates Dask wrappers around data arrays for any cube components which @@ -289,15 +293,27 @@ def _get_cf_var_data(cf_var): # Make a data-proxy that mimics array access and can fetch from the file. # Note: Special handling needed for "variable length string" types which # return a dtype of `str`, rather than a numpy type; use `S1` in this case. - fill_dtype = "S1" if cf_var.dtype is str else cf_var.dtype.str[1:] - fill_value = getattr( - cf_var.cf_data, - "_FillValue", - _thread_safe_nc.default_fillvals[fill_dtype], - ) - proxy = NetCDFDataProxy( - cf_var.shape, dtype, cf_var.filename, cf_var.cf_name, fill_value - ) + if cf_var.dtype.kind == "U": + # Special handling for "string variables". + fill_value = "" + else: + fill_dtype = "S1" if cf_var.dtype is str else cf_var.dtype.str[1:] + fill_value = getattr( + cf_var.cf_data, + "_FillValue", + _thread_safe_nc.default_fillvals[fill_dtype], + ) + + # Switch type of proxy, based on type of variable. + # It is done this way, instead of using an instance variable, because the + # limited nature of the wrappers makes a stateful choice awkward, + # e.g. especially, "variable.group()" is *not* the parent DatasetWrapper. + if isinstance(cf_var.cf_data, _bytecoding_datasets.EncodedVariable): + proxy_class = _bytecoding_datasets.EncodedNetCDFDataProxy + else: + proxy_class = _thread_safe_nc.NetCDFDataProxy + + proxy = proxy_class(cf_var.cf_data, dtype, cf_var.filename, fill_value) # Get the chunking specified for the variable : this is either a shape, or # maybe the string "contiguous". 
if CHUNK_CONTROL.mode is ChunkControl.Modes.AS_DASK: diff --git a/lib/iris/tests/integration/netcdf/test_stringdata.py b/lib/iris/tests/integration/netcdf/test_stringdata.py index 44c94ac2cc..5831f85b41 100644 --- a/lib/iris/tests/integration/netcdf/test_stringdata.py +++ b/lib/iris/tests/integration/netcdf/test_stringdata.py @@ -8,6 +8,7 @@ data-variables, auxiliary coordinates, ancillary variables and -possibly?- cell measures. """ +from dataclasses import dataclass from pathlib import Path import numpy as np @@ -16,9 +17,12 @@ import iris from iris.fileformats.netcdf import _thread_safe_nc +iris.fileformats.netcdf.loader._LAZYVAR_MIN_BYTES = 0 + N_XDIM = 3 N_CHARS_DIM = 64 -COORD_ON_SEPARATE_DIM = True +# COORD_ON_SEPARATE_DIM = True +COORD_ON_SEPARATE_DIM = False PERSIST_TESTFILES = "~/chararray_testfiles" @@ -72,6 +76,14 @@ def convert_bytearray_to_strings( return result +@dataclass +class SamplefileDetails: + filepath: Path + datavar_data: np.ndarray + stringcoord_data: np.ndarray + numericcoord_data: np.ndarray + + def make_testfile(testfile_path: Path, encoding_str: str): """Create a test netcdf file. 
@@ -84,6 +96,7 @@ def make_testfile(testfile_path: Path, encoding_str: str): data_is_ascii = encoding in (None, "ascii") + numeric_values = np.arange(3.0) if data_is_ascii: coordvar_strings = ["mOnster", "London", "Amsterdam"] datavar_strings = ["bun", "Eclair", "sandwich"] @@ -125,7 +138,7 @@ def make_testfile(testfile_path: Path, encoding_str: str): float, dimensions=("x",), ) - v_numeric[:] = np.arange(N_XDIM) + v_numeric[:] = numeric_values v_datavar = ds.createVariable( "v", @@ -144,7 +157,12 @@ def make_testfile(testfile_path: Path, encoding_str: str): finally: ds.close() - return testfile_path, coordvar_strings, datavar_strings + return SamplefileDetails( + filepath=testfile_path, + datavar_data=datavar_strings, + stringcoord_data=coordvar_strings, + numericcoord_data=numeric_values, + ) @pytest.fixture(params=TEST_ENCODINGS) @@ -152,6 +170,10 @@ def encoding(request): return request.param +def load_problems_list(): + return [str(prob) for prob in iris.loading.LOAD_PROBLEMS.problems] + + class TestReadEncodings: """Test loading of testfiles with encoded string data.""" @@ -168,18 +190,19 @@ def testdata(self, encoding, tmp_path): testdata = make_testfile(testfile_path=tempfile_path, encoding_str=encoding) from iris.tests.integration.netcdf.test_chararrays import ncdump + # TODO: temporary for debug -- TO REMOVE ncdump(tempfile_path) yield testdata - def assert_no_load_problems(self): - if len(iris.loading.LOAD_PROBLEMS.problems): - probs = "\n".join(str(prob) for prob in iris.loading.LOAD_PROBLEMS.problems) - assert probs == "" - - def test_valid_encodings(self, encoding, testdata): - testfile_path, coordvar_strings, datavar_strings = testdata + def test_valid_encodings(self, encoding, testdata: SamplefileDetails): + testfile_path, datavar_strings, coordvar_strings, numeric_data = ( + testdata.filepath, + testdata.datavar_data, + testdata.stringcoord_data, + testdata.numericcoord_data, + ) cube = iris.load_cube(testfile_path) - self.assert_no_load_problems() 
+ assert load_problems_list() == [] assert cube.shape == (N_XDIM,) if encoding != "utf-32": @@ -187,7 +210,12 @@ def test_valid_encodings(self, encoding, testdata): else: expected_string_width = (N_CHARS_DIM // 4) - 1 assert cube.dtype == f" Date: Tue, 27 Jan 2026 11:37:10 +0000 Subject: [PATCH 31/43] Check loads when coords do/not share a string dim with data. --- .../integration/netcdf/test_stringdata.py | 47 +++++++++++++++---- 1 file changed, 37 insertions(+), 10 deletions(-) diff --git a/lib/iris/tests/integration/netcdf/test_stringdata.py b/lib/iris/tests/integration/netcdf/test_stringdata.py index 5831f85b41..fc5bf5ae3e 100644 --- a/lib/iris/tests/integration/netcdf/test_stringdata.py +++ b/lib/iris/tests/integration/netcdf/test_stringdata.py @@ -8,6 +8,7 @@ data-variables, auxiliary coordinates, ancillary variables and -possibly?- cell measures. """ +from contextlib import contextmanager from dataclasses import dataclass from pathlib import Path @@ -17,12 +18,18 @@ import iris from iris.fileformats.netcdf import _thread_safe_nc -iris.fileformats.netcdf.loader._LAZYVAR_MIN_BYTES = 0 + +@pytest.fixture(scope="module") +def all_lazy_auxcoords(): + """Ensure that *all* aux-coords are loaded lazily, even really small ones.""" + old_minlazybytes = iris.fileformats.netcdf.loader._LAZYVAR_MIN_BYTES + iris.fileformats.netcdf.loader._LAZYVAR_MIN_BYTES = 0 + yield + iris.fileformats.netcdf.loader._LAZYVAR_MIN_BYTES = old_minlazybytes + N_XDIM = 3 N_CHARS_DIM = 64 -# COORD_ON_SEPARATE_DIM = True -COORD_ON_SEPARATE_DIM = False PERSIST_TESTFILES = "~/chararray_testfiles" @@ -78,16 +85,22 @@ def convert_bytearray_to_strings( @dataclass class SamplefileDetails: + """Convenience container for information about a sample file.""" + filepath: Path datavar_data: np.ndarray stringcoord_data: np.ndarray numericcoord_data: np.ndarray -def make_testfile(testfile_path: Path, encoding_str: str): +def make_testfile( + testfile_path: Path, + encoding_str: str, + coords_on_separate_dim: 
bool, +) -> SamplefileDetails: """Create a test netcdf file. - Also returns content strings (unicode or ascii versions). + Also returns content information for checking loaded results. """ if encoding_str == NO_ENCODING_STR: encoding = None @@ -115,7 +128,7 @@ def make_testfile(testfile_path: Path, encoding_str: str): try: ds.createDimension("x", N_XDIM) ds.createDimension("nstr", N_CHARS_DIM) - if COORD_ON_SEPARATE_DIM: + if coords_on_separate_dim: ds.createDimension("nstr2", N_CHARS_DIM) v_xdim = ds.createVariable("x", int, dimensions=("x")) v_xdim[:] = np.arange(N_XDIM) @@ -125,7 +138,7 @@ def make_testfile(testfile_path: Path, encoding_str: str): "S1", dimensions=( "x", - "nstr2" if COORD_ON_SEPARATE_DIM else "nstr", + "nstr2" if coords_on_separate_dim else "nstr", ), ) v_co[:] = coordvar_bytearray @@ -177,8 +190,17 @@ def load_problems_list(): class TestReadEncodings: """Test loading of testfiles with encoded string data.""" + @pytest.fixture(params=["coordsSameDim", "coordsOwnDim"]) + def use_separate_dims(self, request): + yield request.param == "coordsOwnDim" + @pytest.fixture() - def testdata(self, encoding, tmp_path): + def testdata( + self, + encoding, + tmp_path, + use_separate_dims, + ): """Create a suitable valid testfile, and return expected string content.""" if PERSIST_TESTFILES: tmp_path = Path(PERSIST_TESTFILES).expanduser() @@ -186,8 +208,13 @@ def testdata(self, encoding, tmp_path): filetag = "noencoding" else: filetag = encoding - tempfile_path = tmp_path / f"sample_read_{filetag}.nc" - testdata = make_testfile(testfile_path=tempfile_path, encoding_str=encoding) + dimtag = "diffdims" if use_separate_dims else "samedims" + tempfile_path = tmp_path / f"sample_read_{filetag}_{dimtag}.nc" + testdata = make_testfile( + testfile_path=tempfile_path, + encoding_str=encoding, + coords_on_separate_dim=use_separate_dims, + ) from iris.tests.integration.netcdf.test_chararrays import ncdump # TODO: temporary for debug -- TO REMOVE From 
9bdeb5d5012fa19a4eb5ee50a782694163db84cf Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Tue, 27 Jan 2026 15:32:36 +0000 Subject: [PATCH 32/43] Fix nondecoded reference loads in test_byecoded_datasets. --- .../netcdf/test_bytecoding_datasets.py | 30 ++++++++++++------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py b/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py index 4909d976de..f16097bef3 100644 --- a/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py +++ b/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py @@ -304,6 +304,14 @@ class TestRead: def readmode(self, request): return request.param + def undecoded_testvar(self, ds_encoded, varname: str): + path = ds_encoded.filepath() + ds_encoded.close() + ds = DatasetWrapper(path) + v = ds.variables[varname] + v.set_auto_chartostring(False) + return v + def test_encodings(self, encoding, tempdir, readmode): # Create a dataset with the variable path = tempdir / f"test_read_encodings_{encoding!s}_{readmode}.nc" @@ -337,9 +345,9 @@ def test_encodings(self, encoding, tempdir, readmode): assert np.all(truncated_result == result) result = truncated_result else: - # Test "raw" read --> byte array - with DECODE_TO_STRINGS_ON_READ.context(False): - result = v[:] + # Close and re-open as "regular" dataset -- just to check the raw content + v = self.undecoded_testvar(ds_encoded, "vxs") + result = v[:] expected = write_bytes check_array_matching(result, expected) @@ -364,8 +372,8 @@ def test_scalar(self, tempdir, readmode): expected = np.array(data_string) else: # Test "raw" read --> byte array - with DECODE_TO_STRINGS_ON_READ.context(False): - result = v[:] + v = self.undecoded_testvar(ds_encoded, "v0_scalar") + result = v[:] expected = data_bytes check_array_matching(result, expected) @@ -401,8 +409,8 @@ def test_multidim(self, tempdir, readmode): expected = np.array(test_strings) else: # 
Test "raw" read --> byte array - with DECODE_TO_STRINGS_ON_READ.context(False): - result = v[:] + v = self.undecoded_testvar(ds_encoded, "vyxn") + result = v[:] expected = test_bytes check_array_matching(result, expected) @@ -410,8 +418,8 @@ def test_multidim(self, tempdir, readmode): def test_read_encoding_failure(self, tempdir, readmode): path = tempdir / f"test_read_encoding_failure_{readmode}.nc" strlen = 10 - ds = make_encoded_dataset(path, strlen=strlen, encoding="ascii") - v = ds.variables["vxs"] + ds_encoded = make_encoded_dataset(path, strlen=strlen, encoding="ascii") + v = ds_encoded.variables["vxs"] test_utf8_bytes = make_bytearray( samples_3_nonascii, bytewidth=strlen, encoding="utf-8" ) @@ -425,8 +433,8 @@ def test_read_encoding_failure(self, tempdir, readmode): with pytest.raises(ValueError, match=msg): v[:] else: - with DECODE_TO_STRINGS_ON_READ.context(False): - result = v[:] # this ought to be ok! + v = self.undecoded_testvar(ds_encoded, "vxs") + result = v[:] # this ought to be ok! assert np.all(result == test_utf8_bytes) From 54d7743a5d1cfc1b12e07865c127bc1407b9b4db Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Tue, 27 Jan 2026 18:49:23 +0000 Subject: [PATCH 33/43] Test writing of string data: various encodings, from strings or bytes. --- .../integration/netcdf/test_stringdata.py | 165 ++++++++++++++++-- 1 file changed, 155 insertions(+), 10 deletions(-) diff --git a/lib/iris/tests/integration/netcdf/test_stringdata.py b/lib/iris/tests/integration/netcdf/test_stringdata.py index fc5bf5ae3e..ed6fa576df 100644 --- a/lib/iris/tests/integration/netcdf/test_stringdata.py +++ b/lib/iris/tests/integration/netcdf/test_stringdata.py @@ -8,14 +8,17 @@ data-variables, auxiliary coordinates, ancillary variables and -possibly?- cell measures. 
""" -from contextlib import contextmanager from dataclasses import dataclass from pathlib import Path +from typing import Iterable import numpy as np +from numpy.typing import ArrayLike import pytest import iris +from iris.coords import AuxCoord, DimCoord +from iris.cube import Cube from iris.fileformats.netcdf import _thread_safe_nc @@ -49,8 +52,8 @@ def all_lazy_auxcoords(): # Independently defined here, to avoid relying on any code we are testing. # def convert_strings_to_chararray( - string_array_1d: np.ndarray, maxlen: int, encoding: str | None = None -): + string_array_1d: ArrayLike, maxlen: int, encoding: str | None = None +) -> np.ndarray: # Note: this is limited to 1-D arrays of strings. # Could generalise that if needed, but for now this makes it simpler. if encoding is None: @@ -63,12 +66,13 @@ def convert_strings_to_chararray( def convert_bytearray_to_strings( - byte_array, encoding="utf-8", string_length: int | None = None -): + byte_array: ArrayLike, encoding: str = "utf-8", string_length: int | None = None +) -> np.ndarray: """Convert bytes to strings. N.B. for now at least, we assume the string dim is **always the last one**. 
""" + byte_array = np.asanyarray(byte_array) bytes_shape = byte_array.shape var_shape = bytes_shape[:-1] if string_length is None: @@ -88,9 +92,9 @@ class SamplefileDetails: """Convenience container for information about a sample file.""" filepath: Path - datavar_data: np.ndarray - stringcoord_data: np.ndarray - numericcoord_data: np.ndarray + datavar_data: ArrayLike + stringcoord_data: ArrayLike + numericcoord_data: ArrayLike def make_testfile( @@ -200,7 +204,7 @@ def testdata( encoding, tmp_path, use_separate_dims, - ): + ) -> Iterable[SamplefileDetails]: """Create a suitable valid testfile, and return expected string content.""" if PERSIST_TESTFILES: tmp_path = Path(PERSIST_TESTFILES).expanduser() @@ -218,7 +222,7 @@ def testdata( from iris.tests.integration.netcdf.test_chararrays import ncdump # TODO: temporary for debug -- TO REMOVE - ncdump(tempfile_path) + ncdump(str(tempfile_path)) yield testdata def test_valid_encodings(self, encoding, testdata: SamplefileDetails): @@ -246,3 +250,144 @@ def test_valid_encodings(self, encoding, testdata: SamplefileDetails): coord_var_2 = cube.coord("v_numeric") assert coord_var_2.dtype == np.float64 assert np.all(coord_var_2.points == numeric_data) + + +@pytest.fixture(params=["stringdata", "bytedata"]) +def as_bytes(request): + yield request.param == "bytedata" + + +@dataclass +class SampleCubeDetails: + cube: Cube + datavar_data: np.ndarray + stringcoord_data: np.ndarray + save_path: str | Path | None = None + + +def make_testcube( + encoding_str: str | None = None, + byte_data: bool = False, +) -> SampleCubeDetails: + data_is_ascii = encoding_str in (NO_ENCODING_STR, "ascii") + + numeric_values = np.arange(3.0) + if data_is_ascii: + coordvar_strings = ["mOnster", "London", "Amsterdam"] + datavar_strings = ["bun", "Eclair", "sandwich"] + else: + coordvar_strings = ["Münster", "London", "Amsterdam"] + datavar_strings = ["bun", "éclair", "sandwich"] + + if not byte_data: + charlen = N_CHARS_DIM + if encoding_str == 
"utf-32": + charlen = charlen // 4 - 1 + strings_dtype = np.dtype(f"U{charlen}") + coordvar_array = np.array(coordvar_strings, dtype=strings_dtype) + datavar_array = np.array(datavar_strings, dtype=strings_dtype) + else: + write_encoding = encoding_str + if write_encoding == NO_ENCODING_STR: + write_encoding = "ascii" + coordvar_array = convert_strings_to_chararray( + coordvar_strings, maxlen=N_CHARS_DIM, encoding=write_encoding + ) + datavar_array = convert_strings_to_chararray( + datavar_strings, maxlen=N_CHARS_DIM, encoding=write_encoding + ) + + cube = Cube(datavar_array, var_name="v") + cube.add_dim_coord(DimCoord(np.arange(N_XDIM), var_name="x"), 0) + if encoding_str != NO_ENCODING_STR: + cube.attributes["_Encoding"] = encoding_str + co_x = AuxCoord(coordvar_array, var_name="v_co") + if encoding_str != NO_ENCODING_STR: + co_x.attributes["_Encoding"] = encoding_str + co_dims = (0, 1) if byte_data else (0,) + cube.add_aux_coord(co_x, co_dims) + + result = SampleCubeDetails( + cube=cube, + datavar_data=datavar_array, + stringcoord_data=coordvar_array, + ) + return result + + +class TestWriteEncodings: + """Test saving of testfiles with encoded string data. + + To avoid circularity, we generate and save *cube* data. + """ + + @pytest.fixture(params=["dataAsStrings", "dataAsBytes"]) + def write_bytes(self, request): + yield request.param == "dataAsBytes" + + @pytest.fixture() + def testpath(self, encoding, write_bytes, tmp_path): + """Create a suitable test cube, with either string or byte content.""" + if PERSIST_TESTFILES: + tmp_path = Path(PERSIST_TESTFILES).expanduser() + if encoding == "": + filetag = "noencoding" + else: + filetag = encoding + datatag = "writebytes" if write_bytes else "writestrings" + tempfile_path = tmp_path / f"sample_write_{filetag}_{datatag}.nc" + yield tempfile_path + + @pytest.fixture() + def testdata(self, testpath, encoding, write_bytes): + """Create a suitable test cube + save to a file. 
+ + Apply the given encoding to both coord and cube data. + Form the data as bytes, or as strings, depending on 'write_bytes'.' + """ + cube_info = make_testcube(encoding_str=encoding, byte_data=write_bytes) + cube_info.save_path = testpath + cube = cube_info.cube + iris.save(cube, testpath) + yield cube_info + + def test_valid_encodings(self, encoding, testdata, write_bytes): + cube_info = testdata + cube, path = cube_info.cube, cube_info.save_path + # TODO: not testing the "byte read/write" yet + # Make a quick check for cube equality : but the presentation depends on the read mode + # with DECODE_TO_STRINGS_ON_READ.context(not write_bytes): + # read_cube = iris.load_cube(path) + # assert read_cube == cube + + # N.B. file content should not depend on whether bytes or strings were written + vararray, coordarray = cube_info.datavar_data, cube_info.stringcoord_data + ds = _thread_safe_nc.DatasetWrapper(path) + ds.set_auto_chartostring(False) + v_main = ds.variables["v"] + v_co = ds.variables["v_co"] + assert v_main.shape == (N_XDIM, N_CHARS_DIM) + assert v_co.shape == (N_XDIM, N_CHARS_DIM) + assert v_main.dtype == " Date: Wed, 28 Jan 2026 14:39:59 +0000 Subject: [PATCH 34/43] Fix write proxy; tmp_path in stringdata tests; tidy stringdata tests. 
--- .../netcdf/_bytecoding_datasets.py | 4 +- .../integration/netcdf/test_stringdata.py | 54 ++++++++++++------- 2 files changed, 36 insertions(+), 22 deletions(-) diff --git a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py index fa64e570bb..59ad639634 100644 --- a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py +++ b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py @@ -336,11 +336,11 @@ def __getitem__(self, keys): class EncodedNetCDFWriteProxy(NetCDFWriteProxy): def __init__(self, filepath, cf_var, file_write_lock): - super.__init__(filepath, cf_var, file_write_lock) + super().__init__(filepath, cf_var, file_write_lock) self.encoding_details = VariableEncoder(cf_var) def __setitem__(self, key, data): data = np.asanyarray(data) # Apply the optional strings-to-bytes conversion data = self.encoding_details.encode_strings_as_bytearray(data) - super.__setitem__(key, data) + super().__setitem__(key, data) diff --git a/lib/iris/tests/integration/netcdf/test_stringdata.py b/lib/iris/tests/integration/netcdf/test_stringdata.py index ed6fa576df..bc308d474c 100644 --- a/lib/iris/tests/integration/netcdf/test_stringdata.py +++ b/lib/iris/tests/integration/netcdf/test_stringdata.py @@ -33,8 +33,9 @@ def all_lazy_auxcoords(): N_XDIM = 3 N_CHARS_DIM = 64 -PERSIST_TESTFILES = "~/chararray_testfiles" - +# TODO: remove (debug) +# PERSIST_TESTFILES: str | None = "~/chararray_testfiles" +PERSIST_TESTFILES: str | None = None NO_ENCODING_STR = "" TEST_ENCODINGS = [ @@ -199,38 +200,51 @@ def use_separate_dims(self, request): yield request.param == "coordsOwnDim" @pytest.fixture() - def testdata( + def readtest_path( self, encoding, tmp_path, use_separate_dims, ) -> Iterable[SamplefileDetails]: """Create a suitable valid testfile, and return expected string content.""" - if PERSIST_TESTFILES: - tmp_path = Path(PERSIST_TESTFILES).expanduser() + match PERSIST_TESTFILES: + case str(): + tmp_path = 
Path(PERSIST_TESTFILES).expanduser() + case _: + pass if encoding == "": filetag = "noencoding" else: filetag = encoding dimtag = "diffdims" if use_separate_dims else "samedims" tempfile_path = tmp_path / f"sample_read_{filetag}_{dimtag}.nc" + yield tempfile_path + + @pytest.fixture() + def readtest_data( + self, + encoding, + readtest_path, + use_separate_dims, + ) -> Iterable[SamplefileDetails]: + """Create a suitable valid testfile, and return expected string content.""" testdata = make_testfile( - testfile_path=tempfile_path, + testfile_path=readtest_path, encoding_str=encoding, coords_on_separate_dim=use_separate_dims, ) - from iris.tests.integration.netcdf.test_chararrays import ncdump - # TODO: temporary for debug -- TO REMOVE - ncdump(str(tempfile_path)) + # # TODO: temporary for debug -- TO REMOVE + # from iris.tests.integration.netcdf.test_chararrays import ncdump + # ncdump(str(tempfile_path)) yield testdata - def test_valid_encodings(self, encoding, testdata: SamplefileDetails): + def test_valid_encodings(self, encoding, readtest_data: SamplefileDetails): testfile_path, datavar_strings, coordvar_strings, numeric_data = ( - testdata.filepath, - testdata.datavar_data, - testdata.stringcoord_data, - testdata.numericcoord_data, + readtest_data.filepath, + readtest_data.datavar_data, + readtest_data.stringcoord_data, + readtest_data.numericcoord_data, ) cube = iris.load_cube(testfile_path) assert load_problems_list() == [] @@ -326,7 +340,7 @@ def write_bytes(self, request): yield request.param == "dataAsBytes" @pytest.fixture() - def testpath(self, encoding, write_bytes, tmp_path): + def writetest_path(self, encoding, write_bytes, tmp_path): """Create a suitable test cube, with either string or byte content.""" if PERSIST_TESTFILES: tmp_path = Path(PERSIST_TESTFILES).expanduser() @@ -339,20 +353,20 @@ def testpath(self, encoding, write_bytes, tmp_path): yield tempfile_path @pytest.fixture() - def testdata(self, testpath, encoding, write_bytes): + def 
writetest_data(self, writetest_path, encoding, write_bytes): """Create a suitable test cube + save to a file. Apply the given encoding to both coord and cube data. Form the data as bytes, or as strings, depending on 'write_bytes'.' """ cube_info = make_testcube(encoding_str=encoding, byte_data=write_bytes) - cube_info.save_path = testpath + cube_info.save_path = writetest_path cube = cube_info.cube - iris.save(cube, testpath) + iris.save(cube, writetest_path) yield cube_info - def test_valid_encodings(self, encoding, testdata, write_bytes): - cube_info = testdata + def test_valid_encodings(self, encoding, writetest_data, write_bytes): + cube_info = writetest_data cube, path = cube_info.cube, cube_info.save_path # TODO: not testing the "byte read/write" yet # Make a quick check for cube equality : but the presentation depends on the read mode From cf9594b2110a1fdae4f3462e18399b899ef191b4 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Wed, 28 Jan 2026 14:58:42 +0000 Subject: [PATCH 35/43] Fix for non-string data. 
--- lib/iris/fileformats/netcdf/_bytecoding_datasets.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py index 59ad639634..22a9011eec 100644 --- a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py +++ b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py @@ -132,10 +132,11 @@ def __init__(self, cf_var): self.varname = cf_var.name self.dtype = cf_var.dtype self.is_chardata = np.issubdtype(self.dtype, np.bytes_) - self.read_encoding = self._get_encoding(cf_var, writing=False) - self.write_encoding = self._get_encoding(cf_var, writing=True) - self.n_chars_dim = cf_var.group().dimensions[cf_var.dimensions[-1]].size - self.string_width = self._get_string_width(cf_var) + if self.is_chardata: + self.read_encoding = self._get_encoding(cf_var, writing=False) + self.write_encoding = self._get_encoding(cf_var, writing=True) + self.n_chars_dim = cf_var.group().dimensions[cf_var.dimensions[-1]].size + self.string_width = self._get_string_width(cf_var) @staticmethod def _get_encoding(cf_var, writing=False) -> str: @@ -199,7 +200,7 @@ def decode_bytes_to_stringarray(self, data: np.ndarray) -> np.ndarray: return data def encode_strings_as_bytearray(self, data: np.ndarray) -> np.ndarray: - if data.dtype.kind == "U": + if self.is_chardata and data.dtype.kind == "U": # N.B. it is also possible to pass a byte array (dtype "S1"), # to be written directly, without processing. try: From ef11375e99ccfff6bf780cdef732a98e2a3ffc2a Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Wed, 28 Jan 2026 15:15:29 +0000 Subject: [PATCH 36/43] Pre-clear load problems. 
--- lib/iris/tests/integration/netcdf/test_stringdata.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/lib/iris/tests/integration/netcdf/test_stringdata.py b/lib/iris/tests/integration/netcdf/test_stringdata.py index bc308d474c..5050152042 100644 --- a/lib/iris/tests/integration/netcdf/test_stringdata.py +++ b/lib/iris/tests/integration/netcdf/test_stringdata.py @@ -195,6 +195,11 @@ def load_problems_list(): class TestReadEncodings: """Test loading of testfiles with encoded string data.""" + @pytest.fixture(autouse=True) + def _clear_load_problems(self): + iris.loading.LOAD_PROBLEMS.reset() + yield + @pytest.fixture(params=["coordsSameDim", "coordsOwnDim"]) def use_separate_dims(self, request): yield request.param == "coordsOwnDim" From 2dbdcba0688036e3462962b2bbaf2c50bfa5b2dc Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Fri, 27 Feb 2026 16:46:23 +0000 Subject: [PATCH 37/43] Fix mock patches. --- .../fileformats/netcdf/saver/test_Saver.py | 55 ++++++++++++------- .../saver/test_Saver__lazy_stream_data.py | 2 +- .../netcdf/saver/test_Saver__ugrid.py | 8 +-- 3 files changed, 39 insertions(+), 26 deletions(-) diff --git a/lib/iris/tests/unit/fileformats/netcdf/saver/test_Saver.py b/lib/iris/tests/unit/fileformats/netcdf/saver/test_Saver.py index 0eb12d794c..374cb4815e 100644 --- a/lib/iris/tests/unit/fileformats/netcdf/saver/test_Saver.py +++ b/lib/iris/tests/unit/fileformats/netcdf/saver/test_Saver.py @@ -35,7 +35,8 @@ ) from iris.coords import AncillaryVariable, AuxCoord, DimCoord from iris.cube import Cube -from iris.fileformats.netcdf import Saver, _thread_safe_nc +from iris.fileformats.netcdf import Saver +from iris.fileformats.netcdf import _bytecoding_datasets as ds_wrappers from iris.tests._shared_utils import assert_CDL import iris.tests.stock as stock @@ -219,7 +220,7 @@ def test_big_endian(self): def test_zlib(self): cube = self._simple_cube(">f4") - api = self.patch("iris.fileformats.netcdf.saver._thread_safe_nc") + api = 
self.patch("iris.fileformats.netcdf.saver.bytecoding_datasets") # Define mocked default fill values to prevent deprecation warning (#4374). api.default_fillvals = collections.defaultdict(lambda: -99.0) # Mock the apparent dtype of mocked variables, to avoid an error. @@ -230,7 +231,7 @@ def test_zlib(self): # a fill-value report on a non-compliant variable in a non-file (!) with Saver("/dummy/path", "NETCDF4", compute=False) as saver: saver.write(cube, zlib=True) - dataset = api.DatasetWrapper.return_value + dataset = api.EncodedDataset.return_value create_var_call = mock.call( "air_pressure_anomaly", np.dtype("float32"), @@ -270,8 +271,12 @@ def test_compression(self): with self.temp_filename(suffix=".nc") as nc_path: with Saver(nc_path, "NETCDF4", compute=False) as saver: + tgt = ( + "iris.fileformats.netcdf.saver.bytecoding_datasets" + ".EncodedDataset.createVariable" + ) createvar_spy = self.patch( - "iris.fileformats.netcdf.saver._thread_safe_nc.DatasetWrapper.createVariable", + tgt, # Use 'wraps' to allow the patched methods to function as normal # - the patch object just acts as a 'spy' on its calls. wraps=saver._dataset.createVariable, @@ -306,8 +311,12 @@ def test_non_compression__shape(self): with self.temp_filename(suffix=".nc") as nc_path: with Saver(nc_path, "NETCDF4", compute=False) as saver: + tgt = ( + "iris.fileformats.netcdf.saver.bytecoding_datasets" + ".EncodedDataset.createVariable" + ) createvar_spy = self.patch( - "iris.fileformats.netcdf.saver._thread_safe_nc.DatasetWrapper.createVariable", + tgt, # Use 'wraps' to allow the patched methods to function as normal # - the patch object just acts as a 'spy' on its calls. 
wraps=saver._dataset.createVariable, @@ -342,8 +351,12 @@ def test_non_compression__dtype(self): with self.temp_filename(suffix=".nc") as nc_path: with Saver(nc_path, "NETCDF4", compute=False) as saver: + tgt = ( + "iris.fileformats.netcdf.saver.bytecoding_datasets" + ".EncodedDataset.createVariable" + ) createvar_spy = self.patch( - "iris.fileformats.netcdf.saver._thread_safe_nc.DatasetWrapper.createVariable", + tgt, # Use 'wraps' to allow the patched methods to function as normal # - the patch object just acts as a 'spy' on its calls. wraps=saver._dataset.createVariable, @@ -382,7 +395,7 @@ def test_default_unlimited_dimensions(self): with self.temp_filename(".nc") as nc_path: with Saver(nc_path, "NETCDF4") as saver: saver.write(cube) - ds = _thread_safe_nc.DatasetWrapper(nc_path) + ds = ds_wrappers.EncodedDataset(nc_path) self.assertFalse(ds.dimensions["dim0"].isunlimited()) self.assertFalse(ds.dimensions["dim1"].isunlimited()) ds.close() @@ -392,7 +405,7 @@ def test_no_unlimited_dimensions(self): with self.temp_filename(".nc") as nc_path: with Saver(nc_path, "NETCDF4") as saver: saver.write(cube, unlimited_dimensions=None) - ds = _thread_safe_nc.DatasetWrapper(nc_path) + ds = ds_wrappers.EncodedDataset(nc_path) for dim in ds.dimensions.values(): self.assertFalse(dim.isunlimited()) ds.close() @@ -414,7 +427,7 @@ def test_custom_unlimited_dimensions(self): with self.temp_filename(".nc") as nc_path: with Saver(nc_path, "NETCDF4") as saver: saver.write(cube, unlimited_dimensions=unlimited_dimensions) - ds = _thread_safe_nc.DatasetWrapper(nc_path) + ds = ds_wrappers.EncodedDataset(nc_path) for dim in unlimited_dimensions: self.assertTrue(ds.dimensions[dim].isunlimited()) ds.close() @@ -423,7 +436,7 @@ def test_custom_unlimited_dimensions(self): coords = [cube.coord(dim) for dim in unlimited_dimensions] with Saver(nc_path, "NETCDF4") as saver: saver.write(cube, unlimited_dimensions=coords) - ds = _thread_safe_nc.DatasetWrapper(nc_path) + ds = 
ds_wrappers.EncodedDataset(nc_path) for dim in unlimited_dimensions: self.assertTrue(ds.dimensions[dim].isunlimited()) ds.close() @@ -434,7 +447,7 @@ def test_reserved_attributes(self): with self.temp_filename(".nc") as nc_path: with Saver(nc_path, "NETCDF4") as saver: saver.write(cube) - ds = _thread_safe_nc.DatasetWrapper(nc_path) + ds = ds_wrappers.EncodedDataset(nc_path) res = ds.getncattr("dimensions") ds.close() self.assertEqual(res, "something something_else") @@ -456,7 +469,7 @@ def test_dimensional_to_scalar(self): with self.temp_filename(".nc") as nc_path: with Saver(nc_path, "NETCDF4") as saver: saver.write(cube) - ds = _thread_safe_nc.DatasetWrapper(nc_path) + ds = ds_wrappers.EncodedDataset(nc_path) # Confirm that the only dimension is the one denoting the number # of bounds - have successfully saved the 2D bounds array into 1D. self.assertEqual(["bnds"], list(ds.dimensions.keys())) @@ -496,7 +509,7 @@ def _check_bounds_setting(self, climatological=False): saver._ensure_valid_dtype.return_value = mock.Mock( shape=coord.bounds.shape, dtype=coord.bounds.dtype ) - var = mock.MagicMock(spec=_thread_safe_nc.VariableWrapper) + var = mock.MagicMock(spec=ds_wrappers.EncodedVariable) # Make the main call. 
Saver._create_cf_bounds(saver, coord, var, "time") @@ -537,7 +550,7 @@ def test_valid_range_saved(self): with self.temp_filename(".nc") as nc_path: with Saver(nc_path, "NETCDF4") as saver: saver.write(cube, unlimited_dimensions=[]) - ds = _thread_safe_nc.DatasetWrapper(nc_path) + ds = ds_wrappers.EncodedDataset(nc_path) self.assertArrayEqual(ds.valid_range, vrange) ds.close() @@ -549,7 +562,7 @@ def test_valid_min_saved(self): with self.temp_filename(".nc") as nc_path: with Saver(nc_path, "NETCDF4") as saver: saver.write(cube, unlimited_dimensions=[]) - ds = _thread_safe_nc.DatasetWrapper(nc_path) + ds = ds_wrappers.EncodedDataset(nc_path) self.assertArrayEqual(ds.valid_min, 1) ds.close() @@ -561,7 +574,7 @@ def test_valid_max_saved(self): with self.temp_filename(".nc") as nc_path: with Saver(nc_path, "NETCDF4") as saver: saver.write(cube, unlimited_dimensions=[]) - ds = _thread_safe_nc.DatasetWrapper(nc_path) + ds = ds_wrappers.EncodedDataset(nc_path) self.assertArrayEqual(ds.valid_max, 2) ds.close() @@ -581,7 +594,7 @@ def test_valid_range_saved(self): with self.temp_filename(".nc") as nc_path: with Saver(nc_path, "NETCDF4") as saver: saver.write(cube, unlimited_dimensions=[]) - ds = _thread_safe_nc.DatasetWrapper(nc_path) + ds = ds_wrappers.EncodedDataset(nc_path) self.assertArrayEqual(ds.variables["longitude"].valid_range, vrange) ds.close() @@ -593,7 +606,7 @@ def test_valid_min_saved(self): with self.temp_filename(".nc") as nc_path: with Saver(nc_path, "NETCDF4") as saver: saver.write(cube, unlimited_dimensions=[]) - ds = _thread_safe_nc.DatasetWrapper(nc_path) + ds = ds_wrappers.EncodedDataset(nc_path) self.assertArrayEqual(ds.variables["longitude"].valid_min, 1) ds.close() @@ -605,7 +618,7 @@ def test_valid_max_saved(self): with self.temp_filename(".nc") as nc_path: with Saver(nc_path, "NETCDF4") as saver: saver.write(cube, unlimited_dimensions=[]) - ds = _thread_safe_nc.DatasetWrapper(nc_path) + ds = ds_wrappers.EncodedDataset(nc_path) 
self.assertArrayEqual(ds.variables["longitude"].valid_max, 2) ds.close() @@ -637,7 +650,7 @@ def _netCDF_var(self, cube, **kwargs): with self.temp_filename(".nc") as nc_path: with Saver(nc_path, "NETCDF4") as saver: saver.write(cube, **kwargs) - ds = _thread_safe_nc.DatasetWrapper(nc_path) + ds = ds_wrappers.EncodedDataset(nc_path) (var,) = [ var for var in ds.variables.values() @@ -714,7 +727,7 @@ def setUp(self): ) ) patch = mock.patch( - "iris.fileformats.netcdf._thread_safe_nc.DatasetWrapper", + "iris.fileformats.netcdf._bytecoding_datasets.EncodedDataset", dataset_class, ) _ = patch.start() diff --git a/lib/iris/tests/unit/fileformats/netcdf/saver/test_Saver__lazy_stream_data.py b/lib/iris/tests/unit/fileformats/netcdf/saver/test_Saver__lazy_stream_data.py index 7c884e4c22..3b76dca13b 100644 --- a/lib/iris/tests/unit/fileformats/netcdf/saver/test_Saver__lazy_stream_data.py +++ b/lib/iris/tests/unit/fileformats/netcdf/saver/test_Saver__lazy_stream_data.py @@ -30,7 +30,7 @@ def saver_patch(): mock_dataset = mock.MagicMock() mock_dataset_class = mock.Mock(return_value=mock_dataset) # Mock the wrapper within the netcdf saver - target1 = "iris.fileformats.netcdf.saver._thread_safe_nc.DatasetWrapper" + target1 = "iris.fileformats.netcdf.saver.bytecoding_datasets.DatasetWrapper" # Mock the real netCDF4.Dataset within the threadsafe-nc module, as this is # used by NetCDFDataProxy and NetCDFWriteProxy. target2 = "iris.fileformats.netcdf._thread_safe_nc.netCDF4.Dataset" diff --git a/lib/iris/tests/unit/fileformats/netcdf/saver/test_Saver__ugrid.py b/lib/iris/tests/unit/fileformats/netcdf/saver/test_Saver__ugrid.py index 9494eabebf..571237512d 100644 --- a/lib/iris/tests/unit/fileformats/netcdf/saver/test_Saver__ugrid.py +++ b/lib/iris/tests/unit/fileformats/netcdf/saver/test_Saver__ugrid.py @@ -401,12 +401,12 @@ def test_compression(self): # into the iris.fileformats.netcdf.saver. 
Also we want to check that the # compression kwargs are passed into the NetCDF4 createVariable method patch = self.patch( - "iris.fileformats.netcdf.saver._thread_safe_nc.DatasetWrapper.createVariable", + "iris.fileformats.netcdf.saver.bytecoding_datasets.EncodedDataset.createVariable", ) # No need to patch this NetCDF4 variable to compensate for the previous patch # on createVariable, which doesn't actually create the variable. self.patch( - "iris.fileformats.netcdf.saver._thread_safe_nc.DatasetWrapper.variables" + "iris.fileformats.netcdf.saver.bytecoding_datasets.EncodedDataset.variables" ) cube = make_cube(var_name=(var_name := "a")) compression_kwargs = { @@ -785,10 +785,10 @@ def test_compression(self): """ patch = self.patch( - "iris.fileformats.netcdf.saver._thread_safe_nc.DatasetWrapper.createVariable", + "iris.fileformats.netcdf.saver.bytecoding_datasets.EncodedDataset.createVariable", ) self.patch( - "iris.fileformats.netcdf.saver._thread_safe_nc.DatasetWrapper.variables" + "iris.fileformats.netcdf.saver.bytecoding_datasets.EncodedDataset.variables" ) mesh = make_mesh() compression_kwargs = { From a34ea09d635eda36ba0dd63f170cc966d85b887f Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Fri, 27 Feb 2026 16:57:49 +0000 Subject: [PATCH 38/43] Fix patches in test_CFReader. 
--- lib/iris/tests/unit/fileformats/cf/test_CFReader.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/iris/tests/unit/fileformats/cf/test_CFReader.py b/lib/iris/tests/unit/fileformats/cf/test_CFReader.py index 7f37eb9f24..522d157fb1 100644 --- a/lib/iris/tests/unit/fileformats/cf/test_CFReader.py +++ b/lib/iris/tests/unit/fileformats/cf/test_CFReader.py @@ -78,7 +78,7 @@ def _setup(self, mocker): getncattr=getncattr, ) mocker.patch( - "iris.fileformats.netcdf._thread_safe_nc.DatasetWrapper", + "iris.fileformats.netcdf._bytecoding_datasets.EncodedDataset", return_value=dataset, ) @@ -141,7 +141,7 @@ def _setup(self, mocker): mocker.patch("iris.fileformats.cf.CFReader._build_cf_groups") mocker.patch("iris.fileformats.cf.CFReader._reset") mocker.patch( - "iris.fileformats.netcdf._thread_safe_nc.DatasetWrapper", + "iris.fileformats.netcdf._bytecoding_datasets.EncodedDataset", return_value=self.dataset, ) @@ -237,7 +237,7 @@ def _setup(self, mocker): # and building first level cf-groups for variables. mocker.patch("iris.fileformats.cf.CFReader._reset") mocker.patch( - "iris.fileformats.netcdf._thread_safe_nc.DatasetWrapper", + "iris.fileformats.netcdf._bytecoding_datasets.EncodedDataset", return_value=self.dataset, ) @@ -373,7 +373,7 @@ def _setup_class(self, mocker): # translations and building first level cf-groups for variables. mocker.patch("iris.fileformats.cf.CFReader._reset") mocker.patch( - "iris.fileformats.netcdf._thread_safe_nc.DatasetWrapper", + "iris.fileformats.netcdf._bytecoding_datasets.EncodedDataset", return_value=self.dataset, ) cf_reader = CFReader("dummy") From aa1fe03ebb060a699fb68a3460b20bbeda5e42ce Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Fri, 27 Feb 2026 17:41:22 +0000 Subject: [PATCH 39/43] Fix variable creation in odd cases. 
--- lib/iris/fileformats/netcdf/saver.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/lib/iris/fileformats/netcdf/saver.py b/lib/iris/fileformats/netcdf/saver.py index d43df538c2..3d9f9a91a2 100644 --- a/lib/iris/fileformats/netcdf/saver.py +++ b/lib/iris/fileformats/netcdf/saver.py @@ -1713,11 +1713,14 @@ def add_names_attrs(): if element.units.calendar: _setncattr(cf_var, "calendar", str(element.units.calendar)) - # Most attributes are dealt with later. - # But _Encoding need to be defined before we can write to a character variable - if element.dtype.kind in "SU" and "_Encoding" in element.attributes: - encoding = element.attributes.pop("_Encoding") - _setncattr(cf_var, "_Encoding", encoding) + # Note: when writing UGRID, "element" can be a Mesh which has no "dtype", + # and for dataless cubes it will have a 'None' dtype. + if getattr(element, "dtype", None) is not None: + # Most attributes are dealt with later. But _Encoding needs to be defined + # *before* we can write to a character variable. + if element.dtype.kind in "SU" and "_Encoding" in element.attributes: + encoding = element.attributes.pop("_Encoding") + _setncattr(cf_var, "_Encoding", encoding) if not isinstance(element, Cube): # Add any other custom coordinate attributes. From f5d50ee4a21c186e0e89c059f3c19b18da98a514 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Fri, 27 Feb 2026 18:05:43 +0000 Subject: [PATCH 40/43] Ignore attribute reordering in scaling-packed saves. 
--- .../multi_packed_multi_dtype.cdl | 68 ------------------ .../multi_packed_single_dtype.cdl | 70 ------------------- .../TestPackedData/single_packed_manual.cdl | 50 ------------- .../TestPackedData/single_packed_signed.cdl | 50 ------------- .../TestPackedData/single_packed_unsigned.cdl | 50 ------------- 5 files changed, 288 deletions(-) delete mode 100644 lib/iris/tests/results/integration/netcdf/general/TestPackedData/multi_packed_multi_dtype.cdl delete mode 100644 lib/iris/tests/results/integration/netcdf/general/TestPackedData/multi_packed_single_dtype.cdl delete mode 100644 lib/iris/tests/results/integration/netcdf/general/TestPackedData/single_packed_manual.cdl delete mode 100644 lib/iris/tests/results/integration/netcdf/general/TestPackedData/single_packed_signed.cdl delete mode 100644 lib/iris/tests/results/integration/netcdf/general/TestPackedData/single_packed_unsigned.cdl diff --git a/lib/iris/tests/results/integration/netcdf/general/TestPackedData/multi_packed_multi_dtype.cdl b/lib/iris/tests/results/integration/netcdf/general/TestPackedData/multi_packed_multi_dtype.cdl deleted file mode 100644 index 8a8f481492..0000000000 --- a/lib/iris/tests/results/integration/netcdf/general/TestPackedData/multi_packed_multi_dtype.cdl +++ /dev/null @@ -1,68 +0,0 @@ -dimensions: - bnds = 2 ; - latitude = 73 ; - longitude = 96 ; - time = 360 ; -variables: - short air_temperature(time, latitude, longitude) ; - air_temperature:scale_factor = 0.00242575f ; - air_temperature:add_offset = 261.648f ; - air_temperature:standard_name = "air_temperature" ; - air_temperature:units = "K" ; - air_temperature:um_stash_source = "m01s03i236" ; - air_temperature:cell_methods = "time: maximum (interval: 1 hour)" ; - air_temperature:grid_mapping = "latitude_longitude" ; - air_temperature:coordinates = "forecast_period forecast_reference_time height" ; - int latitude_longitude ; - latitude_longitude:grid_mapping_name = "latitude_longitude" ; - 
latitude_longitude:longitude_of_prime_meridian = 0. ; - latitude_longitude:earth_radius = 6371229. ; - double time(time) ; - time:axis = "T" ; - time:bounds = "time_bnds" ; - time:units = "hours since 1970-01-01 00:00:00" ; - time:standard_name = "time" ; - time:calendar = "360_day" ; - double time_bnds(time, bnds) ; - float latitude(latitude) ; - latitude:axis = "Y" ; - latitude:units = "degrees_north" ; - latitude:standard_name = "latitude" ; - float longitude(longitude) ; - longitude:axis = "X" ; - longitude:units = "degrees_east" ; - longitude:standard_name = "longitude" ; - double forecast_period(time) ; - forecast_period:bounds = "forecast_period_bnds" ; - forecast_period:units = "hours" ; - forecast_period:standard_name = "forecast_period" ; - double forecast_period_bnds(time, bnds) ; - double forecast_reference_time ; - forecast_reference_time:units = "hours since 1970-01-01 00:00:00" ; - forecast_reference_time:standard_name = "forecast_reference_time" ; - forecast_reference_time:calendar = "360_day" ; - double height ; - height:units = "m" ; - height:standard_name = "height" ; - height:positive = "up" ; - float precipitation_flux(time, latitude, longitude) ; - precipitation_flux:standard_name = "precipitation_flux" ; - precipitation_flux:units = "kg m-2 s-1" ; - precipitation_flux:um_stash_source = "m01s05i216" ; - precipitation_flux:cell_methods = "time: mean (interval: 1 hour)" ; - precipitation_flux:grid_mapping = "latitude_longitude" ; - precipitation_flux:coordinates = "forecast_period forecast_reference_time" ; - ushort air_temperature_0(time, latitude, longitude) ; - air_temperature_0:scale_factor = 0.002014167f ; - air_temperature_0:add_offset = 176.7872f ; - air_temperature_0:standard_name = "air_temperature" ; - air_temperature_0:units = "K" ; - air_temperature_0:um_stash_source = "m01s03i236" ; - air_temperature_0:cell_methods = "time: minimum (interval: 1 hour)" ; - air_temperature_0:grid_mapping = "latitude_longitude" ; - 
air_temperature_0:coordinates = "forecast_period forecast_reference_time height" ; - -// global attributes: - :source = "Data from Met Office Unified Model" ; - :Conventions = "CF-1.7" ; -} diff --git a/lib/iris/tests/results/integration/netcdf/general/TestPackedData/multi_packed_single_dtype.cdl b/lib/iris/tests/results/integration/netcdf/general/TestPackedData/multi_packed_single_dtype.cdl deleted file mode 100644 index 3f2c909ce8..0000000000 --- a/lib/iris/tests/results/integration/netcdf/general/TestPackedData/multi_packed_single_dtype.cdl +++ /dev/null @@ -1,70 +0,0 @@ -dimensions: - bnds = 2 ; - latitude = 73 ; - longitude = 96 ; - time = 360 ; -variables: - short air_temperature(time, latitude, longitude) ; - air_temperature:scale_factor = 0.00242575f ; - air_temperature:add_offset = 261.648f ; - air_temperature:standard_name = "air_temperature" ; - air_temperature:units = "K" ; - air_temperature:um_stash_source = "m01s03i236" ; - air_temperature:cell_methods = "time: maximum (interval: 1 hour)" ; - air_temperature:grid_mapping = "latitude_longitude" ; - air_temperature:coordinates = "forecast_period forecast_reference_time height" ; - int latitude_longitude ; - latitude_longitude:grid_mapping_name = "latitude_longitude" ; - latitude_longitude:longitude_of_prime_meridian = 0. ; - latitude_longitude:earth_radius = 6371229. 
; - double time(time) ; - time:axis = "T" ; - time:bounds = "time_bnds" ; - time:units = "hours since 1970-01-01 00:00:00" ; - time:standard_name = "time" ; - time:calendar = "360_day" ; - double time_bnds(time, bnds) ; - float latitude(latitude) ; - latitude:axis = "Y" ; - latitude:units = "degrees_north" ; - latitude:standard_name = "latitude" ; - float longitude(longitude) ; - longitude:axis = "X" ; - longitude:units = "degrees_east" ; - longitude:standard_name = "longitude" ; - double forecast_period(time) ; - forecast_period:bounds = "forecast_period_bnds" ; - forecast_period:units = "hours" ; - forecast_period:standard_name = "forecast_period" ; - double forecast_period_bnds(time, bnds) ; - double forecast_reference_time ; - forecast_reference_time:units = "hours since 1970-01-01 00:00:00" ; - forecast_reference_time:standard_name = "forecast_reference_time" ; - forecast_reference_time:calendar = "360_day" ; - double height ; - height:units = "m" ; - height:standard_name = "height" ; - height:positive = "up" ; - short precipitation_flux(time, latitude, longitude) ; - precipitation_flux:scale_factor = 2.989738e-08f ; - precipitation_flux:add_offset = 0.0009796774f ; - precipitation_flux:standard_name = "precipitation_flux" ; - precipitation_flux:units = "kg m-2 s-1" ; - precipitation_flux:um_stash_source = "m01s05i216" ; - precipitation_flux:cell_methods = "time: mean (interval: 1 hour)" ; - precipitation_flux:grid_mapping = "latitude_longitude" ; - precipitation_flux:coordinates = "forecast_period forecast_reference_time" ; - short air_temperature_0(time, latitude, longitude) ; - air_temperature_0:scale_factor = 0.002014167f ; - air_temperature_0:add_offset = 242.7874f ; - air_temperature_0:standard_name = "air_temperature" ; - air_temperature_0:units = "K" ; - air_temperature_0:um_stash_source = "m01s03i236" ; - air_temperature_0:cell_methods = "time: minimum (interval: 1 hour)" ; - air_temperature_0:grid_mapping = "latitude_longitude" ; - 
air_temperature_0:coordinates = "forecast_period forecast_reference_time height" ; - -// global attributes: - :source = "Data from Met Office Unified Model" ; - :Conventions = "CF-1.7" ; -} diff --git a/lib/iris/tests/results/integration/netcdf/general/TestPackedData/single_packed_manual.cdl b/lib/iris/tests/results/integration/netcdf/general/TestPackedData/single_packed_manual.cdl deleted file mode 100644 index 83e7329575..0000000000 --- a/lib/iris/tests/results/integration/netcdf/general/TestPackedData/single_packed_manual.cdl +++ /dev/null @@ -1,50 +0,0 @@ -dimensions: - bnds = 2 ; - latitude = 73 ; - longitude = 96 ; -variables: - short air_temperature(latitude, longitude) ; - air_temperature:scale_factor = 0.001198068f ; - air_temperature:add_offset = 267.4006f ; - air_temperature:standard_name = "air_temperature" ; - air_temperature:units = "K" ; - air_temperature:um_stash_source = "m01s03i236" ; - air_temperature:cell_methods = "time: mean (interval: 6 hour)" ; - air_temperature:grid_mapping = "latitude_longitude" ; - air_temperature:coordinates = "forecast_period forecast_reference_time height time" ; - int latitude_longitude ; - latitude_longitude:grid_mapping_name = "latitude_longitude" ; - latitude_longitude:longitude_of_prime_meridian = 0. ; - latitude_longitude:earth_radius = 6371229. 
; - float latitude(latitude) ; - latitude:axis = "Y" ; - latitude:units = "degrees_north" ; - latitude:standard_name = "latitude" ; - float longitude(longitude) ; - longitude:axis = "X" ; - longitude:units = "degrees_east" ; - longitude:standard_name = "longitude" ; - double forecast_period ; - forecast_period:bounds = "forecast_period_bnds" ; - forecast_period:units = "hours" ; - forecast_period:standard_name = "forecast_period" ; - double forecast_period_bnds(bnds) ; - double forecast_reference_time ; - forecast_reference_time:units = "hours since 1970-01-01 00:00:00" ; - forecast_reference_time:standard_name = "forecast_reference_time" ; - forecast_reference_time:calendar = "standard" ; - double height ; - height:units = "m" ; - height:standard_name = "height" ; - height:positive = "up" ; - double time ; - time:bounds = "time_bnds" ; - time:units = "hours since 1970-01-01 00:00:00" ; - time:standard_name = "time" ; - time:calendar = "standard" ; - double time_bnds(bnds) ; - -// global attributes: - :source = "Data from Met Office Unified Model" ; - :Conventions = "CF-1.7" ; -} diff --git a/lib/iris/tests/results/integration/netcdf/general/TestPackedData/single_packed_signed.cdl b/lib/iris/tests/results/integration/netcdf/general/TestPackedData/single_packed_signed.cdl deleted file mode 100644 index 83e7329575..0000000000 --- a/lib/iris/tests/results/integration/netcdf/general/TestPackedData/single_packed_signed.cdl +++ /dev/null @@ -1,50 +0,0 @@ -dimensions: - bnds = 2 ; - latitude = 73 ; - longitude = 96 ; -variables: - short air_temperature(latitude, longitude) ; - air_temperature:scale_factor = 0.001198068f ; - air_temperature:add_offset = 267.4006f ; - air_temperature:standard_name = "air_temperature" ; - air_temperature:units = "K" ; - air_temperature:um_stash_source = "m01s03i236" ; - air_temperature:cell_methods = "time: mean (interval: 6 hour)" ; - air_temperature:grid_mapping = "latitude_longitude" ; - air_temperature:coordinates = "forecast_period 
forecast_reference_time height time" ; - int latitude_longitude ; - latitude_longitude:grid_mapping_name = "latitude_longitude" ; - latitude_longitude:longitude_of_prime_meridian = 0. ; - latitude_longitude:earth_radius = 6371229. ; - float latitude(latitude) ; - latitude:axis = "Y" ; - latitude:units = "degrees_north" ; - latitude:standard_name = "latitude" ; - float longitude(longitude) ; - longitude:axis = "X" ; - longitude:units = "degrees_east" ; - longitude:standard_name = "longitude" ; - double forecast_period ; - forecast_period:bounds = "forecast_period_bnds" ; - forecast_period:units = "hours" ; - forecast_period:standard_name = "forecast_period" ; - double forecast_period_bnds(bnds) ; - double forecast_reference_time ; - forecast_reference_time:units = "hours since 1970-01-01 00:00:00" ; - forecast_reference_time:standard_name = "forecast_reference_time" ; - forecast_reference_time:calendar = "standard" ; - double height ; - height:units = "m" ; - height:standard_name = "height" ; - height:positive = "up" ; - double time ; - time:bounds = "time_bnds" ; - time:units = "hours since 1970-01-01 00:00:00" ; - time:standard_name = "time" ; - time:calendar = "standard" ; - double time_bnds(bnds) ; - -// global attributes: - :source = "Data from Met Office Unified Model" ; - :Conventions = "CF-1.7" ; -} diff --git a/lib/iris/tests/results/integration/netcdf/general/TestPackedData/single_packed_unsigned.cdl b/lib/iris/tests/results/integration/netcdf/general/TestPackedData/single_packed_unsigned.cdl deleted file mode 100644 index 7b9114309e..0000000000 --- a/lib/iris/tests/results/integration/netcdf/general/TestPackedData/single_packed_unsigned.cdl +++ /dev/null @@ -1,50 +0,0 @@ -dimensions: - bnds = 2 ; - latitude = 73 ; - longitude = 96 ; -variables: - ubyte air_temperature(latitude, longitude) ; - air_temperature:scale_factor = 0.3079035f ; - air_temperature:add_offset = 228.1423f ; - air_temperature:standard_name = "air_temperature" ; - air_temperature:units 
= "K" ; - air_temperature:um_stash_source = "m01s03i236" ; - air_temperature:cell_methods = "time: mean (interval: 6 hour)" ; - air_temperature:grid_mapping = "latitude_longitude" ; - air_temperature:coordinates = "forecast_period forecast_reference_time height time" ; - int latitude_longitude ; - latitude_longitude:grid_mapping_name = "latitude_longitude" ; - latitude_longitude:longitude_of_prime_meridian = 0. ; - latitude_longitude:earth_radius = 6371229. ; - float latitude(latitude) ; - latitude:axis = "Y" ; - latitude:units = "degrees_north" ; - latitude:standard_name = "latitude" ; - float longitude(longitude) ; - longitude:axis = "X" ; - longitude:units = "degrees_east" ; - longitude:standard_name = "longitude" ; - double forecast_period ; - forecast_period:bounds = "forecast_period_bnds" ; - forecast_period:units = "hours" ; - forecast_period:standard_name = "forecast_period" ; - double forecast_period_bnds(bnds) ; - double forecast_reference_time ; - forecast_reference_time:units = "hours since 1970-01-01 00:00:00" ; - forecast_reference_time:standard_name = "forecast_reference_time" ; - forecast_reference_time:calendar = "standard" ; - double height ; - height:units = "m" ; - height:standard_name = "height" ; - height:positive = "up" ; - double time ; - time:bounds = "time_bnds" ; - time:units = "hours since 1970-01-01 00:00:00" ; - time:standard_name = "time" ; - time:calendar = "standard" ; - double time_bnds(bnds) ; - -// global attributes: - :source = "Data from Met Office Unified Model" ; - :Conventions = "CF-1.7" ; -} From b2c6d51d8003ae8f8a34885b0caa5f0f30baf5ec Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Fri, 27 Feb 2026 18:35:47 +0000 Subject: [PATCH 41/43] Fix test for refactored proxy constructor. 
--- .../helpers/test_build_and_add_auxiliary_coordinate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/iris/tests/unit/fileformats/nc_load_rules/helpers/test_build_and_add_auxiliary_coordinate.py b/lib/iris/tests/unit/fileformats/nc_load_rules/helpers/test_build_and_add_auxiliary_coordinate.py index 5ed3413409..94540d4ab9 100644 --- a/lib/iris/tests/unit/fileformats/nc_load_rules/helpers/test_build_and_add_auxiliary_coordinate.py +++ b/lib/iris/tests/unit/fileformats/nc_load_rules/helpers/test_build_and_add_auxiliary_coordinate.py @@ -171,7 +171,7 @@ class TestDtype(tests.IrisTest): def setUp(self): # Create coordinate cf variables and pyke engine. points = np.arange(6).reshape(2, 3) - cf_data = mock.MagicMock(_FillValue=None) + cf_data = mock.MagicMock(_FillValue=None, shape=points.shape) cf_data.chunking = mock.MagicMock(return_value=points.shape) self.engine = mock.Mock( From dfd4d918f3f083e17c197bc210e44e669a9d94b3 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Fri, 27 Feb 2026 18:56:41 +0000 Subject: [PATCH 42/43] Fix get_cf_var_data to support vlen-string. --- lib/iris/fileformats/netcdf/loader.py | 2 +- .../fileformats/netcdf/loader/test__get_cf_var_data.py | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/lib/iris/fileformats/netcdf/loader.py b/lib/iris/fileformats/netcdf/loader.py index 9607b393d9..32eea77db8 100644 --- a/lib/iris/fileformats/netcdf/loader.py +++ b/lib/iris/fileformats/netcdf/loader.py @@ -293,7 +293,7 @@ def _get_cf_var_data(cf_var): # Make a data-proxy that mimics array access and can fetch from the file. # Note: Special handling needed for "variable length string" types which # return a dtype of `str`, rather than a numpy type; use `S1` in this case. - if cf_var.dtype.kind == "U": + if getattr(cf_var.dtype, "kind", None) == "U": # Special handling for "string variables". 
fill_value = "" else: diff --git a/lib/iris/tests/unit/fileformats/netcdf/loader/test__get_cf_var_data.py b/lib/iris/tests/unit/fileformats/netcdf/loader/test__get_cf_var_data.py index e29f0de012..876ce65f25 100644 --- a/lib/iris/tests/unit/fileformats/netcdf/loader/test__get_cf_var_data.py +++ b/lib/iris/tests/unit/fileformats/netcdf/loader/test__get_cf_var_data.py @@ -26,14 +26,15 @@ def setUp(self): self.expected_chunks = _optimum_chunksize(self.shape, self.shape) def _make(self, chunksizes=None, shape=None, dtype="i4", **extra_properties): + if shape is None: + shape = self.shape cf_data = mock.MagicMock( _FillValue=None, __getitem__="", - dimensions=["dim_" + str(x) for x in range(len(shape or "1"))], + dimensions=["dim_" + str(x) for x in range(len(shape))], + shape=shape, ) cf_data.chunking = mock.MagicMock(return_value=chunksizes) - if shape is None: - shape = self.shape if dtype is not str: # for testing VLen str arrays (dtype=`class `) dtype = np.dtype(dtype) cf_var = mock.MagicMock( From 274fae4014835c25b748e49d6a8ff1c880723802 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Fri, 27 Feb 2026 18:58:04 +0000 Subject: [PATCH 43/43] Add back new test results, folder removed in error. 
--- .../multi_packed_multi_dtype.cdl | 68 ++++++++++++++++++ .../multi_packed_single_dtype.cdl | 70 +++++++++++++++++++ .../TestPackedData/single_packed_manual.cdl | 50 +++++++++++++ .../TestPackedData/single_packed_signed.cdl | 50 +++++++++++++ .../TestPackedData/single_packed_unsigned.cdl | 50 +++++++++++++ 5 files changed, 288 insertions(+) create mode 100644 lib/iris/tests/results/integration/netcdf/general/TestPackedData/multi_packed_multi_dtype.cdl create mode 100644 lib/iris/tests/results/integration/netcdf/general/TestPackedData/multi_packed_single_dtype.cdl create mode 100644 lib/iris/tests/results/integration/netcdf/general/TestPackedData/single_packed_manual.cdl create mode 100644 lib/iris/tests/results/integration/netcdf/general/TestPackedData/single_packed_signed.cdl create mode 100644 lib/iris/tests/results/integration/netcdf/general/TestPackedData/single_packed_unsigned.cdl diff --git a/lib/iris/tests/results/integration/netcdf/general/TestPackedData/multi_packed_multi_dtype.cdl b/lib/iris/tests/results/integration/netcdf/general/TestPackedData/multi_packed_multi_dtype.cdl new file mode 100644 index 0000000000..27d8f55a45 --- /dev/null +++ b/lib/iris/tests/results/integration/netcdf/general/TestPackedData/multi_packed_multi_dtype.cdl @@ -0,0 +1,68 @@ +dimensions: + bnds = 2 ; + latitude = 73 ; + longitude = 96 ; + time = 360 ; +variables: + short air_temperature(time, latitude, longitude) ; + air_temperature:standard_name = "air_temperature" ; + air_temperature:units = "K" ; + air_temperature:scale_factor = 0.00242575f ; + air_temperature:add_offset = 261.648f ; + air_temperature:um_stash_source = "m01s03i236" ; + air_temperature:cell_methods = "time: maximum (interval: 1 hour)" ; + air_temperature:grid_mapping = "latitude_longitude" ; + air_temperature:coordinates = "forecast_period forecast_reference_time height" ; + int latitude_longitude ; + latitude_longitude:grid_mapping_name = "latitude_longitude" ; + 
latitude_longitude:longitude_of_prime_meridian = 0. ; + latitude_longitude:earth_radius = 6371229. ; + double time(time) ; + time:axis = "T" ; + time:bounds = "time_bnds" ; + time:units = "hours since 1970-01-01 00:00:00" ; + time:standard_name = "time" ; + time:calendar = "360_day" ; + double time_bnds(time, bnds) ; + float latitude(latitude) ; + latitude:axis = "Y" ; + latitude:units = "degrees_north" ; + latitude:standard_name = "latitude" ; + float longitude(longitude) ; + longitude:axis = "X" ; + longitude:units = "degrees_east" ; + longitude:standard_name = "longitude" ; + double forecast_period(time) ; + forecast_period:bounds = "forecast_period_bnds" ; + forecast_period:units = "hours" ; + forecast_period:standard_name = "forecast_period" ; + double forecast_period_bnds(time, bnds) ; + double forecast_reference_time ; + forecast_reference_time:units = "hours since 1970-01-01 00:00:00" ; + forecast_reference_time:standard_name = "forecast_reference_time" ; + forecast_reference_time:calendar = "360_day" ; + double height ; + height:units = "m" ; + height:standard_name = "height" ; + height:positive = "up" ; + float precipitation_flux(time, latitude, longitude) ; + precipitation_flux:standard_name = "precipitation_flux" ; + precipitation_flux:units = "kg m-2 s-1" ; + precipitation_flux:um_stash_source = "m01s05i216" ; + precipitation_flux:cell_methods = "time: mean (interval: 1 hour)" ; + precipitation_flux:grid_mapping = "latitude_longitude" ; + precipitation_flux:coordinates = "forecast_period forecast_reference_time" ; + ushort air_temperature_0(time, latitude, longitude) ; + air_temperature_0:standard_name = "air_temperature" ; + air_temperature_0:units = "K" ; + air_temperature_0:scale_factor = 0.002014167f ; + air_temperature_0:add_offset = 176.7872f ; + air_temperature_0:um_stash_source = "m01s03i236" ; + air_temperature_0:cell_methods = "time: minimum (interval: 1 hour)" ; + air_temperature_0:grid_mapping = "latitude_longitude" ; + 
air_temperature_0:coordinates = "forecast_period forecast_reference_time height" ; + +// global attributes: + :source = "Data from Met Office Unified Model" ; + :Conventions = "CF-1.7" ; +} diff --git a/lib/iris/tests/results/integration/netcdf/general/TestPackedData/multi_packed_single_dtype.cdl b/lib/iris/tests/results/integration/netcdf/general/TestPackedData/multi_packed_single_dtype.cdl new file mode 100644 index 0000000000..c85fd35efd --- /dev/null +++ b/lib/iris/tests/results/integration/netcdf/general/TestPackedData/multi_packed_single_dtype.cdl @@ -0,0 +1,70 @@ +dimensions: + bnds = 2 ; + latitude = 73 ; + longitude = 96 ; + time = 360 ; +variables: + short air_temperature(time, latitude, longitude) ; + air_temperature:standard_name = "air_temperature" ; + air_temperature:units = "K" ; + air_temperature:scale_factor = 0.00242575f ; + air_temperature:add_offset = 261.648f ; + air_temperature:um_stash_source = "m01s03i236" ; + air_temperature:cell_methods = "time: maximum (interval: 1 hour)" ; + air_temperature:grid_mapping = "latitude_longitude" ; + air_temperature:coordinates = "forecast_period forecast_reference_time height" ; + int latitude_longitude ; + latitude_longitude:grid_mapping_name = "latitude_longitude" ; + latitude_longitude:longitude_of_prime_meridian = 0. ; + latitude_longitude:earth_radius = 6371229. 
; + double time(time) ; + time:axis = "T" ; + time:bounds = "time_bnds" ; + time:units = "hours since 1970-01-01 00:00:00" ; + time:standard_name = "time" ; + time:calendar = "360_day" ; + double time_bnds(time, bnds) ; + float latitude(latitude) ; + latitude:axis = "Y" ; + latitude:units = "degrees_north" ; + latitude:standard_name = "latitude" ; + float longitude(longitude) ; + longitude:axis = "X" ; + longitude:units = "degrees_east" ; + longitude:standard_name = "longitude" ; + double forecast_period(time) ; + forecast_period:bounds = "forecast_period_bnds" ; + forecast_period:units = "hours" ; + forecast_period:standard_name = "forecast_period" ; + double forecast_period_bnds(time, bnds) ; + double forecast_reference_time ; + forecast_reference_time:units = "hours since 1970-01-01 00:00:00" ; + forecast_reference_time:standard_name = "forecast_reference_time" ; + forecast_reference_time:calendar = "360_day" ; + double height ; + height:units = "m" ; + height:standard_name = "height" ; + height:positive = "up" ; + short precipitation_flux(time, latitude, longitude) ; + precipitation_flux:standard_name = "precipitation_flux" ; + precipitation_flux:units = "kg m-2 s-1" ; + precipitation_flux:scale_factor = 2.989738e-08f ; + precipitation_flux:add_offset = 0.0009796774f ; + precipitation_flux:um_stash_source = "m01s05i216" ; + precipitation_flux:cell_methods = "time: mean (interval: 1 hour)" ; + precipitation_flux:grid_mapping = "latitude_longitude" ; + precipitation_flux:coordinates = "forecast_period forecast_reference_time" ; + short air_temperature_0(time, latitude, longitude) ; + air_temperature_0:standard_name = "air_temperature" ; + air_temperature_0:units = "K" ; + air_temperature_0:scale_factor = 0.002014167f ; + air_temperature_0:add_offset = 242.7874f ; + air_temperature_0:um_stash_source = "m01s03i236" ; + air_temperature_0:cell_methods = "time: minimum (interval: 1 hour)" ; + air_temperature_0:grid_mapping = "latitude_longitude" ; + 
air_temperature_0:coordinates = "forecast_period forecast_reference_time height" ; + +// global attributes: + :source = "Data from Met Office Unified Model" ; + :Conventions = "CF-1.7" ; +} diff --git a/lib/iris/tests/results/integration/netcdf/general/TestPackedData/single_packed_manual.cdl b/lib/iris/tests/results/integration/netcdf/general/TestPackedData/single_packed_manual.cdl new file mode 100644 index 0000000000..ed89a25d9f --- /dev/null +++ b/lib/iris/tests/results/integration/netcdf/general/TestPackedData/single_packed_manual.cdl @@ -0,0 +1,50 @@ +dimensions: + bnds = 2 ; + latitude = 73 ; + longitude = 96 ; +variables: + short air_temperature(latitude, longitude) ; + air_temperature:standard_name = "air_temperature" ; + air_temperature:units = "K" ; + air_temperature:scale_factor = 0.001198068f ; + air_temperature:add_offset = 267.4006f ; + air_temperature:um_stash_source = "m01s03i236" ; + air_temperature:cell_methods = "time: mean (interval: 6 hour)" ; + air_temperature:grid_mapping = "latitude_longitude" ; + air_temperature:coordinates = "forecast_period forecast_reference_time height time" ; + int latitude_longitude ; + latitude_longitude:grid_mapping_name = "latitude_longitude" ; + latitude_longitude:longitude_of_prime_meridian = 0. ; + latitude_longitude:earth_radius = 6371229. 
; + float latitude(latitude) ; + latitude:axis = "Y" ; + latitude:units = "degrees_north" ; + latitude:standard_name = "latitude" ; + float longitude(longitude) ; + longitude:axis = "X" ; + longitude:units = "degrees_east" ; + longitude:standard_name = "longitude" ; + double forecast_period ; + forecast_period:bounds = "forecast_period_bnds" ; + forecast_period:units = "hours" ; + forecast_period:standard_name = "forecast_period" ; + double forecast_period_bnds(bnds) ; + double forecast_reference_time ; + forecast_reference_time:units = "hours since 1970-01-01 00:00:00" ; + forecast_reference_time:standard_name = "forecast_reference_time" ; + forecast_reference_time:calendar = "standard" ; + double height ; + height:units = "m" ; + height:standard_name = "height" ; + height:positive = "up" ; + double time ; + time:bounds = "time_bnds" ; + time:units = "hours since 1970-01-01 00:00:00" ; + time:standard_name = "time" ; + time:calendar = "standard" ; + double time_bnds(bnds) ; + +// global attributes: + :source = "Data from Met Office Unified Model" ; + :Conventions = "CF-1.7" ; +} diff --git a/lib/iris/tests/results/integration/netcdf/general/TestPackedData/single_packed_signed.cdl b/lib/iris/tests/results/integration/netcdf/general/TestPackedData/single_packed_signed.cdl new file mode 100644 index 0000000000..ed89a25d9f --- /dev/null +++ b/lib/iris/tests/results/integration/netcdf/general/TestPackedData/single_packed_signed.cdl @@ -0,0 +1,50 @@ +dimensions: + bnds = 2 ; + latitude = 73 ; + longitude = 96 ; +variables: + short air_temperature(latitude, longitude) ; + air_temperature:standard_name = "air_temperature" ; + air_temperature:units = "K" ; + air_temperature:scale_factor = 0.001198068f ; + air_temperature:add_offset = 267.4006f ; + air_temperature:um_stash_source = "m01s03i236" ; + air_temperature:cell_methods = "time: mean (interval: 6 hour)" ; + air_temperature:grid_mapping = "latitude_longitude" ; + air_temperature:coordinates = "forecast_period 
forecast_reference_time height time" ; + int latitude_longitude ; + latitude_longitude:grid_mapping_name = "latitude_longitude" ; + latitude_longitude:longitude_of_prime_meridian = 0. ; + latitude_longitude:earth_radius = 6371229. ; + float latitude(latitude) ; + latitude:axis = "Y" ; + latitude:units = "degrees_north" ; + latitude:standard_name = "latitude" ; + float longitude(longitude) ; + longitude:axis = "X" ; + longitude:units = "degrees_east" ; + longitude:standard_name = "longitude" ; + double forecast_period ; + forecast_period:bounds = "forecast_period_bnds" ; + forecast_period:units = "hours" ; + forecast_period:standard_name = "forecast_period" ; + double forecast_period_bnds(bnds) ; + double forecast_reference_time ; + forecast_reference_time:units = "hours since 1970-01-01 00:00:00" ; + forecast_reference_time:standard_name = "forecast_reference_time" ; + forecast_reference_time:calendar = "standard" ; + double height ; + height:units = "m" ; + height:standard_name = "height" ; + height:positive = "up" ; + double time ; + time:bounds = "time_bnds" ; + time:units = "hours since 1970-01-01 00:00:00" ; + time:standard_name = "time" ; + time:calendar = "standard" ; + double time_bnds(bnds) ; + +// global attributes: + :source = "Data from Met Office Unified Model" ; + :Conventions = "CF-1.7" ; +} diff --git a/lib/iris/tests/results/integration/netcdf/general/TestPackedData/single_packed_unsigned.cdl b/lib/iris/tests/results/integration/netcdf/general/TestPackedData/single_packed_unsigned.cdl new file mode 100644 index 0000000000..eedad33e03 --- /dev/null +++ b/lib/iris/tests/results/integration/netcdf/general/TestPackedData/single_packed_unsigned.cdl @@ -0,0 +1,50 @@ +dimensions: + bnds = 2 ; + latitude = 73 ; + longitude = 96 ; +variables: + ubyte air_temperature(latitude, longitude) ; + air_temperature:standard_name = "air_temperature" ; + air_temperature:units = "K" ; + air_temperature:scale_factor = 0.3079035f ; + air_temperature:add_offset = 
228.1423f ; + air_temperature:um_stash_source = "m01s03i236" ; + air_temperature:cell_methods = "time: mean (interval: 6 hour)" ; + air_temperature:grid_mapping = "latitude_longitude" ; + air_temperature:coordinates = "forecast_period forecast_reference_time height time" ; + int latitude_longitude ; + latitude_longitude:grid_mapping_name = "latitude_longitude" ; + latitude_longitude:longitude_of_prime_meridian = 0. ; + latitude_longitude:earth_radius = 6371229. ; + float latitude(latitude) ; + latitude:axis = "Y" ; + latitude:units = "degrees_north" ; + latitude:standard_name = "latitude" ; + float longitude(longitude) ; + longitude:axis = "X" ; + longitude:units = "degrees_east" ; + longitude:standard_name = "longitude" ; + double forecast_period ; + forecast_period:bounds = "forecast_period_bnds" ; + forecast_period:units = "hours" ; + forecast_period:standard_name = "forecast_period" ; + double forecast_period_bnds(bnds) ; + double forecast_reference_time ; + forecast_reference_time:units = "hours since 1970-01-01 00:00:00" ; + forecast_reference_time:standard_name = "forecast_reference_time" ; + forecast_reference_time:calendar = "standard" ; + double height ; + height:units = "m" ; + height:standard_name = "height" ; + height:positive = "up" ; + double time ; + time:bounds = "time_bnds" ; + time:units = "hours since 1970-01-01 00:00:00" ; + time:standard_name = "time" ; + time:calendar = "standard" ; + double time_bnds(bnds) ; + +// global attributes: + :source = "Data from Met Office Unified Model" ; + :Conventions = "CF-1.7" ; +}