diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs
index 84be8e384a..7001573f5b 100644
--- a/.git-blame-ignore-revs
+++ b/.git-blame-ignore-revs
@@ -11,6 +11,8 @@ a71cad2dd6ace5741a754e2ca7daacd4bb094e0e
 2c2402ed59c91164eaff46dee0f79386b7347e9e
 05b7c571544c3bcb153fce67d12b9ac48947fc2d
 c8f38049359170a34c915e209276238ea66b9a1e
+65880e4af16cb95438437bf057c4b9fdb427b142
+d1cfb8abd6aa5c76e6c1a4d7ab20929c65f8afc2
 8d5cb39e03c2644715a50684f8cd0318b4e82676
 ec69e8838be2dde140a915e50506f8e1ce3cb534
 f2bc0488a298f136294c523bc5ab4086d090059b
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e8c4657fb0..11ff1df22e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -296,11 +296,12 @@ set(BOUT_SOURCES
     ./src/mesh/interpolation/hermite_spline_z.cxx
     ./src/mesh/interpolation/interpolation_z.cxx
     ./src/mesh/interpolation/lagrange_4pt_xz.cxx
-    ./src/mesh/interpolation/monotonic_hermite_spline_xz.cxx
     ./src/mesh/invert3x3.hxx
     ./src/mesh/mesh.cxx
     ./src/mesh/parallel/fci.cxx
     ./src/mesh/parallel/fci.hxx
+    ./src/mesh/parallel/fci_comm.cxx
+    ./src/mesh/parallel/fci_comm.hxx
     ./src/mesh/parallel/identity.cxx
     ./src/mesh/parallel/shiftedmetric.cxx
     ./src/mesh/parallel/shiftedmetricinterp.cxx
diff --git a/include/bout/interpolation_xz.hxx b/include/bout/interpolation_xz.hxx
index 4dd24259fd..d14a5eeca0 100644
--- a/include/bout/interpolation_xz.hxx
+++ b/include/bout/interpolation_xz.hxx
@@ -27,17 +27,18 @@
 #include <bout/bout_types.hxx>
 #include <bout/generic_factory.hxx>
 #include <bout/mask.hxx>
+#include <array>
 
-#define USE_NEW_WEIGHTS 1
 #if BOUT_HAS_PETSC
-#define HS_USE_PETSC 1
-#endif
-
-#ifdef HS_USE_PETSC
 #include "bout/petsclib.hxx"
 #endif
 
+namespace {
+enum class implementation_type { new_weights, petsc, legacy };
+}
+
 class Options;
+class GlobalField3DAccess;
 
 /// Interpolate a field onto a perturbed set of points
 const Field3D interpolate(const Field3D& f, const Field3D& delta_x,
@@ -135,7 +136,16 @@ public:
   }
 };
 
-class XZHermiteSpline : public XZInterpolation {
+/// Monotonic Hermite spline interpolator
+///
+/// Similar to XZHermiteSpline, so uses most of the same code.
+/// Forces the interpolated result to be in the range of the
+/// neighbouring cell values. This prevents unphysical overshoots,
+/// but also degrades accuracy near maxima and minima.
+/// Perhaps should only impose near boundaries, since that is where
+/// problems most obviously occur.
+template <bool monotonic, implementation_type imp_type>
+class XZHermiteSplineBase : public XZInterpolation {
 protected:
   /// This is protected rather than private so that it can be
   /// extended and used by HermiteSplineMonotonic
@@ -143,6 +153,9 @@ protected:
   Tensor<SpecificInd<IND_TYPE::IND_3D>> i_corner; // index of bottom-left grid point
   Tensor<int> k_corner;                           // z-index of bottom-left grid point
 
+  std::unique_ptr<GlobalField3DAccess> gf3daccess;
+  Tensor<std::array<int, 4>> g3dinds;
+
   // Basis functions for cubic Hermite spline interpolation
   //    see http://en.wikipedia.org/wiki/Cubic_Hermite_spline
   // The h00 and h01 basis functions are applied to the function itself
@@ -160,23 +173,28 @@ protected:
 
   std::vector<Field3D> newWeights;
 
-#if HS_USE_PETSC
+#if BOUT_HAS_PETSC
   PetscLib* petsclib;
   bool isInit{false};
-  Mat petscWeights;
-  Vec rhs, result;
+  Mat petscWeights{};
+  Vec rhs{}, result{};
 #endif
 
+  /// Factors to allow for some wiggleroom
+  BoutReal abs_fac_monotonic{1e-2};
+  BoutReal rel_fac_monotonic{1e-1};
+
 public:
-  XZHermiteSpline(Mesh* mesh = nullptr, [[maybe_unused]] Options* options = nullptr)
-      : XZHermiteSpline(0, mesh) {}
-  XZHermiteSpline(int y_offset = 0, Mesh* mesh = nullptr);
-  XZHermiteSpline(const BoutMask& mask, int y_offset = 0, Mesh* mesh = nullptr)
-      : XZHermiteSpline(y_offset, mesh) {
+  XZHermiteSplineBase(Mesh* mesh = nullptr, [[maybe_unused]] Options* options = nullptr)
+      : XZHermiteSplineBase(0, mesh, options) {}
+  XZHermiteSplineBase(int y_offset = 0, Mesh* mesh = nullptr, Options* options = nullptr);
+  XZHermiteSplineBase(const BoutMask& mask, int y_offset = 0, Mesh* mesh = nullptr,
+                      Options* options = nullptr)
+      : XZHermiteSplineBase(y_offset, mesh, options) {
     setRegion(regionFromMask(mask, localmesh));
   }
-  ~XZHermiteSpline() {
-#if HS_USE_PETSC
+  ~XZHermiteSplineBase() override {
+#if BOUT_HAS_PETSC
     if (isInit) {
       MatDestroy(&petscWeights);
       VecDestroy(&rhs);
@@ -205,61 +223,21 @@ public:
   getWeightsForYApproximation(int i, int j, int k, int yoffset) override;
 };
 
-/// Monotonic Hermite spline interpolator
-///
-/// Similar to XZHermiteSpline, so uses most of the same code.
-/// Forces the interpolated result to be in the range of the
-/// neighbouring cell values. This prevents unphysical overshoots,
-/// but also degrades accuracy near maxima and minima.
-/// Perhaps should only impose near boundaries, since that is where
-/// problems most obviously occur.
-///
-/// You can control how tight the clipping to the range of the neighbouring cell
-/// values through ``rtol`` and ``atol``:
-///
-///     diff = (max_of_neighours - min_of_neighours) * rtol + atol
-///
-/// and the interpolated value is instead clipped to the range
-/// ``[min_of_neighours - diff, max_of_neighours + diff]``
-class XZMonotonicHermiteSpline : public XZHermiteSpline {
-  /// Absolute tolerance for clipping
-  BoutReal atol = 0.0;
-  /// Relative tolerance for clipping
-  BoutReal rtol = 1.0;
-
-public:
-  XZMonotonicHermiteSpline(Mesh* mesh = nullptr, Options* options = nullptr)
-      : XZHermiteSpline(0, mesh),
-        atol{(*options)["atol"]
-                 .doc("Absolute tolerance for clipping overshoot")
-                 .withDefault(0.0)},
-        rtol{(*options)["rtol"]
-                 .doc("Relative tolerance for clipping overshoot")
-                 .withDefault(1.0)} {
-    if (localmesh->getNXPE() > 1) {
-      throw BoutException("Do not support MPI splitting in X");
-    }
-  }
-  XZMonotonicHermiteSpline(int y_offset = 0, Mesh* mesh = nullptr)
-      : XZHermiteSpline(y_offset, mesh) {
-    if (localmesh->getNXPE() > 1) {
-      throw BoutException("Do not support MPI splitting in X");
-    }
-  }
-  XZMonotonicHermiteSpline(const BoutMask& mask, int y_offset = 0, Mesh* mesh = nullptr)
-      : XZHermiteSpline(mask, y_offset, mesh) {
-    if (localmesh->getNXPE() > 1) {
-      throw BoutException("Do not support MPI splitting in X");
-    }
-  }
-
-  using XZHermiteSpline::interpolate;
-  /// Interpolate using precalculated weights.
-  /// This function is called by the other interpolate functions
-  /// in the base class XZHermiteSpline.
-  Field3D interpolate(const Field3D& f,
-                      const std::string& region = "RGN_NOBNDRY") const override;
-};
+using XZMonotonicHermiteSplineSerial =
+    XZHermiteSplineBase<true, implementation_type::new_weights>;
+using XZHermiteSplineSerial =
+    XZHermiteSplineBase<false, implementation_type::new_weights>;
+using XZMonotonicHermiteSplineLegacy =
+    XZHermiteSplineBase<true, implementation_type::legacy>;
+using XZHermiteSplineLegacy = XZHermiteSplineBase<false, implementation_type::legacy>;
+#if BOUT_HAS_PETSC
+using XZMonotonicHermiteSpline = XZHermiteSplineBase<true, implementation_type::petsc>;
+using XZHermiteSpline = XZHermiteSplineBase<false, implementation_type::petsc>;
+#else
+using XZMonotonicHermiteSpline =
+    XZHermiteSplineBase<true, implementation_type::new_weights>;
+using XZHermiteSpline = XZHermiteSplineBase<false, implementation_type::new_weights>;
+#endif
 
 /// XZLagrange4pt interpolation class
 ///
diff --git a/include/bout/mesh.hxx b/include/bout/mesh.hxx
index 502b0c5668..a1ed6a9011 100644
--- a/include/bout/mesh.hxx
+++ b/include/bout/mesh.hxx
@@ -360,6 +360,8 @@ public:
   virtual int getXProcIndex() const = 0; ///< This processor's index in X direction
   virtual int getYProcIndex() const = 0; ///< This processor's index in Y direction
   virtual int getZProcIndex() const = 0; ///< This processor's index in Z direction
+  /// The index of a processor with given X, Y, and Z index
+  virtual int getProcIndex(int X, int Y, int Z) const = 0;
 
   // X communications
   virtual bool firstX()
diff --git a/include/bout/msg_stack.hxx b/include/bout/msg_stack.hxx
index 1abb26d2c7..0fe22cd0e6 100644
--- a/include/bout/msg_stack.hxx
+++ b/include/bout/msg_stack.hxx
@@ -80,7 +80,7 @@ public:
   }
 
   void pop() {}
-  void pop(int [[maybe_unused]] id) {}
+  void pop([[maybe_unused]] int id) {}
   void clear() {}
 
   void dump() {}
diff --git a/include/bout/region.hxx b/include/bout/region.hxx
index bb1cf82bf1..17ded65987 100644
--- a/include/bout/region.hxx
+++ b/include/bout/region.hxx
@@ -139,7 +139,7 @@ class BoutMask;
   BOUT_FOR_OMP(index, (region), for schedule(BOUT_OPENMP_SCHEDULE) nowait)
 // NOLINTEND(cppcoreguidelines-macro-usage,bugprone-macro-parentheses)
 
-enum class IND_TYPE { IND_3D = 0, IND_2D = 1, IND_PERP = 2 };
+enum class IND_TYPE { IND_3D = 0, IND_2D = 1, IND_PERP = 2, IND_GLOBAL_3D = 3 };
 
 /// Indices base class for Fields -- Regions are dereferenced into these
 ///
@@ -386,6 +386,7 @@ inline SpecificInd<N> operator-(SpecificInd<N> lhs, const SpecificInd<N>& rhs) {
 using Ind3D = SpecificInd<IND_TYPE::IND_3D>;
 using Ind2D = SpecificInd<IND_TYPE::IND_2D>;
 using IndPerp = SpecificInd<IND_TYPE::IND_PERP>;
+using IndG3D = SpecificInd<IND_TYPE::IND_GLOBAL_3D>;
 
 /// Get string representation of Ind3D
 inline std::string toString(const Ind3D& i) {
diff --git a/src/mesh/impls/bout/boutmesh.cxx b/src/mesh/impls/bout/boutmesh.cxx
index d0bdcf33d8..3dad85f211 100644
--- a/src/mesh/impls/bout/boutmesh.cxx
+++ b/src/mesh/impls/bout/boutmesh.cxx
@@ -534,6 +534,8 @@ int BoutMesh::load() {
   PE_XIND = MYPE % NXPE;
   PE_ZIND = 0;
 
+  ASSERT2(MYPE == getProcIndex(PE_XIND, PE_YIND, PE_ZIND));
+
   // Set the other grid sizes from nx, ny, nz
   setDerivedGridSizes();
 
@@ -1015,6 +1017,8 @@ void BoutMesh::createXBoundaries() {
   }
 }
 
+int BoutMesh::getProcIndex(int X, int Y, int Z) const { return Y * NXPE + X; }
+
 void BoutMesh::createYBoundaries() {
   if (MYG <= 0) {
     return;
diff --git a/src/mesh/impls/bout/boutmesh.hxx b/src/mesh/impls/bout/boutmesh.hxx
index 3923c34511..b42bf325b5 100644
--- a/src/mesh/impls/bout/boutmesh.hxx
+++ b/src/mesh/impls/bout/boutmesh.hxx
@@ -63,6 +63,7 @@ public:
   int getXProcIndex() const override; ///< This processor's index in X direction
   int getYProcIndex() const override; ///< This processor's index in Y direction
   int getZProcIndex() const override; ///< This processor's index in Z direction
+  int getProcIndex(int X, int Y, int Z) const override;
 
   /////////////////////////////////////////////
   // X communications
diff --git a/src/mesh/interpolation/hermite_spline_xz.cxx b/src/mesh/interpolation/hermite_spline_xz.cxx
index 27a4f1d614..48e7d2904e 100644
--- a/src/mesh/interpolation/hermite_spline_xz.cxx
+++ b/src/mesh/interpolation/hermite_spline_xz.cxx
@@ -21,11 +21,18 @@
  **************************************************************************/
 
 #include "../impls/bout/boutmesh.hxx"
+#include "../parallel/fci_comm.hxx"
 #include "bout/bout.hxx"
+#include "bout/boutexception.hxx"
+#include "bout/build_defines.hxx"
 #include "bout/globals.hxx"
 #include "bout/index_derivs_interface.hxx"
 #include "bout/interpolation_xz.hxx"
+#include "bout/mask.hxx"
 
+#include <algorithm>
+#include <memory>
+#include <string>
 #include <vector>
 
 class IndConverter {
@@ -35,9 +42,9 @@ class IndConverter {
         xstart(mesh->xstart), ystart(mesh->ystart), zstart(0),
         lnx(mesh->LocalNx - 2 * xstart), lny(mesh->LocalNy - 2 * ystart),
         lnz(mesh->LocalNz - 2 * zstart) {}
-  // ix and iy are global indices
+  // ix and iz are global indices
   // iy is local
-  int fromMeshToGlobal(int ix, int iy, int iz) {
+  int fromMeshToGlobal(int ix, int iy, int iz) const {
     const int xstart = mesh->xstart;
     const int lnx = mesh->LocalNx - xstart * 2;
     // x-proc-id
@@ -71,12 +78,12 @@ class IndConverter {
     return fromLocalToGlobal(ix - pex * lnx, iy - pey_offset * lny, iz - pez * lnz, pex,
                              pey, 0);
   }
-  int fromLocalToGlobal(const int ilocalx, const int ilocaly, const int ilocalz) {
+  int fromLocalToGlobal(const int ilocalx, const int ilocaly, const int ilocalz) const {
     return fromLocalToGlobal(ilocalx, ilocaly, ilocalz, mesh->getXProcIndex(),
                              mesh->getYProcIndex(), 0);
   }
   int fromLocalToGlobal(const int ilocalx, const int ilocaly, const int ilocalz,
-                        const int pex, const int pey, const int pez) {
+                        const int pex, const int pey, const int pez) const {
     ASSERT3(ilocalx >= 0);
     ASSERT3(ilocaly >= 0);
     ASSERT3(ilocalz >= 0);
@@ -102,11 +109,21 @@ class IndConverter {
   }
 };
 
-XZHermiteSpline::XZHermiteSpline(int y_offset, Mesh* meshin)
+template <bool monotonic, implementation_type imp_type>
+XZHermiteSplineBase<monotonic, imp_type>::XZHermiteSplineBase(int y_offset, Mesh* meshin,
+                                                              Options* options)
     : XZInterpolation(y_offset, meshin), h00_x(localmesh), h01_x(localmesh),
       h10_x(localmesh), h11_x(localmesh), h00_z(localmesh), h01_z(localmesh),
       h10_z(localmesh), h11_z(localmesh) {
 
+  if constexpr (monotonic) {
+    if (options == nullptr) {
+      options = &Options::root()["mesh:paralleltransform:xzinterpolation"];
+    }
+    abs_fac_monotonic = (*options)["abs_tol"].withDefault(abs_fac_monotonic);
+    rel_fac_monotonic = (*options)["rel_tol"].withDefault(rel_fac_monotonic);
+  }
+
   // Index arrays contain guard cells in order to get subscripts right
   i_corner.reallocate(localmesh->LocalNx, localmesh->LocalNy, localmesh->LocalNz);
   k_corner.reallocate(localmesh->LocalNx, localmesh->LocalNy, localmesh->LocalNz);
@@ -125,37 +142,48 @@ XZHermiteSpline::XZHermiteSpline(int y_offset, Mesh* meshin)
   h10_z.allocate();
   h11_z.allocate();
 
-#if USE_NEW_WEIGHTS
-  newWeights.reserve(16);
-  for (int w = 0; w < 16; ++w) {
-    newWeights.emplace_back(localmesh);
-    newWeights[w].allocate();
+  if constexpr (imp_type == implementation_type::new_weights
+                || imp_type == implementation_type::petsc) {
+    newWeights.reserve(16);
+    for (int w = 0; w < 16; ++w) {
+      newWeights.emplace_back(localmesh);
+      newWeights[w].allocate();
+    }
   }
-#ifdef HS_USE_PETSC
-  petsclib = new PetscLib(
-      &Options::root()["mesh:paralleltransform:xzinterpolation:hermitespline"]);
-  const int m = localmesh->LocalNx * localmesh->LocalNy * localmesh->LocalNz;
-  const int M = m * localmesh->getNXPE() * localmesh->getNYPE();
-  MatCreateAIJ(BoutComm::get(), m, m, M, M, 16, nullptr, 16, nullptr, &petscWeights);
-#endif
+  if constexpr (imp_type == implementation_type::petsc) {
+#if BOUT_HAS_PETSC
+    petsclib = new PetscLib(
+        &Options::root()["mesh:paralleltransform:xzinterpolation:hermitespline"]);
+    const int m = localmesh->LocalNx * localmesh->LocalNy * localmesh->LocalNz;
+    const int M = m * localmesh->getNXPE() * localmesh->getNYPE() * localmesh->getNZPE();
+    MatCreateAIJ(BoutComm::get(), m, m, M, M, 16, nullptr, 16, nullptr, &petscWeights);
 #endif
-#ifndef HS_USE_PETSC
-  if (localmesh->getNXPE() > 1) {
-    throw BoutException("Require PETSc for MPI splitting in X");
   }
-#endif
+  if constexpr (monotonic) {
+    gf3daccess = std::make_unique<GlobalField3DAccess>(localmesh);
+    g3dinds.reallocate(localmesh->LocalNx, localmesh->LocalNy, localmesh->LocalNz);
+  }
+  if constexpr (imp_type == implementation_type::new_weights
+                || imp_type == implementation_type::legacy) {
+    if (localmesh->getNXPE() > 1) {
+      throw BoutException("Require PETSc for MPI splitting in X");
+    }
+  }
 }
 
-void XZHermiteSpline::calcWeights(const Field3D& delta_x, const Field3D& delta_z,
-                                  const std::string& region) {
+template <bool monotonic, implementation_type imp_type>
+void XZHermiteSplineBase<monotonic, imp_type>::calcWeights(
+    const Field3D& delta_x, const Field3D& delta_z,
+    [[maybe_unused]] const std::string& region) {
 
   const int ny = localmesh->LocalNy;
   const int nz = localmesh->LocalNz;
   const int xend = (localmesh->xend - localmesh->xstart + 1) * localmesh->getNXPE()
                    + localmesh->xstart - 1;
-#ifdef HS_USE_PETSC
-  IndConverter conv{localmesh};
-#endif
+  [[maybe_unused]] const IndConverter conv{localmesh};
+
+  [[maybe_unused]] const int y_global_offset =
+      localmesh->getYProcIndex() * (localmesh->yend - localmesh->ystart + 1);
   BOUT_FOR(i, getRegion(region)) {
     const int x = i.x();
     const int y = i.y();
@@ -223,96 +251,112 @@ void XZHermiteSpline::calcWeights(const Field3D& delta_x, const Field3D& delta_z
     h11_x[i] = (t_x * t_x * t_x) - (t_x * t_x);
     h11_z[i] = (t_z * t_z * t_z) - (t_z * t_z);
 
-#if USE_NEW_WEIGHTS
-
-    for (int w = 0; w < 16; ++w) {
-      newWeights[w][i] = 0;
-    }
-    // The distribution of our weights:
-    //  0   4   8    12
-    //  1   5   9    13
-    //  2   6   10   14
-    //  3   7   11   15
-    // e.g. 1 == ic.xm(); 4 == ic.zm(); 5 == ic;  7 == ic.zp(2);
-
-    // f[ic] * h00_x[i] + f[icxp] * h01_x[i] + fx[ic] * h10_x[i] + fx[icxp] * h11_x[i];
-    newWeights[5][i] += h00_x[i] * h00_z[i];
-    newWeights[9][i] += h01_x[i] * h00_z[i];
-    newWeights[9][i] += h10_x[i] * h00_z[i] / 2;
-    newWeights[1][i] -= h10_x[i] * h00_z[i] / 2;
-    newWeights[13][i] += h11_x[i] * h00_z[i] / 2;
-    newWeights[5][i] -= h11_x[i] * h00_z[i] / 2;
-
-    // f[iczp] * h00_x[i] + f[icxpzp] * h01_x[i] +
-    // fx[iczp] * h10_x[i] + fx[icxpzp] * h11_x[i];
-    newWeights[6][i] += h00_x[i] * h01_z[i];
-    newWeights[10][i] += h01_x[i] * h01_z[i];
-    newWeights[10][i] += h10_x[i] * h01_z[i] / 2;
-    newWeights[2][i] -= h10_x[i] * h01_z[i] / 2;
-    newWeights[14][i] += h11_x[i] * h01_z[i] / 2;
-    newWeights[6][i] -= h11_x[i] * h01_z[i] / 2;
-
-    // fz[ic] * h00_x[i] + fz[icxp] * h01_x[i] +
-    // fxz[ic] * h10_x[i]+ fxz[icxp] * h11_x[i];
-    newWeights[6][i] += h00_x[i] * h10_z[i] / 2;
-    newWeights[4][i] -= h00_x[i] * h10_z[i] / 2;
-    newWeights[10][i] += h01_x[i] * h10_z[i] / 2;
-    newWeights[8][i] -= h01_x[i] * h10_z[i] / 2;
-    newWeights[10][i] += h10_x[i] * h10_z[i] / 4;
-    newWeights[8][i] -= h10_x[i] * h10_z[i] / 4;
-    newWeights[2][i] -= h10_x[i] * h10_z[i] / 4;
-    newWeights[0][i] += h10_x[i] * h10_z[i] / 4;
-    newWeights[14][i] += h11_x[i] * h10_z[i] / 4;
-    newWeights[12][i] -= h11_x[i] * h10_z[i] / 4;
-    newWeights[6][i] -= h11_x[i] * h10_z[i] / 4;
-    newWeights[4][i] += h11_x[i] * h10_z[i] / 4;
-
-    // fz[iczp] * h00_x[i] + fz[icxpzp] * h01_x[i] +
-    // fxz[iczp] * h10_x[i] + fxz[icxpzp] * h11_x[i];
-    newWeights[7][i] += h00_x[i] * h11_z[i] / 2;
-    newWeights[5][i] -= h00_x[i] * h11_z[i] / 2;
-    newWeights[11][i] += h01_x[i] * h11_z[i] / 2;
-    newWeights[9][i] -= h01_x[i] * h11_z[i] / 2;
-    newWeights[11][i] += h10_x[i] * h11_z[i] / 4;
-    newWeights[9][i] -= h10_x[i] * h11_z[i] / 4;
-    newWeights[3][i] -= h10_x[i] * h11_z[i] / 4;
-    newWeights[1][i] += h10_x[i] * h11_z[i] / 4;
-    newWeights[15][i] += h11_x[i] * h11_z[i] / 4;
-    newWeights[13][i] -= h11_x[i] * h11_z[i] / 4;
-    newWeights[7][i] -= h11_x[i] * h11_z[i] / 4;
-    newWeights[5][i] += h11_x[i] * h11_z[i] / 4;
-#ifdef HS_USE_PETSC
-    PetscInt idxn[1] = {conv.fromLocalToGlobal(x, y + y_offset, z)};
-    // output.write("debug: {:d} -> {:d}: {:d}:{:d} -> {:d}:{:d}\n",
-    // conv.fromLocalToGlobal(x, y + y_offset, z),
-    //     conv.fromMeshToGlobal(i_corn, y + y_offset, k_corner(x, y, z)),
-    //     x, z, i_corn, k_corner(x, y, z));
-    // ixstep = mesh->LocalNx * mesh->LocalNz;
-    for (int j = 0; j < 4; ++j) {
-      PetscInt idxm[4];
-      PetscScalar vals[4];
-      for (int k = 0; k < 4; ++k) {
-        idxm[k] = conv.fromMeshToGlobal(i_corn - 1 + j, y + y_offset,
-                                        k_corner(x, y, z) - 1 + k);
-        vals[k] = newWeights[j * 4 + k][i];
+    if constexpr (imp_type == implementation_type::new_weights
+                  || imp_type == implementation_type::petsc) {
+      for (int w = 0; w < 16; ++w) {
+        newWeights[w][i] = 0;
       }
-      MatSetValues(petscWeights, 1, idxn, 4, idxm, vals, INSERT_VALUES);
-    }
-#endif
+      // The distribution of our weights:
+      //  0   4   8    12
+      //  1   5   9    13
+      //  2   6   10   14
+      //  3   7   11   15
+      // e.g. 1 == ic.xm(); 4 == ic.zm(); 5 == ic;  7 == ic.zp(2);
+
+      // f[ic] * h00_x[i] + f[icxp] * h01_x[i] + fx[ic] * h10_x[i] + fx[icxp] * h11_x[i];
+      newWeights[5][i] += h00_x[i] * h00_z[i];
+      newWeights[9][i] += h01_x[i] * h00_z[i];
+      newWeights[9][i] += h10_x[i] * h00_z[i] / 2;
+      newWeights[1][i] -= h10_x[i] * h00_z[i] / 2;
+      newWeights[13][i] += h11_x[i] * h00_z[i] / 2;
+      newWeights[5][i] -= h11_x[i] * h00_z[i] / 2;
+
+      // f[iczp] * h00_x[i] + f[icxpzp] * h01_x[i] +
+      // fx[iczp] * h10_x[i] + fx[icxpzp] * h11_x[i];
+      newWeights[6][i] += h00_x[i] * h01_z[i];
+      newWeights[10][i] += h01_x[i] * h01_z[i];
+      newWeights[10][i] += h10_x[i] * h01_z[i] / 2;
+      newWeights[2][i] -= h10_x[i] * h01_z[i] / 2;
+      newWeights[14][i] += h11_x[i] * h01_z[i] / 2;
+      newWeights[6][i] -= h11_x[i] * h01_z[i] / 2;
+
+      // fz[ic] * h00_x[i] + fz[icxp] * h01_x[i] +
+      // fxz[ic] * h10_x[i]+ fxz[icxp] * h11_x[i];
+      newWeights[6][i] += h00_x[i] * h10_z[i] / 2;
+      newWeights[4][i] -= h00_x[i] * h10_z[i] / 2;
+      newWeights[10][i] += h01_x[i] * h10_z[i] / 2;
+      newWeights[8][i] -= h01_x[i] * h10_z[i] / 2;
+      newWeights[10][i] += h10_x[i] * h10_z[i] / 4;
+      newWeights[8][i] -= h10_x[i] * h10_z[i] / 4;
+      newWeights[2][i] -= h10_x[i] * h10_z[i] / 4;
+      newWeights[0][i] += h10_x[i] * h10_z[i] / 4;
+      newWeights[14][i] += h11_x[i] * h10_z[i] / 4;
+      newWeights[12][i] -= h11_x[i] * h10_z[i] / 4;
+      newWeights[6][i] -= h11_x[i] * h10_z[i] / 4;
+      newWeights[4][i] += h11_x[i] * h10_z[i] / 4;
+
+      // fz[iczp] * h00_x[i] + fz[icxpzp] * h01_x[i] +
+      // fxz[iczp] * h10_x[i] + fxz[icxpzp] * h11_x[i];
+      newWeights[7][i] += h00_x[i] * h11_z[i] / 2;
+      newWeights[5][i] -= h00_x[i] * h11_z[i] / 2;
+      newWeights[11][i] += h01_x[i] * h11_z[i] / 2;
+      newWeights[9][i] -= h01_x[i] * h11_z[i] / 2;
+      newWeights[11][i] += h10_x[i] * h11_z[i] / 4;
+      newWeights[9][i] -= h10_x[i] * h11_z[i] / 4;
+      newWeights[3][i] -= h10_x[i] * h11_z[i] / 4;
+      newWeights[1][i] += h10_x[i] * h11_z[i] / 4;
+      newWeights[15][i] += h11_x[i] * h11_z[i] / 4;
+      newWeights[13][i] -= h11_x[i] * h11_z[i] / 4;
+      newWeights[7][i] -= h11_x[i] * h11_z[i] / 4;
+      newWeights[5][i] += h11_x[i] * h11_z[i] / 4;
+      if (imp_type == implementation_type::petsc) {
+#if BOUT_HAS_PETSC
+        PetscInt idxn[1] = {conv.fromLocalToGlobal(x, y + y_offset, z)};
+        // output.write("debug: {:d} -> {:d}: {:d}:{:d} -> {:d}:{:d}\n",
+        // conv.fromLocalToGlobal(x, y + y_offset, z),
+        //     conv.fromMeshToGlobal(i_corn, y + y_offset, k_corner(x, y, z)),
+        //     x, z, i_corn, k_corner(x, y, z));
+        // ixstep = mesh->LocalNx * mesh->LocalNz;
+        for (int j = 0; j < 4; ++j) {
+          PetscInt idxm[4];
+          PetscScalar vals[4];
+          for (int k = 0; k < 4; ++k) {
+            idxm[k] = conv.fromMeshToGlobal(i_corn - 1 + j, y + y_offset,
+                                            k_corner(x, y, z) - 1 + k);
+            vals[k] = newWeights[(j * 4) + k][i];
+          }
+          MatSetValues(petscWeights, 1, idxn, 4, idxm, vals, INSERT_VALUES);
+        }
 #endif
+      }
+    }
+    if constexpr (monotonic) {
+      const auto gind = gf3daccess->xyzglobal(i_corn, y + y_offset + y_global_offset,
+                                              k_corner(x, y, z));
+      gf3daccess->request(gind);
+      gf3daccess->request(gind.xp(1));
+      gf3daccess->request(gind.zp(1));
+      gf3daccess->request(gind.xp(1).zp(1));
+      g3dinds[i] = {gind.ind, gind.xp(1).ind, gind.zp(1).ind, gind.xp(1).zp(1).ind};
+    }
   }
-#ifdef HS_USE_PETSC
-  MatAssemblyBegin(petscWeights, MAT_FINAL_ASSEMBLY);
-  MatAssemblyEnd(petscWeights, MAT_FINAL_ASSEMBLY);
-  if (!isInit) {
-    MatCreateVecs(petscWeights, &rhs, &result);
-  }
-  isInit = true;
+  if constexpr (imp_type == implementation_type::petsc) {
+#if BOUT_HAS_PETSC
+    MatAssemblyBegin(petscWeights, MAT_FINAL_ASSEMBLY);
+    MatAssemblyEnd(petscWeights, MAT_FINAL_ASSEMBLY);
+    if (!isInit) {
+      MatCreateVecs(petscWeights, &rhs, &result);
+    }
+    isInit = true;
 #endif
+  }
 }
 
-void XZHermiteSpline::calcWeights(const Field3D& delta_x, const Field3D& delta_z,
-                                  const BoutMask& mask, const std::string& region) {
+template <bool monotonic, implementation_type imp_type>
+void XZHermiteSplineBase<monotonic, imp_type>::calcWeights(const Field3D& delta_x,
+                                                           const Field3D& delta_z,
+                                                           const BoutMask& mask,
+                                                           const std::string& region) {
   setMask(mask);
   calcWeights(delta_x, delta_z, region);
 }
@@ -333,8 +377,14 @@ void XZHermiteSpline::calcWeights(const Field3D& delta_x, const Field3D& delta_z
  *   (i, j+1, k+1)	h01_z + h10_z / 2
  *   (i, j+1, k+2)	h11_z / 2
  */
+template <bool monotonic, implementation_type imp_type>
 std::vector<ParallelTransform::PositionsAndWeights>
-XZHermiteSpline::getWeightsForYApproximation(int i, int j, int k, int yoffset) {
+XZHermiteSplineBase<monotonic, imp_type>::getWeightsForYApproximation(int i, int j, int k,
+                                                                      int yoffset) {
+  if (localmesh->getNXPE() > 1) {
+    throw BoutException("It is likely that the function calling this is not handling the "
+                        "result correctly.");
+  }
   const int nz = localmesh->LocalNz;
   const int k_mod = k_corner(i, j, k);
   const int k_mod_m1 = (k_mod > 0) ? (k_mod - 1) : (nz - 1);
@@ -347,99 +397,157 @@ XZHermiteSpline::getWeightsForYApproximation(int i, int j, int k, int yoffset) {
           {i, j + yoffset, k_mod_p2, 0.5 * h11_z(i, j, k)}};
 }
 
-Field3D XZHermiteSpline::interpolate(const Field3D& f, const std::string& region) const {
+template <bool monotonic, implementation_type imp_type>
+Field3D XZHermiteSplineBase<monotonic, imp_type>::interpolate(
+    const Field3D& f, [[maybe_unused]] const std::string& region) const {
 
   ASSERT1(f.getMesh() == localmesh);
   Field3D f_interp{emptyFrom(f)};
 
-  const auto region2 =
-      y_offset == 0 ? "RGN_NOY" : fmt::format("RGN_YPAR_{:+d}", y_offset);
-
-#if USE_NEW_WEIGHTS
-#ifdef HS_USE_PETSC
-  BoutReal* ptr;
-  const BoutReal* cptr;
-  VecGetArray(rhs, &ptr);
-  BOUT_FOR(i, f.getRegion("RGN_NOY")) { ptr[int(i)] = f[i]; }
-  VecRestoreArray(rhs, &ptr);
-  MatMult(petscWeights, rhs, result);
-  VecGetArrayRead(result, &cptr);
-  BOUT_FOR(i, f.getRegion(region2)) {
-    f_interp[i] = cptr[int(i)];
-    ASSERT2(std::isfinite(cptr[int(i)]));
+  const auto region2 = y_offset != 0 ? fmt::format("RGN_YPAR_{:+d}", y_offset) : region;
+
+  std::unique_ptr<GlobalField3DAccessInstance> gf;
+  if constexpr (monotonic) {
+    gf = gf3daccess->communicate_asPtr(f);
   }
-  VecRestoreArrayRead(result, &cptr);
-#else
-  BOUT_FOR(i, getRegion(region)) {
-    auto ic = i_corner[i];
-    auto iyp = i.yp(y_offset);
-
-    f_interp[iyp] = 0;
-    for (int w = 0; w < 4; ++w) {
-      f_interp[iyp] += newWeights[w * 4 + 0][i] * f[ic.zm().xp(w - 1)];
-      f_interp[iyp] += newWeights[w * 4 + 1][i] * f[ic.xp(w - 1)];
-      f_interp[iyp] += newWeights[w * 4 + 2][i] * f[ic.zp().xp(w - 1)];
-      f_interp[iyp] += newWeights[w * 4 + 3][i] * f[ic.zp(2).xp(w - 1)];
+
+  if constexpr (imp_type == implementation_type::petsc) {
+#if BOUT_HAS_PETSC
+    BoutReal* ptr = nullptr;
+    const BoutReal* cptr = nullptr;
+    VecGetArray(rhs, &ptr);
+    BOUT_FOR(i, f.getRegion("RGN_NOY")) { ptr[int(i)] = f[i]; }
+    VecRestoreArray(rhs, &ptr);
+    MatMult(petscWeights, rhs, result);
+    VecGetArrayRead(result, &cptr);
+    BOUT_FOR(i, f.getRegion(region2)) {
+      f_interp[i] = cptr[int(i)];
+      if constexpr (monotonic) {
+        const auto iyp = i;
+        const auto i = iyp.ym(y_offset);
+        const auto corners = {(*gf)[IndG3D(g3dinds[i][0])], (*gf)[IndG3D(g3dinds[i][1])],
+                              (*gf)[IndG3D(g3dinds[i][2])], (*gf)[IndG3D(g3dinds[i][3])]};
+        const auto minmax = std::minmax(corners);
+        const auto diff =
+            ((minmax.second - minmax.first) * rel_fac_monotonic) + abs_fac_monotonic;
+        f_interp[iyp] = std::max(f_interp[iyp], minmax.first - diff);
+        f_interp[iyp] = std::min(f_interp[iyp], minmax.second + diff);
+      }
+      ASSERT2(std::isfinite(cptr[int(i)]));
     }
-  }
+    VecRestoreArrayRead(result, &cptr);
 #endif
-#else
-  // Derivatives are used for tension and need to be on dimensionless
-  // coordinates
-
-  // f has been communcated, and thus we can assume that the x-boundaries are
-  // also valid in the y-boundary.  Thus the differentiated field needs no
-  // extra comms.
-  Field3D fx = bout::derivatives::index::DDX(f, CELL_DEFAULT, "DEFAULT", region2);
-  Field3D fz = bout::derivatives::index::DDZ(f, CELL_DEFAULT, "DEFAULT", region2);
-  Field3D fxz = bout::derivatives::index::DDZ(fx, CELL_DEFAULT, "DEFAULT", region2);
-
-  BOUT_FOR(i, getRegion(region)) {
-    const auto iyp = i.yp(y_offset);
-
-    const auto ic = i_corner[i];
-    const auto iczp = ic.zp();
-    const auto icxp = ic.xp();
-    const auto icxpzp = iczp.xp();
-
-    // Interpolate f in X at Z
-    const BoutReal f_z =
-        f[ic] * h00_x[i] + f[icxp] * h01_x[i] + fx[ic] * h10_x[i] + fx[icxp] * h11_x[i];
-
-    // Interpolate f in X at Z+1
-    const BoutReal f_zp1 = f[iczp] * h00_x[i] + f[icxpzp] * h01_x[i] + fx[iczp] * h10_x[i]
-                           + fx[icxpzp] * h11_x[i];
-
-    // Interpolate fz in X at Z
-    const BoutReal fz_z = fz[ic] * h00_x[i] + fz[icxp] * h01_x[i] + fxz[ic] * h10_x[i]
-                          + fxz[icxp] * h11_x[i];
-
-    // Interpolate fz in X at Z+1
-    const BoutReal fz_zp1 = fz[iczp] * h00_x[i] + fz[icxpzp] * h01_x[i]
-                            + fxz[iczp] * h10_x[i] + fxz[icxpzp] * h11_x[i];
-
-    // Interpolate in Z
-    f_interp[iyp] =
-        +f_z * h00_z[i] + f_zp1 * h01_z[i] + fz_z * h10_z[i] + fz_zp1 * h11_z[i];
+  }
 
-    ASSERT2(std::isfinite(f_interp[iyp]) || i.x() < localmesh->xstart
-            || i.x() > localmesh->xend);
+  if constexpr (imp_type == implementation_type::new_weights) {
+    BOUT_FOR(i, getRegion(region)) {
+      auto ic = i_corner[i];
+      auto iyp = i.yp(y_offset);
+
+      f_interp[iyp] = 0;
+      for (int w = 0; w < 4; ++w) {
+        f_interp[iyp] += newWeights[(w * 4) + 0][i] * f[ic.zm().xp(w - 1)];
+        f_interp[iyp] += newWeights[(w * 4) + 1][i] * f[ic.xp(w - 1)];
+        f_interp[iyp] += newWeights[(w * 4) + 2][i] * f[ic.zp().xp(w - 1)];
+        f_interp[iyp] += newWeights[(w * 4) + 3][i] * f[ic.zp(2).xp(w - 1)];
+      }
+      if constexpr (monotonic) {
+        const auto corners = {(*gf)[IndG3D(g3dinds[i][0])], (*gf)[IndG3D(g3dinds[i][1])],
+                              (*gf)[IndG3D(g3dinds[i][2])], (*gf)[IndG3D(g3dinds[i][3])]};
+        const auto minmax = std::minmax(corners);
+        const auto diff =
+            ((minmax.second - minmax.first) * rel_fac_monotonic) + abs_fac_monotonic;
+        f_interp[iyp] = std::max(f_interp[iyp], minmax.first - diff);
+        f_interp[iyp] = std::min(f_interp[iyp], minmax.second + diff);
+      }
+      ASSERT2(std::isfinite(f_interp[iyp]));
+    }
+  }
+  if constexpr (imp_type == implementation_type::legacy) {
+    // Legacy interpolation
+    // TODO(peter): Should we apply dirichlet BCs to derivatives?
+    // Derivatives are used for tension and need to be on dimensionless
+    // coordinates
+
+    // f has been communcated, and thus we can assume that the x-boundaries are
+    // also valid in the y-boundary.  Thus the differentiated field needs no
+    // extra comms.
+    // TODO(dave) Add assert that we do not use z-splitting or z-guards.
+    Field3D fx = bout::derivatives::index::DDX(f, CELL_DEFAULT, "DEFAULT", region2);
+    Field3D fz = bout::derivatives::index::DDZ(f, CELL_DEFAULT, "DEFAULT", region2);
+    Field3D fxz = bout::derivatives::index::DDZ(fx, CELL_DEFAULT, "DEFAULT", region2);
+
+    BOUT_FOR(i, getRegion(region)) {
+      const auto iyp = i.yp(y_offset);
+
+      const auto ic = i_corner[i];
+      const auto iczp = ic.zp();
+      const auto icxp = ic.xp();
+      const auto icxpzp = iczp.xp();
+
+      // Interpolate f in X at Z
+      const BoutReal f_z =
+          f[ic] * h00_x[i] + f[icxp] * h01_x[i] + fx[ic] * h10_x[i] + fx[icxp] * h11_x[i];
+
+      // Interpolate f in X at Z+1
+      const BoutReal f_zp1 = f[iczp] * h00_x[i] + f[icxpzp] * h01_x[i]
+                             + fx[iczp] * h10_x[i] + fx[icxpzp] * h11_x[i];
+
+      // Interpolate fz in X at Z
+      const BoutReal fz_z = fz[ic] * h00_x[i] + fz[icxp] * h01_x[i] + fxz[ic] * h10_x[i]
+                            + fxz[icxp] * h11_x[i];
+
+      // Interpolate fz in X at Z+1
+      const BoutReal fz_zp1 = fz[iczp] * h00_x[i] + fz[icxpzp] * h01_x[i]
+                              + fxz[iczp] * h10_x[i] + fxz[icxpzp] * h11_x[i];
+
+      // Interpolate in Z
+      f_interp[iyp] =
+          +f_z * h00_z[i] + f_zp1 * h01_z[i] + fz_z * h10_z[i] + fz_zp1 * h11_z[i];
+
+      if constexpr (monotonic) {
+        const auto corners = {(*gf)[IndG3D(g3dinds[i][0])], (*gf)[IndG3D(g3dinds[i][1])],
+                              (*gf)[IndG3D(g3dinds[i][2])], (*gf)[IndG3D(g3dinds[i][3])]};
+        const auto minmax = std::minmax(corners);
+        const auto diff =
+            ((minmax.second - minmax.first) * rel_fac_monotonic) + abs_fac_monotonic;
+        f_interp[iyp] = std::max(f_interp[iyp], minmax.first - diff);
+        f_interp[iyp] = std::min(f_interp[iyp], minmax.second + diff);
+      }
+      ASSERT2(std::isfinite(f_interp[iyp]) || i.x() < localmesh->xstart
+              || i.x() > localmesh->xend);
+    }
   }
-#endif
   f_interp.setRegion(region2);
   ASSERT2(f_interp.getRegionID());
   return f_interp;
 }
 
-Field3D XZHermiteSpline::interpolate(const Field3D& f, const Field3D& delta_x,
-                                     const Field3D& delta_z, const std::string& region) {
+template <bool monotonic, implementation_type imp_type>
+Field3D XZHermiteSplineBase<monotonic, imp_type>::interpolate(const Field3D& f,
+                                                              const Field3D& delta_x,
+                                                              const Field3D& delta_z,
+                                                              const std::string& region) {
   calcWeights(delta_x, delta_z, region);
   return interpolate(f, region);
 }
 
-Field3D XZHermiteSpline::interpolate(const Field3D& f, const Field3D& delta_x,
-                                     const Field3D& delta_z, const BoutMask& mask,
-                                     const std::string& region) {
+template <bool monotonic, implementation_type imp_type>
+Field3D XZHermiteSplineBase<monotonic, imp_type>::interpolate(const Field3D& f,
+                                                              const Field3D& delta_x,
+                                                              const Field3D& delta_z,
+                                                              const BoutMask& mask,
+                                                              const std::string& region) {
   calcWeights(delta_x, delta_z, mask, region);
   return interpolate(f, region);
 }
+
+// ensure they are instantiated
+template class XZHermiteSplineBase<true, implementation_type::new_weights>;
+template class XZHermiteSplineBase<false, implementation_type::new_weights>;
+template class XZHermiteSplineBase<true, implementation_type::legacy>;
+template class XZHermiteSplineBase<false, implementation_type::legacy>;
+#if BOUT_HAS_PETSC
+template class XZHermiteSplineBase<true, implementation_type::petsc>;
+template class XZHermiteSplineBase<false, implementation_type::petsc>;
+#endif
diff --git a/src/mesh/interpolation/lagrange_4pt_xz.cxx b/src/mesh/interpolation/lagrange_4pt_xz.cxx
index e16b2699d1..c82c9fa36e 100644
--- a/src/mesh/interpolation/lagrange_4pt_xz.cxx
+++ b/src/mesh/interpolation/lagrange_4pt_xz.cxx
@@ -20,6 +20,7 @@
  *
  **************************************************************************/
 
+#include "fmt/format.h"
 #include "bout/globals.hxx"
 #include "bout/interpolation_xz.hxx"
 #include "bout/mesh.hxx"
@@ -133,7 +134,10 @@ Field3D XZLagrange4pt::interpolate(const Field3D& f, const std::string& region)
 
     // Then in X
     f_interp(x, y_next, z) = lagrange_4pt(xvals, t_x(x, y, z));
+    ASSERT2(std::isfinite(f_interp(x, y_next, z)));
   }
+  const auto region2 = y_offset != 0 ? fmt::format("RGN_YPAR_{:+d}", y_offset) : region;
+  f_interp.setRegion(region2);
   return f_interp;
 }
 
diff --git a/src/mesh/interpolation/monotonic_hermite_spline_xz.cxx b/src/mesh/interpolation/monotonic_hermite_spline_xz.cxx
deleted file mode 100644
index f206ed1e0f..0000000000
--- a/src/mesh/interpolation/monotonic_hermite_spline_xz.cxx
+++ /dev/null
@@ -1,98 +0,0 @@
-/**************************************************************************
- * Copyright 2018 B.D.Dudson, P. Hill
- *
- * Contact: Ben Dudson, bd512@york.ac.uk
- *
- * This file is part of BOUT++.
- *
- * BOUT++ is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * BOUT++ is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with BOUT++.  If not, see <http://www.gnu.org/licenses/>.
- *
- **************************************************************************/
-
-#include "bout/globals.hxx"
-#include "bout/index_derivs_interface.hxx"
-#include "bout/interpolation_xz.hxx"
-#include "bout/mesh.hxx"
-
-#include <algorithm>
-
-Field3D XZMonotonicHermiteSpline::interpolate(const Field3D& f,
-                                              const std::string& region) const {
-  ASSERT1(f.getMesh() == localmesh);
-  Field3D f_interp(f.getMesh());
-  f_interp.allocate();
-
-  // Derivatives are used for tension and need to be on dimensionless
-  // coordinates
-  Field3D fx = bout::derivatives::index::DDX(f, CELL_DEFAULT, "DEFAULT");
-  Field3D fz = bout::derivatives::index::DDZ(f, CELL_DEFAULT, "DEFAULT", "RGN_ALL");
-  localmesh->communicate_no_slices(fx, fz);
-  Field3D fxz = bout::derivatives::index::DDX(fz, CELL_DEFAULT, "DEFAULT");
-  localmesh->communicate_no_slices(fxz);
-
-  const auto curregion{getRegion(region)};
-  BOUT_FOR(i, curregion) {
-    const auto iyp = i.yp(y_offset);
-
-    const auto ic = i_corner[i];
-    const auto iczp = ic.zp();
-    const auto icxp = ic.xp();
-    const auto icxpzp = iczp.xp();
-
-    // Interpolate f in X at Z
-    const BoutReal f_z =
-        f[ic] * h00_x[i] + f[icxp] * h01_x[i] + fx[ic] * h10_x[i] + fx[icxp] * h11_x[i];
-
-    // Interpolate f in X at Z+1
-    const BoutReal f_zp1 = f[iczp] * h00_x[i] + f[icxpzp] * h01_x[i] + fx[iczp] * h10_x[i]
-                           + fx[icxpzp] * h11_x[i];
-
-    // Interpolate fz in X at Z
-    const BoutReal fz_z = fz[ic] * h00_x[i] + fz[icxp] * h01_x[i] + fxz[ic] * h10_x[i]
-                          + fxz[icxp] * h11_x[i];
-
-    // Interpolate fz in X at Z+1
-    const BoutReal fz_zp1 = fz[iczp] * h00_x[i] + fz[icxpzp] * h01_x[i]
-                            + fxz[iczp] * h10_x[i] + fxz[icxpzp] * h11_x[i];
-
-    // Interpolate in Z
-    BoutReal result =
-        +f_z * h00_z[i] + f_zp1 * h01_z[i] + fz_z * h10_z[i] + fz_zp1 * h11_z[i];
-
-    ASSERT2(std::isfinite(result) || i.x() < localmesh->xstart
-            || i.x() > localmesh->xend);
-
-    // Monotonicity
-    // Force the interpolated result to be in the range of the
-    // neighbouring cell values. This prevents unphysical overshoots,
-    // but also degrades accuracy near maxima and minima.
-    // Perhaps should only impose near boundaries, since that is where
-    // problems most obviously occur.
-    const BoutReal localmax = BOUTMAX(f[ic], f[icxp], f[iczp], f[icxpzp]);
-    const BoutReal localmin = BOUTMIN(f[ic], f[icxp], f[iczp], f[icxpzp]);
-
-    ASSERT2(std::isfinite(localmax) || i.x() < localmesh->xstart
-            || i.x() > localmesh->xend);
-    ASSERT2(std::isfinite(localmin) || i.x() < localmesh->xstart
-            || i.x() > localmesh->xend);
-
-    const auto diff = ((localmax - localmin) * rtol) + atol;
-
-    result = std::min(result, localmax + diff);
-    result = std::max(result, localmin - diff);
-
-    f_interp[iyp] = result;
-  }
-  return f_interp;
-}
diff --git a/src/mesh/interpolation_xz.cxx b/src/mesh/interpolation_xz.cxx
index a58554dc82..ec8bcc0502 100644
--- a/src/mesh/interpolation_xz.cxx
+++ b/src/mesh/interpolation_xz.cxx
@@ -23,6 +23,7 @@
  *
  **************************************************************************/
 
+#include "parallel/fci_comm.hxx"
 #include <bout/globals.hxx>
 #include <bout/interpolation_xz.hxx>
 #include <bout/output.hxx>
@@ -86,6 +87,14 @@ namespace {
 RegisterXZInterpolation<XZHermiteSpline> registerinterphermitespline{"hermitespline"};
 RegisterXZInterpolation<XZMonotonicHermiteSpline> registerinterpmonotonichermitespline{
     "monotonichermitespline"};
+RegisterXZInterpolation<XZHermiteSplineSerial> registerinterphermitesplines{
+    "hermitesplineserial"};
+RegisterXZInterpolation<XZMonotonicHermiteSplineSerial>
+    registerinterpmonotonichermitesplines{"monotonichermitesplineserial"};
+RegisterXZInterpolation<XZHermiteSplineLegacy> registerinterphermitesplinel{
+    "hermitesplinelegacy"};
+RegisterXZInterpolation<XZMonotonicHermiteSplineLegacy>
+    registerinterpmonotonichermitesplinel{"monotonichermitesplinelegacy"};
 RegisterXZInterpolation<XZLagrange4pt> registerinterplagrange4pt{"lagrange4pt"};
 RegisterXZInterpolation<XZBilinear> registerinterpbilinear{"bilinear"};
 } // namespace
diff --git a/src/mesh/parallel/fci_comm.cxx b/src/mesh/parallel/fci_comm.cxx
new file mode 100644
index 0000000000..3d213ad6e4
--- /dev/null
+++ b/src/mesh/parallel/fci_comm.cxx
@@ -0,0 +1,224 @@
+/**************************************************************************
+ * Communication for Flux-coordinate Independent interpolation
+ *
+ **************************************************************************
+ * Copyright 2025 BOUT++ contributors
+ *
+ * Contact: Ben Dudson, dudson2@llnl.gov
+ *
+ * This file is part of BOUT++.
+ *
+ * BOUT++ is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * BOUT++ is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with BOUT++.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ **************************************************************************/
+
+#include "fci_comm.hxx"
+#include "bout/assert.hxx"
+#include "bout/bout_types.hxx"
+#include "bout/field3d.hxx"
+#include "bout/region.hxx"
+
+#include <algorithm>
+#include <cstddef>
+#include <mpi.h>
+#include <vector>
+
+fci_comm::ProcLocal fci_comm::GlobalToLocal1D::convert(int id) const {
+  if (periodic) {
+    while (id < mg) {
+      id += global;
+    }
+    while (id >= global + mg) {
+      id -= global;
+    }
+  }
+  const int idwo = id - mg;
+  int proc = idwo / local;
+  if (not periodic) {
+    if (proc >= npe) {
+      proc = npe - 1;
+    }
+  }
+  const int loc = id - (local * proc);
+#if CHECK > 1
+  ASSERT1(loc >= 0);
+  ASSERT1(loc <= localwith);
+  ASSERT1(proc >= 0);
+  ASSERT1(proc < npe);
+  if (periodic and (loc < mg or loc >= local + mg)) {
+    throw BoutException(
+        "GlobalToLocal1D failure - expected {} < {} < {} because we are periodic\n", mg,
+        loc, local + mg);
+  }
+#endif
+  return {proc, loc};
+}
+
+void GlobalField3DAccess::setup() {
+  // We need to send a list of data to every processor of which data
+  // we want We also get a list of all the data that every other
+  // processor wants Some of theses lists may be empty. We still need
+  // to send them, later we can skip that.  We also compute where the
+  // requested data will be stored later on. This is currently
+  // implemented as a map, to memory efficient, as the data is
+  // sparse. We could also store the mapping as a dense array, if it
+  // turns out this lookup is not fast enough, but that may limit
+  // scaling at some point.
+  ASSERT2(is_setup == false);
+#ifdef _OPENMP
+  for (auto& o_id : openmp_ids) {
+    ids.merge(o_id);
+  }
+  openmp_ids.clear();
+#endif
+  toGet.resize(static_cast<size_t>(global2local_x.getNPE() * global2local_y.getNPE()
+                                   * global2local_z.getNPE()));
+  for (const auto id : ids) {
+    const IndG3D gind{id, global2local_y.getGlobalWith(), global2local_z.getGlobalWith()};
+    const auto pix = global2local_x.convert(gind.x());
+    const auto piy = global2local_y.convert(gind.y());
+    const auto piz = global2local_z.convert(gind.z());
+    ASSERT3(piz.proc == 0);
+    toGet[mesh->getProcIndex(pix.proc, piy.proc, piz.proc)].push_back(
+        xyzlocal.convert(pix.index, piy.index, piz.index).ind);
+  }
+  for (auto& v : toGet) {
+    std::sort(v.begin(), v.end());
+  }
+  commCommLists();
+  {
+    int offset = 0;
+    for (const auto& get : toGet) {
+      getOffsets.push_back(offset);
+      offset += get.size();
+    }
+    getOffsets.push_back(offset);
+  }
+  for (const auto id : ids) {
+    const IndG3D gind{id, global2local_y.getGlobalWith(), global2local_z.getGlobalWith()};
+    const auto pix = global2local_x.convert(gind.x());
+    const auto piy = global2local_y.convert(gind.y());
+    const auto piz = global2local_z.convert(gind.z());
+    ASSERT3(piz.proc == 0);
+    const auto proc = mesh->getProcIndex(pix.proc, piy.proc, piz.proc);
+    const auto& vec = toGet[proc];
+    const auto tofind = xyzlocal.convert(pix.index, piy.index, piz.index).ind;
+    auto it = std::lower_bound(vec.begin(), vec.end(), tofind);
+    ASSERT3(it != vec.end());
+    ASSERT3(*it == tofind);
+    mapping[id] = std::distance(vec.begin(), it) + getOffsets[proc];
+  }
+  is_setup = true;
+}
+
+void GlobalField3DAccess::commCommLists() {
+  toSend.resize(toGet.size());
+  std::vector<int> toGetSizes(toGet.size(), -1);
+  std::vector<int> toSendSizes(toSend.size(), -1);
+#if CHECK > 3
+  {
+    int thisproc;
+    MPI_Comm_rank(comm, &thisproc);
+    ASSERT0(thisproc
+            == mesh->getProcIndex(mesh->getXProcIndex(), mesh->getYProcIndex(),
+                                  mesh->getZProcIndex()));
+  }
+#endif
+  std::vector<MPI_Request> reqs(toSend.size());
+  for (size_t proc = 0; proc < toGet.size(); ++proc) {
+    auto ret = MPI_Irecv(&toSendSizes[proc], 1, MPI_INT, proc, 666, comm, &reqs[proc]);
+    ASSERT0(ret == MPI_SUCCESS);
+  }
+  for (size_t proc = 0; proc < toGet.size(); ++proc) {
+    toGetSizes[proc] = toGet[proc].size();
+    auto ret = MPI_Send(&toGetSizes[proc], 1, MPI_INT, proc, 666, comm);
+    ASSERT0(ret == MPI_SUCCESS);
+  }
+  std::vector<MPI_Request> reqs2(toSend.size());
+  int cnt = 0;
+  for ([[maybe_unused]] auto dummy : reqs) {
+    int ind{0};
+    auto ret = MPI_Waitany(reqs.size(), reqs.data(), &ind, MPI_STATUS_IGNORE);
+    ASSERT0(ret == MPI_SUCCESS);
+    ASSERT3(ind != MPI_UNDEFINED);
+    ASSERT2(static_cast<size_t>(ind) < toSend.size());
+    ASSERT3(toSendSizes[ind] >= 0);
+    if (toSendSizes[ind] == 0) {
+      continue;
+    }
+    sendBufferSize += toSendSizes[ind];
+    toSend[ind].resize(toSendSizes[ind], -1);
+
+    ret = MPI_Irecv(toSend[ind].data(), toSend[ind].size(), MPI_INT, ind, 666 * 666, comm,
+                    reqs2.data() + cnt++);
+    ASSERT0(ret == MPI_SUCCESS);
+  }
+  for (size_t proc = 0; proc < toGet.size(); ++proc) {
+    if (!toGet[proc].empty()) {
+      const auto ret = MPI_Send(toGet[proc].data(), toGet[proc].size(), MPI_INT, proc,
+                                666 * 666, comm);
+      ASSERT0(ret == MPI_SUCCESS);
+    }
+  }
+  for (int c = 0; c < cnt; c++) {
+    int ind{0};
+    const auto ret = MPI_Waitany(cnt, reqs2.data(), &ind, MPI_STATUS_IGNORE);
+    ASSERT0(ret == MPI_SUCCESS);
+    ASSERT3(ind != MPI_UNDEFINED);
+  }
+}
+
+std::vector<BoutReal> GlobalField3DAccess::communicate_data(const Field3D& f) {
+  // Ensure setup is called, to setup communication pattern
+  if (not is_setup) {
+    setup();
+  }
+  // We now send the previosly requested data to every processor, that wanted some.
+  // We also get the data we requested.
+  ASSERT2(f.getMesh() == mesh);
+  std::vector<BoutReal> data(getOffsets.back());
+  std::vector<BoutReal> sendBuffer(sendBufferSize);
+  std::vector<MPI_Request> reqs(toSend.size());
+  int cnt1 = 0;
+  for (size_t proc = 0; proc < toGet.size(); ++proc) {
+    if (toGet[proc].empty()) {
+      continue;
+    }
+    auto ret = MPI_Irecv(data.data() + getOffsets[proc], toGet[proc].size(), MPI_DOUBLE,
+                         proc, 666, comm, reqs.data() + cnt1);
+    ASSERT0(ret == MPI_SUCCESS);
+    cnt1++;
+  }
+  int cnt = 0;
+  for (size_t proc = 0; proc < toGet.size(); ++proc) {
+    if (toSend[proc].empty()) {
+      continue;
+    }
+    const void* start = sendBuffer.data() + cnt;
+    for (auto i : toSend[proc]) {
+      sendBuffer[cnt++] = f[Ind3D(i)];
+    }
+    auto ret = MPI_Send(start, toSend[proc].size(), MPI_DOUBLE, proc, 666, comm);
+    ASSERT0(ret == MPI_SUCCESS);
+  }
+  for (int j = 0; j < cnt1; ++j) {
+    int ind{0};
+    auto ret = MPI_Waitany(cnt1, reqs.data(), &ind, MPI_STATUS_IGNORE);
+    ASSERT0(ret == MPI_SUCCESS);
+    ASSERT3(ind != MPI_UNDEFINED);
+    ASSERT3(ind >= 0);
+    ASSERT3(ind < cnt1);
+  }
+  return data;
+}
diff --git a/src/mesh/parallel/fci_comm.hxx b/src/mesh/parallel/fci_comm.hxx
new file mode 100644
index 0000000000..324cae8a22
--- /dev/null
+++ b/src/mesh/parallel/fci_comm.hxx
@@ -0,0 +1,187 @@
+/**************************************************************************
+ * Communication for Flux-coordinate Independent interpolation
+ *
+ **************************************************************************
+ * Copyright 2025 BOUT++ contributors
+ *
+ * Contact: Ben Dudson, dudson2@llnl.gov
+ *
+ * This file is part of BOUT++.
+ *
+ * BOUT++ is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * BOUT++ is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with BOUT++.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ **************************************************************************/
+
+#pragma once
+
+#include "bout/assert.hxx"
+#include "bout/bout_types.hxx"
+#include "bout/boutcomm.hxx"
+#include "bout/field3d.hxx"
+#include "bout/mesh.hxx"
+#include "bout/region.hxx"
+#include <map>
+#include <memory>
+#include <set>
+#include <stddef.h>
+#include <utility>
+#include <vector>
+
+/// GlobalField3DAccess is a class to set up the communication
+/// patterns, to request abitrary data form the global
+/// field. GlobalField3DAccessInstance is an instance. after
+/// GlobalField3DAccess has communicated the pre-requested data. Only
+/// data that has been pre-requested can be requested from
+/// GlobalField3DAccessInstance after communication.
+///
+/// The usage looks a bit like this:
+///
+/// GlobalField3DAccess gfa;
+/// // Request an abitrary number of global indices ``gi``:
+/// gfa.request(gi)
+/// // Communicate data
+/// const auto data = gfa.communicate(f3d);
+/// // Now data can be accesssed for all previously requested ``gi``s:
+/// data[gi]
+class GlobalField3DAccess;
+
+namespace fci_comm {
+struct ProcLocal {
+  int proc;
+  int index;
+};
+
+/// Class to convert global to local indices for 1D
+/// given the global index, it returns the local index and the processor.
+struct GlobalToLocal1D {
+  GlobalToLocal1D(int mg, int npe, int localwith, bool periodic)
+      : mg(mg), npe(npe), localwith(localwith), local(localwith - (2 * mg)),
+        global(local * npe), globalwith(global + (2 * mg)), periodic(periodic){};
+  ProcLocal convert(int id) const;
+  int getLocalWith() const { return localwith; }
+  int getGlobalWith() const { return globalwith; }
+  int getNPE() const { return npe; }
+
+private:
+  int mg;
+  int npe;
+  int localwith;
+  int local;
+  int global;
+  int globalwith;
+  bool periodic;
+};
+
+/// Convert an x-y-z tupple to an Ind
+template <class ind>
+struct XYZ2Ind {
+  XYZ2Ind(const int nx, const int ny, const int nz) : nx(nx), ny(ny), nz(nz) {}
+  ind convert(const int x, const int y, const int z) const {
+    return {z + ((y + x * ny) * nz), ny, nz};
+  }
+  ind operator()(const int x, const int y, const int z) const { return convert(x, y, z); }
+
+private:
+  int nx;
+  int ny;
+  int nz;
+};
+} // namespace fci_comm
+
+class GlobalField3DAccessInstance {
+public:
+  const BoutReal& operator[](IndG3D ind) const;
+  GlobalField3DAccessInstance(const GlobalField3DAccess* gfa,
+                              std::vector<BoutReal>&& data)
+      : gfa(gfa), data(std::move(data)){};
+
+private:
+  const GlobalField3DAccess* gfa;
+  std::vector<BoutReal> data;
+};
+
+class GlobalField3DAccess {
+public:
+  friend class GlobalField3DAccessInstance;
+  GlobalField3DAccess(Mesh* mesh)
+      : mesh(mesh),
+        global2local_x(mesh->xstart, mesh->getNXPE(), mesh->LocalNx, mesh->periodicX),
+        global2local_y(mesh->ystart, mesh->getNYPE(), mesh->LocalNy, true),
+        global2local_z(mesh->zstart, mesh->getNZPE(), mesh->LocalNz, true),
+        xyzlocal(global2local_x.getLocalWith(), global2local_y.getLocalWith(),
+                 global2local_z.getLocalWith()),
+        xyzglobal(global2local_x.getGlobalWith(), global2local_y.getGlobalWith(),
+                  global2local_z.getGlobalWith()),
+        comm(BoutComm::get()) {
+#ifdef _OPENMP
+    openmp_ids.resize(omp_get_max_threads());
+#endif
+#if CHECK >= 2
+    // We could also allow false, but then we would need to ensure it
+    // is false everywhere.
+    for (int x = 0; x < mesh->LocalNx; ++x) {
+      ASSERT2(mesh->periodicY(x) == true);
+    }
+#endif
+  };
+  void request(IndG3D ind) {
+    ASSERT2(is_setup == false);
+#ifdef _OPENMP
+    ASSERT2(openmp_ids.size() > static_cast<size_t>(omp_get_thread_num()));
+    openmp_ids[omp_get_thread_num()].emplace(ind.ind);
+#else
+    ids.emplace(ind.ind);
+#endif
+  }
+
+  GlobalField3DAccessInstance communicate(const Field3D& f) {
+    return {this, communicate_data(f)};
+  }
+  std::unique_ptr<GlobalField3DAccessInstance> communicate_asPtr(const Field3D& f) {
+    return std::make_unique<GlobalField3DAccessInstance>(this, communicate_data(f));
+  }
+
+private:
+  void setup();
+  void commCommLists();
+  Mesh* mesh;
+#ifdef _OPENMP
+  // openmp thread-local variable
+  std::vector<std::set<int>> openmp_ids;
+#endif
+  std::set<int> ids;
+  std::map<int, int> mapping;
+  bool is_setup{false};
+  fci_comm::GlobalToLocal1D global2local_x;
+  fci_comm::GlobalToLocal1D global2local_y;
+  fci_comm::GlobalToLocal1D global2local_z;
+
+public:
+  fci_comm::XYZ2Ind<Ind3D> xyzlocal;
+  fci_comm::XYZ2Ind<IndG3D> xyzglobal;
+
+private:
+  std::vector<std::vector<int>> toGet;
+  std::vector<std::vector<int>> toSend;
+  std::vector<int> getOffsets;
+  int sendBufferSize{0};
+  MPI_Comm comm;
+  std::vector<BoutReal> communicate_data(const Field3D& f);
+};
+
+inline const BoutReal& GlobalField3DAccessInstance::operator[](IndG3D ind) const {
+  auto it = gfa->mapping.find(ind.ind);
+  ASSERT2(it != gfa->mapping.end());
+  return data[it->second];
+}
diff --git a/tests/MMS/spatial/fci/runtest b/tests/MMS/spatial/fci/runtest
index 34340e53f4..c76ad27833 100755
--- a/tests/MMS/spatial/fci/runtest
+++ b/tests/MMS/spatial/fci/runtest
@@ -17,7 +17,6 @@ import zoidberg as zb
 from boutdata.collect import collect
 from boututils.run_wrapper import build_and_log, launch_safe
 from numpy import arange, array, linspace, log, polyfit
-from scipy.interpolate import RectBivariateSpline as RBS
 
 # Global parameters
 DIRECTORY = "data"
@@ -32,15 +31,16 @@ NLIST = [8, 16, 32, 64]
 dx = 1.0 / array(NLIST)
 
 
-def myRBS(a, b, c):
-    """RectBivariateSpline, but automatically tune spline degree for small arrays"""
-    mx, _ = c.shape
-    kx = max(mx - 1, 1)
-    kx = min(kx, 3)
-    return RBS(a, b, c, kx=kx)
+success = True
 
+error_2 = {}
+error_inf = {}
+method_orders = {}
 
-zb.poloidal_grid.RectBivariateSpline = myRBS
+# Run with periodic Y?
+yperiodic = True
+
+build_and_log("FCI MMS test")
 
 
 def quiet_collect(name: str) -> float:
diff --git a/tests/integrated/test-fci-mpi/runtest b/tests/integrated/test-fci-mpi/runtest
index c18ab0391d..9962567217 100755
--- a/tests/integrated/test-fci-mpi/runtest
+++ b/tests/integrated/test-fci-mpi/runtest
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 #
-# Python script to run and analyse MMS test
+# Python script to run and analyse MPI test
 
 from boututils.run_wrapper import build_and_log, launch_safe
 from boutdata.collect import collect
@@ -14,13 +14,13 @@ NLIST = [1, 2, 4]
 MAXCORES = 8
 NSLICES = [1]
 
-build_and_log("FCI MMS test")
+build_and_log("FCI MPI test")
 
 COLLECT_KW = dict(info=False, xguards=False, yguards=False, path="data")
 
 
 def run_case(nxpe: int, nype: int, mthread: int):
-    cmd = f"./fci_mpi NXPE={nxpe} NYPE={nype}"
+    cmd = f"./fci_mpi NXPE={nxpe} NYPE={nype} mesh:paralleltransform:xzinterpolation:type={implementation}"
     print(f"Running command: {cmd}")
 
     _, out = launch_safe(cmd, nproc=nxpe * nype, mthread=mthread, pipe=True)
@@ -46,28 +46,29 @@ def test_case(nxpe: int, nype: int, mthread: int, ref: dict) -> bool:
 
 failures = []
 
-for nslice in NSLICES:
-    # reference data!
-    run_case(1, 1, MAXCORES)
-
-    ref = {}
-    for i in range(4):
-        for yp in range(1, nslice + 1):
-            for y in [-yp, yp]:
-                name = f"output_{i}_{y:+d}"
-                ref[name] = collect(name, **COLLECT_KW)
-
-    for nxpe, nype in itertools.product(NLIST, NLIST):
-        if (nxpe, nype) == (1, 1):
-            # reference case, done above
-            continue
-
-        if nxpe * nype > MAXCORES:
-            continue
-
-        mthread = MAXCORES // (nxpe * nype)
-        failures_ = test_case(nxpe, nype, mthread, ref)
-        failures.extend(failures_)
+for implementation in ["hermitespline", "monotonichermitespline"]:
+    for nslice in NSLICES:
+        # reference data!
+        run_case(1, 1, MAXCORES)
+
+        ref = {}
+        for i in range(4):
+            for yp in range(1, nslice + 1):
+                for y in [-yp, yp]:
+                    name = f"output_{i}_{y:+d}"
+                    ref[name] = collect(name, **COLLECT_KW)
+
+        for nxpe, nype in itertools.product(NLIST, NLIST):
+            if (nxpe, nype) == (1, 1):
+                # reference case, done above
+                continue
+
+            if nxpe * nype > MAXCORES:
+                continue
+
+            mthread = MAXCORES // (nxpe * nype)
+            failures_ = test_case(nxpe, nype, mthread, ref)
+            failures.extend(failures_)
 
 
 success = len(failures) == 0
diff --git a/tests/integrated/test-interpolate/runtest b/tests/integrated/test-interpolate/runtest
index f5460aff2a..91503f5f9e 100755
--- a/tests/integrated/test-interpolate/runtest
+++ b/tests/integrated/test-interpolate/runtest
@@ -23,6 +23,8 @@ labels = [r"$" + var + r"$" for var in varlist]
 
 methods = {
     "hermitespline": 3,
+    "hermitesplinelegacy": 3,
+    "hermitesplineserial": 3,
     "lagrange4pt": 3,
     "bilinear": 2,
 }
diff --git a/tests/unit/fake_mesh.hxx b/tests/unit/fake_mesh.hxx
index 4656ee0282..cb7509ebde 100644
--- a/tests/unit/fake_mesh.hxx
+++ b/tests/unit/fake_mesh.hxx
@@ -113,9 +113,13 @@ public:
   int getNXPE() const override { return 1; }
   int getNYPE() const override { return 1; }
   int getNZPE() const override { return 1; }
-  int getXProcIndex() const override { return 1; }
-  int getYProcIndex() const override { return 1; }
-  int getZProcIndex() const override { return 1; }
+  int getXProcIndex() const override { return 0; }
+  int getYProcIndex() const override { return 0; }
+  int getZProcIndex() const override { return 0; }
+  int getProcIndex([[maybe_unused]] int X, [[maybe_unused]] int Y,
+                   [[maybe_unused]] int Z) const override {
+    return 0;
+  }
   bool firstX() const override { return true; }
   bool lastX() const override { return true; }
   int sendXOut(BoutReal* UNUSED(buffer), int UNUSED(size), int UNUSED(tag)) override {