From 88986969227dba4bba8f9065e36bb01115f8b71f Mon Sep 17 00:00:00 2001
From: Oliver Hennigh <oliverhennigh@Olivers-MacBook-Air.local>
Date: Wed, 6 May 2026 12:06:21 -0700
Subject: [PATCH] Add finite-volume Euler functionals example

---
 .gitignore                                    |    1 +
 CHANGELOG.md                                  |    2 +
 examples/README.md                            |    6 +
 .../functionals/finite_volume_euler/README.md |   29 +
 .../finite_volume_euler/conf/config.yaml      |   64 +
 .../euler_finite_volume.py                    | 1602 +++++++++++++++++
 .../finite_volume_euler/euler_solver.py       |  429 +++++
 .../finite_volume_euler/requirements.txt      |    6 +
 .../finite_volume/test_euler_example.py       |  146 ++
 9 files changed, 2285 insertions(+)
 create mode 100644 examples/functionals/finite_volume_euler/README.md
 create mode 100644 examples/functionals/finite_volume_euler/conf/config.yaml
 create mode 100644 examples/functionals/finite_volume_euler/euler_finite_volume.py
 create mode 100644 examples/functionals/finite_volume_euler/euler_solver.py
 create mode 100644 examples/functionals/finite_volume_euler/requirements.txt
 create mode 100644 test/nn/functional/finite_volume/test_euler_example.py

diff --git a/.gitignore b/.gitignore
index 7ec721b0e1..de7545c755 100644
--- a/.gitignore
+++ b/.gitignore
@@ -172,6 +172,7 @@ output/
 
 # Hydra
 outputs/
+outputs_finite_volume_euler/
 multirun/
 .hydra/
 
diff --git a/CHANGELOG.md b/CHANGELOG.md
index f19f4718da..063098af30 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -47,6 +47,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   `uniform_grid_gradient`, `rectilinear_grid_gradient`,
   `spectral_grid_gradient`, `meshless_fd_derivatives`, `mesh_lsq_gradient`,
   and `mesh_green_gauss_gradient`.
+- Added a functionals example for mesh-based finite-volume Euler forward-step
+  solves with 2D triangular and 3D tetrahedral cases.
 - Adds `physicsnemo.sym` module for symbolic PDE residual computation
   (`PhysicsInformer`). Users define PDEs via SymPy and select a gradient method
   (`autodiff`, `finite_difference`, `spectral`, `meshless_finite_difference`,
diff --git a/examples/README.md b/examples/README.md
index a48b9adff1..17c1d41e7e 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -50,6 +50,12 @@ The several examples inside PhysicsNeMo can be classified based on their domains
 |[Fine-tuning of DoMINO NIM](./cfd/external_aerodynamics/domino_nim_finetuning/)|DoMINO|NO|
 |[Transolver for External Aerodynamics on Irregular Meshes](./cfd/external_aerodynamics/transolver/)|Transolver|NO|
 
+### Functionals
+
+|Use case|Functional area|Transient|
+| --- | --- |  --- |
+|[Finite-Volume Euler Forward Step](./functionals/finite_volume_euler/)|Mesh finite-volume solver|YES|
+
 
 ### Weather
 
diff --git a/examples/functionals/finite_volume_euler/README.md b/examples/functionals/finite_volume_euler/README.md
new file mode 100644
index 0000000000..08dbfe7f04
--- /dev/null
+++ b/examples/functionals/finite_volume_euler/README.md
@@ -0,0 +1,29 @@
+# Finite-Volume Euler Example
+
+This example runs a second-order MUSCL-Hancock finite-volume solve of the
+compressible Euler equations on unstructured simplicial meshes. It includes a
+2D triangular forward-facing-step case and a 3D extruded tetrahedral version of
+the same geometry.
+
+The example uses `physicsnemo.mesh.Mesh` for the static simplicial mesh and the
+existing `mesh_green_gauss_gradient` functional for cell-centered gradient
+reconstruction. Euler-specific pieces, including primitive/conservative
+conversion, boundary states, CFL selection, and the fused MUSCL-Hancock
+limiter/reconstruction/Rusanov update are kept in `euler_finite_volume.py`.
+
+The timestep runtime keeps only the Mesh geometry, cell-neighbor ids, and local
+boundary tags. Face centroids, normals, areas, cell volumes, and limiter
+stencils are computed from that compact cell-local connectivity inside the
+operators.
+
+Run with:
+
+```bash
+python euler_solver.py
+```
+
+The default config writes density plots and final tensors under Hydra's
+`outputs_finite_volume_euler/` run directory. It also writes PNG frames and
+animated GIFs when `plot_every` and `make_animation` are enabled. Set
+`solver.implementation` to `torch` or `warp` to choose the backend for the same
+Euler finite-volume algorithm.
diff --git a/examples/functionals/finite_volume_euler/conf/config.yaml b/examples/functionals/finite_volume_euler/conf/config.yaml
new file mode 100644
index 0000000000..ce09d61799
--- /dev/null
+++ b/examples/functionals/finite_volume_euler/conf/config.yaml
@@ -0,0 +1,64 @@
+# SPDX-FileCopyrightText: Copyright (c) 2023 - 2026 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+hydra:
+  job:
+    chdir: True
+  run:
+    dir: ./outputs_finite_volume_euler
+
+solver:
+  device: cpu
+  implementation: torch
+  gamma: 1.4
+  cfl: 0.35
+  inflow_mach: 3.0
+  inflow_density: 1.4
+  inflow_pressure: 1.0
+  density_floor: 1.0e-6
+  pressure_floor: 1.0e-6
+  log_every: 10
+
+cases:
+  - name: forward_step_2d
+    dimension: 2
+    length: 3.0
+    height: 1.0
+    step_x: 0.6
+    step_height: 0.2
+    nx: 100
+    ny: 40
+    steps: 80
+    plot_every: 4
+    make_animation: true
+    density_vmin: 0.6
+    density_vmax: 6.5
+
+  - name: forward_step_3d
+    dimension: 3
+    length: 3.0
+    height: 1.0
+    depth: 0.25
+    step_x: 0.6
+    step_height: 0.2
+    nx: 50
+    ny: 20
+    nz: 4
+    steps: 40
+    plot_every: 4
+    make_animation: true
+    density_vmin: 0.6
+    density_vmax: 6.5
diff --git a/examples/functionals/finite_volume_euler/euler_finite_volume.py b/examples/functionals/finite_volume_euler/euler_finite_volume.py
new file mode 100644
index 0000000000..516e0a24f1
--- /dev/null
+++ b/examples/functionals/finite_volume_euler/euler_finite_volume.py
@@ -0,0 +1,1602 @@
+# SPDX-FileCopyrightText: Copyright (c) 2023 - 2026 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Example-local Euler finite-volume operators.
+
+These operators intentionally live in the example instead of
+``physicsnemo.nn.functional`` because primitive conversion, Euler boundary
+conditions, MUSCL-Hancock prediction, Rusanov flux evaluation, and conserved
+state updates are specific to compressible Euler. The public functions below
+have both Torch and Warp backends so the solver loop can stay as one clear
+algorithm.
+"""
+
+from __future__ import annotations
+
+import torch
+
+from physicsnemo.core.function_spec import FunctionSpec
+
+try:
+    import warp as wp
+
+    wp.init()
+    wp.config.quiet = True
+    _WARP_AVAILABLE = True
+except ImportError:
+    wp = None
+    _WARP_AVAILABLE = False
+
+
+INTERIOR = 0
+WALL_SLIP = 1
+INFLOW_SUPERSONIC = 2
+OUTFLOW_ZERO_GRAD = 3
+
+
+def _require_warp() -> None:
+    if not _WARP_AVAILABLE:
+        raise ImportError("The fused Euler solver requires the 'warp' package")
+
+
+def _launch_context(reference: torch.Tensor):
+    return FunctionSpec.warp_launch_context(reference)
+
+
+def primitive_to_conservative_torch(W: torch.Tensor, gamma: float) -> torch.Tensor:
+    """Convert primitive variables [rho, velocity..., pressure] to conserved form."""
+    rho = W[:, 0]
+    velocity = W[:, 1:-1]
+    pressure = W[:, -1]
+    kinetic = 0.5 * rho * (velocity * velocity).sum(dim=1)
+    energy = pressure / (gamma - 1.0) + kinetic
+    return torch.cat((rho[:, None], rho[:, None] * velocity, energy[:, None]), dim=1)
+
+
+def euler_conservative_to_primitive_torch(
+    U: torch.Tensor,
+    gamma: float,
+    density_floor: float,
+    pressure_floor: float,
+) -> torch.Tensor:
+    """Convert conserved variables [rho, rho*velocity..., energy] to primitives."""
+    rho = U[:, 0].clamp_min(density_floor)
+    momentum = U[:, 1:-1]
+    velocity = momentum / rho[:, None]
+    energy = U[:, -1]
+    kinetic = 0.5 * rho * (velocity * velocity).sum(dim=1)
+    pressure = ((gamma - 1.0) * (energy - kinetic)).clamp_min(pressure_floor)
+    return torch.cat((rho[:, None], velocity, pressure[:, None]), dim=1)
+
+
+def _apply_euler_boundary_state_torch(
+    left: torch.Tensor,
+    normals: torch.Tensor,
+    tags: torch.Tensor,
+    inflow_state: torch.Tensor,
+) -> torch.Tensor:
+    right = left.clone()
+    dims = normals.shape[1]
+
+    inflow = tags == INFLOW_SUPERSONIC
+    if bool(torch.any(inflow).item()):
+        right[inflow] = inflow_state.to(device=left.device, dtype=left.dtype)
+
+    wall = tags == WALL_SLIP
+    if bool(torch.any(wall).item()):
+        velocity = left[wall, 1 : 1 + dims]
+        normal = normals[wall]
+        normal_velocity = (velocity * normal).sum(dim=1, keepdim=True)
+        right[wall, 1 : 1 + dims] = velocity - 2.0 * normal_velocity * normal
+
+    return right
+
+
+def _euler_flux_normal_torch(
+    W: torch.Tensor, normals: torch.Tensor, gamma: float
+) -> torch.Tensor:
+    rho = W[:, 0]
+    velocity = W[:, 1:-1]
+    pressure = W[:, -1]
+    normal_velocity = (velocity * normals).sum(dim=1)
+    U = primitive_to_conservative_torch(W, gamma)
+
+    flux = torch.empty_like(U)
+    flux[:, 0] = rho * normal_velocity
+    flux[:, 1:-1] = (
+        rho[:, None] * velocity * normal_velocity[:, None] + pressure[:, None] * normals
+    )
+    flux[:, -1] = (U[:, -1] + pressure) * normal_velocity
+    return flux
+
+
+def _rusanov_euler_flux_torch(
+    W_left: torch.Tensor,
+    W_right: torch.Tensor,
+    normals: torch.Tensor,
+    gamma: float,
+    density_floor: float,
+    pressure_floor: float,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    W_left = W_left.clone()
+    W_right = W_right.clone()
+    W_left[:, 0] = W_left[:, 0].clamp_min(density_floor)
+    W_right[:, 0] = W_right[:, 0].clamp_min(density_floor)
+    W_left[:, -1] = W_left[:, -1].clamp_min(pressure_floor)
+    W_right[:, -1] = W_right[:, -1].clamp_min(pressure_floor)
+
+    U_left = primitive_to_conservative_torch(W_left, gamma)
+    U_right = primitive_to_conservative_torch(W_right, gamma)
+    flux_left = _euler_flux_normal_torch(W_left, normals, gamma)
+    flux_right = _euler_flux_normal_torch(W_right, normals, gamma)
+
+    velocity_left = W_left[:, 1:-1]
+    velocity_right = W_right[:, 1:-1]
+    sound_left = torch.sqrt(gamma * W_left[:, -1] / W_left[:, 0])
+    sound_right = torch.sqrt(gamma * W_right[:, -1] / W_right[:, 0])
+    un_left = (velocity_left * normals).sum(dim=1).abs()
+    un_right = (velocity_right * normals).sum(dim=1).abs()
+    wavespeed = torch.maximum(un_left + sound_left, un_right + sound_right)
+    flux = 0.5 * (flux_left + flux_right) - 0.5 * wavespeed[:, None] * (
+        U_right - U_left
+    )
+    return flux, wavespeed
+
+
+def _euler_primitive_time_derivative_torch(
+    W: torch.Tensor,
+    grad_W: torch.Tensor,
+    gamma: float,
+) -> torch.Tensor:
+    rho = W[:, 0]
+    velocity = W[:, 1:-1]
+    pressure = W[:, -1]
+    grad_rho = grad_W[:, :, 0]
+    grad_velocity = grad_W[:, :, 1:-1]
+    grad_pressure = grad_W[:, :, -1]
+
+    def advect(grad: torch.Tensor) -> torch.Tensor:
+        return (velocity * grad).sum(dim=1)
+
+    div_velocity = torch.diagonal(grad_velocity, dim1=1, dim2=2).sum(dim=1)
+
+    dW = torch.empty_like(W)
+    dW[:, 0] = -(advect(grad_rho) + rho * div_velocity)
+    for dim in range(velocity.shape[1]):
+        dW[:, 1 + dim] = -(
+            advect(grad_velocity[:, :, dim]) + grad_pressure[:, dim] / rho
+        )
+    dW[:, -1] = -(advect(grad_pressure) + gamma * pressure * div_velocity)
+    return dW
+
+
+def _cell_geometry_torch(
+    points: torch.Tensor,
+    cells: torch.Tensor,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """Compute simplex centroids and volumes from connectivity."""
+    cell_points = points[cells.to(dtype=torch.int64)]
+    centroids = cell_points.mean(dim=1)
+    if points.shape[1] == 2:
+        p0, p1, p2 = cell_points[:, 0], cell_points[:, 1], cell_points[:, 2]
+        volumes = 0.5 * torch.abs(
+            (p1[:, 0] - p0[:, 0]) * (p2[:, 1] - p0[:, 1])
+            - (p1[:, 1] - p0[:, 1]) * (p2[:, 0] - p0[:, 0])
+        )
+    else:
+        p0, p1, p2, p3 = (
+            cell_points[:, 0],
+            cell_points[:, 1],
+            cell_points[:, 2],
+            cell_points[:, 3],
+        )
+        volumes = (
+            torch.abs(
+                torch.einsum("bi,bi->b", p1 - p0, torch.cross(p2 - p0, p3 - p0, dim=-1))
+            )
+            / 6.0
+        )
+    return centroids, volumes.clamp_min(1.0e-12)
+
+
+def _local_face_geometry_torch(
+    points: torch.Tensor,
+    cells: torch.Tensor,
+    cell_centroids: torch.Tensor,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    """Compute local face centroids and outward area vectors for each cell."""
+    cell_points = points[cells.to(dtype=torch.int64)]
+    dims = points.shape[1]
+    n_faces = cells.shape[1]
+    face_centroids = []
+    area_vectors = []
+    for face_idx in range(n_faces):
+        face_local = [idx for idx in range(n_faces) if idx != face_idx]
+        verts = cell_points[:, face_local]
+        face_center = verts.mean(dim=1)
+        if dims == 2:
+            edge = verts[:, 1] - verts[:, 0]
+            area_vector = torch.stack((edge[:, 1], -edge[:, 0]), dim=1)
+        else:
+            area_vector = 0.5 * torch.cross(
+                verts[:, 1] - verts[:, 0], verts[:, 2] - verts[:, 0], dim=-1
+            )
+        to_face = face_center - cell_centroids
+        flip = (area_vector * to_face).sum(dim=1) < 0.0
+        area_vector = torch.where(flip[:, None], -area_vector, area_vector)
+        face_centroids.append(face_center)
+        area_vectors.append(area_vector)
+
+    face_centroids_t = torch.stack(face_centroids, dim=1)
+    area_vectors_t = torch.stack(area_vectors, dim=1)
+    face_areas_t = area_vectors_t.norm(dim=2)
+    return face_centroids_t, area_vectors_t, face_areas_t
+
+
+def _euler_barth_jespersen_phi_torch(
+    cell_values: torch.Tensor,
+    cell_gradients: torch.Tensor,
+    cell_centroids: torch.Tensor,
+    face_centroids: torch.Tensor,
+    cell_neighbors: torch.Tensor,
+    eps: float = 1.0e-12,
+) -> torch.Tensor:
+    """Compute componentwise Barth-Jespersen limiter coefficients."""
+    n_components = cell_values.shape[1]
+    mins = cell_values.clone()
+    maxs = cell_values.clone()
+
+    neighbors_i64 = cell_neighbors.to(dtype=torch.int64)
+    for face_idx in range(cell_neighbors.shape[1]):
+        neighbors = neighbors_i64[:, face_idx]
+        interior = neighbors >= 0
+        if bool(torch.any(interior).item()):
+            active_neighbors = neighbors[interior]
+            neighbor_values = cell_values[active_neighbors]
+            mins[interior] = torch.minimum(mins[interior], neighbor_values)
+            maxs[interior] = torch.maximum(maxs[interior], neighbor_values)
+
+    phi = torch.ones(
+        (cell_values.shape[0], n_components),
+        device=cell_values.device,
+        dtype=cell_values.dtype,
+    )
+    for face_idx in range(cell_neighbors.shape[1]):
+        dx = face_centroids[:, face_idx] - cell_centroids
+        delta = torch.einsum("cd,cdk->ck", dx, cell_gradients)
+        values = cell_values
+        ratio = torch.ones_like(delta)
+        positive = delta > eps
+        negative = delta < -eps
+        ratio[positive] = (maxs - values)[positive] / delta[positive]
+        ratio[negative] = (mins - values)[negative] / delta[negative]
+        ratio = ratio.clamp(0.0, 1.0)
+        phi = torch.minimum(phi, ratio)
+    return phi
+
+
+def _linear_reconstruct_to_local_faces_torch(
+    cell_values: torch.Tensor,
+    cell_gradients: torch.Tensor,
+    cell_centroids: torch.Tensor,
+    face_centroids: torch.Tensor,
+    cell_neighbors: torch.Tensor,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    dx = face_centroids - cell_centroids[:, None, :]
+    left = cell_values[:, None, :] + torch.einsum("cfd,cdk->cfk", dx, cell_gradients)
+
+    right_cells = cell_neighbors.to(dtype=torch.int64).clamp_min(0)
+    right = cell_values[right_cells].clone()
+    interior = cell_neighbors >= 0
+    if bool(torch.any(interior).item()):
+        neighbor_dx = face_centroids - cell_centroids[right_cells]
+        right[interior] = (
+            right[interior]
+            + torch.einsum("cfd,cfdk->cfk", neighbor_dx, cell_gradients[right_cells])[
+                interior
+            ]
+        )
+    return left, right
+
+
+def euler_cfl_timestep_torch(
+    cell_values: torch.Tensor,
+    points: torch.Tensor,
+    cells: torch.Tensor,
+    cell_neighbors: torch.Tensor,
+    boundary_tags: torch.Tensor,
+    inflow_state: torch.Tensor,
+    gamma: float,
+    cfl: float,
+    density_floor: float,
+    pressure_floor: float,
+    eps: float = 1.0e-12,
+) -> torch.Tensor:
+    """Compute the global explicit Euler CFL timestep."""
+    centroids, volumes = _cell_geometry_torch(points, cells)
+    _, area_vectors, face_areas = _local_face_geometry_torch(points, cells, centroids)
+    normals = area_vectors / face_areas[..., None].clamp_min(eps)
+    right_cells = cell_neighbors.to(dtype=torch.int64).clamp_min(0)
+    spectral = torch.zeros_like(volumes)
+    for face_idx in range(cells.shape[1]):
+        W_left = cell_values
+        W_right = cell_values[right_cells[:, face_idx]].clone()
+        boundary = cell_neighbors[:, face_idx] < 0
+        if bool(torch.any(boundary).item()):
+            W_right[boundary] = _apply_euler_boundary_state_torch(
+                W_left[boundary],
+                normals[:, face_idx][boundary],
+                boundary_tags[:, face_idx][boundary],
+                inflow_state,
+            )
+        _, wavespeeds = _rusanov_euler_flux_torch(
+            W_left,
+            W_right,
+            normals[:, face_idx],
+            gamma,
+            density_floor,
+            pressure_floor,
+        )
+        spectral = spectral + face_areas[:, face_idx] * wavespeeds.clamp_min(0.0)
+    return float(cfl) * torch.min(volumes / spectral.clamp_min(float(eps)))
+
+
+def euler_update_torch(
+    U: torch.Tensor,
+    cell_values: torch.Tensor,
+    cell_gradients: torch.Tensor,
+    points: torch.Tensor,
+    cells: torch.Tensor,
+    cell_neighbors: torch.Tensor,
+    boundary_tags: torch.Tensor,
+    inflow_state: torch.Tensor,
+    dt: torch.Tensor,
+    gamma: float,
+    density_floor: float,
+    pressure_floor: float,
+) -> torch.Tensor:
+    """Torch backend for the same MUSCL-Hancock Euler update as the Warp kernel."""
+    cell_centroids, cell_volumes = _cell_geometry_torch(points, cells)
+    face_centroids, face_area_vectors, face_areas = _local_face_geometry_torch(
+        points, cells, cell_centroids
+    )
+    phi = _euler_barth_jespersen_phi_torch(
+        cell_values,
+        cell_gradients,
+        cell_centroids,
+        face_centroids,
+        cell_neighbors,
+    )
+    limited_gradients = cell_gradients * phi[:, None, :]
+    W_half = cell_values + 0.5 * dt * _euler_primitive_time_derivative_torch(
+        cell_values, limited_gradients, gamma
+    )
+    W_half[:, 0] = W_half[:, 0].clamp_min(density_floor)
+    W_half[:, -1] = W_half[:, -1].clamp_min(pressure_floor)
+
+    W_left, W_right = _linear_reconstruct_to_local_faces_torch(
+        W_half,
+        limited_gradients,
+        cell_centroids,
+        face_centroids,
+        cell_neighbors,
+    )
+    W_left[..., 0] = W_left[..., 0].clamp_min(density_floor)
+    W_left[..., -1] = W_left[..., -1].clamp_min(pressure_floor)
+    W_right[..., 0] = W_right[..., 0].clamp_min(density_floor)
+    W_right[..., -1] = W_right[..., -1].clamp_min(pressure_floor)
+
+    normals = face_area_vectors / face_areas[..., None].clamp_min(1.0e-12)
+    boundary = cell_neighbors < 0
+    if bool(torch.any(boundary).item()):
+        W_right[boundary] = _apply_euler_boundary_state_torch(
+            W_left[boundary],
+            normals[boundary],
+            boundary_tags[boundary],
+            inflow_state,
+        )
+    flux, _ = _rusanov_euler_flux_torch(
+        W_left.reshape(-1, W_left.shape[-1]),
+        W_right.reshape(-1, W_right.shape[-1]),
+        normals.reshape(-1, normals.shape[-1]),
+        gamma,
+        density_floor,
+        pressure_floor,
+    )
+    flux = flux.reshape_as(W_left) * face_areas[..., None]
+
+    U_next = U.clone()
+    U_next = U_next - dt * flux.sum(dim=1) / cell_volumes[:, None]
+    return U_next
+
+
+if _WARP_AVAILABLE:
+
+    @wp.kernel
+    def _conservative_to_primitive_kernel(
+        U: wp.array2d(dtype=wp.float32),
+        gamma: float,
+        density_floor: float,
+        pressure_floor: float,
+        dims: int,
+        W: wp.array2d(dtype=wp.float32),
+    ):
+        cell = wp.tid()
+        rho = wp.max(U[cell, 0], density_floor)
+        inv_rho = 1.0 / rho
+        speed2 = float(0.0)
+        for d in range(dims):
+            velocity = U[cell, 1 + d] * inv_rho
+            W[cell, 1 + d] = velocity
+            speed2 = speed2 + velocity * velocity
+        pressure = (gamma - 1.0) * (U[cell, dims + 1] - 0.5 * rho * speed2)
+
+        W[cell, 0] = rho
+        W[cell, dims + 1] = wp.max(pressure, pressure_floor)
+
+    @wp.func
+    def _select_phi(
+        comp: int,
+        phi_0: float,
+        phi_1: float,
+        phi_2: float,
+        phi_3: float,
+        phi_4: float,
+    ):
+        if comp == 0:
+            return phi_0
+        elif comp == 1:
+            return phi_1
+        elif comp == 2:
+            return phi_2
+        elif comp == 3:
+            return phi_3
+        return phi_4
+
+    @wp.func
+    def _cell_centroid_vec(
+        points: wp.array2d(dtype=wp.float32),
+        cells: wp.array2d(dtype=wp.int32),
+        cell: int,
+        dims: int,
+    ):
+        centroid = wp.vec3(0.0, 0.0, 0.0)
+        inv_count = 1.0 / float(dims + 1)
+        for local_vertex in range(4):
+            if local_vertex < dims + 1:
+                point = cells[cell, local_vertex]
+                centroid[0] = centroid[0] + points[point, 0] * inv_count
+                centroid[1] = centroid[1] + points[point, 1] * inv_count
+                if dims == 3:
+                    centroid[2] = centroid[2] + points[point, 2] * inv_count
+        return centroid
+
+    @wp.func
+    def _cell_volume(
+        points: wp.array2d(dtype=wp.float32),
+        cells: wp.array2d(dtype=wp.int32),
+        cell: int,
+        dims: int,
+    ):
+        p0 = cells[cell, 0]
+        p1 = cells[cell, 1]
+        p2 = cells[cell, 2]
+        if dims == 2:
+            x10 = points[p1, 0] - points[p0, 0]
+            y10 = points[p1, 1] - points[p0, 1]
+            x20 = points[p2, 0] - points[p0, 0]
+            y20 = points[p2, 1] - points[p0, 1]
+            return wp.max(0.5 * wp.abs(x10 * y20 - y10 * x20), 1.0e-30)
+
+        p3 = cells[cell, 3]
+        a = wp.vec3(
+            points[p1, 0] - points[p0, 0],
+            points[p1, 1] - points[p0, 1],
+            points[p1, 2] - points[p0, 2],
+        )
+        b = wp.vec3(
+            points[p2, 0] - points[p0, 0],
+            points[p2, 1] - points[p0, 1],
+            points[p2, 2] - points[p0, 2],
+        )
+        c = wp.vec3(
+            points[p3, 0] - points[p0, 0],
+            points[p3, 1] - points[p0, 1],
+            points[p3, 2] - points[p0, 2],
+        )
+        return wp.max(wp.abs(wp.dot(a, wp.cross(b, c))) / 6.0, 1.0e-30)
+
+    @wp.func
+    def _local_face_centroid_vec(
+        points: wp.array2d(dtype=wp.float32),
+        cells: wp.array2d(dtype=wp.int32),
+        cell: int,
+        local_face: int,
+        dims: int,
+    ):
+        centroid = wp.vec3(0.0, 0.0, 0.0)
+        inv_count = 1.0 / float(dims)
+        for local_vertex in range(4):
+            if local_vertex < dims + 1 and local_vertex != local_face:
+                point = cells[cell, local_vertex]
+                centroid[0] = centroid[0] + points[point, 0] * inv_count
+                centroid[1] = centroid[1] + points[point, 1] * inv_count
+                if dims == 3:
+                    centroid[2] = centroid[2] + points[point, 2] * inv_count
+        return centroid
+
+    @wp.func
+    def _local_face_area_vector(
+        points: wp.array2d(dtype=wp.float32),
+        cells: wp.array2d(dtype=wp.int32),
+        cell: int,
+        local_face: int,
+        cell_centroid: wp.vec3,
+        face_centroid: wp.vec3,
+        dims: int,
+    ):
+        a = int(-1)
+        b = int(-1)
+        c = int(-1)
+        count = int(0)
+        for local_vertex in range(4):
+            if local_vertex < dims + 1 and local_vertex != local_face:
+                point = cells[cell, local_vertex]
+                if count == 0:
+                    a = point
+                elif count == 1:
+                    b = point
+                else:
+                    c = point
+                count = count + 1
+
+        if dims == 2:
+            edge_x = points[b, 0] - points[a, 0]
+            edge_y = points[b, 1] - points[a, 1]
+            area_vector = wp.vec3(edge_y, -edge_x, 0.0)
+        else:
+            ab = wp.vec3(
+                points[b, 0] - points[a, 0],
+                points[b, 1] - points[a, 1],
+                points[b, 2] - points[a, 2],
+            )
+            ac = wp.vec3(
+                points[c, 0] - points[a, 0],
+                points[c, 1] - points[a, 1],
+                points[c, 2] - points[a, 2],
+            )
+            area_vector = 0.5 * wp.cross(ab, ac)
+
+        if wp.dot(area_vector, face_centroid - cell_centroid) < 0.0:
+            area_vector = -area_vector
+        return area_vector
+
+    @wp.func
+    def _cell_limiter_phi(
+        points: wp.array2d(dtype=wp.float32),
+        cells: wp.array2d(dtype=wp.int32),
+        values: wp.array2d(dtype=wp.float32),
+        gradients: wp.array3d(dtype=wp.float32),
+        neighbors: wp.array2d(dtype=wp.int32),
+        cell: int,
+        comp: int,
+        dims: int,
+        n_cell_faces: int,
+        eps: float,
+    ):
+        value = values[cell, comp]
+        value_min = value
+        value_max = value
+        for local_face in range(n_cell_faces):
+            neighbor = neighbors[cell, local_face]
+            if neighbor >= 0:
+                neighbor_value = values[neighbor, comp]
+                value_min = wp.min(value_min, neighbor_value)
+                value_max = wp.max(value_max, neighbor_value)
+
+        cell_centroid = _cell_centroid_vec(points, cells, cell, dims)
+        phi = float(1.0)
+        for local_face in range(n_cell_faces):
+            face_centroid = _local_face_centroid_vec(
+                points, cells, cell, local_face, dims
+            )
+            delta = float(0.0)
+            for d in range(dims):
+                dx = face_centroid[d] - cell_centroid[d]
+                delta = delta + dx * gradients[cell, d, comp]
+
+            ratio = float(1.0)
+            if delta > eps:
+                ratio = (value_max - value) / delta
+            elif delta < -eps:
+                ratio = (value_min - value) / delta
+            phi = wp.min(phi, wp.min(wp.max(ratio, 0.0), 1.0))
+        return phi
+
+    @wp.func
+    def _limited_grad(
+        gradients: wp.array3d(dtype=wp.float32),
+        cell: int,
+        dim: int,
+        comp: int,
+        phi_0: float,
+        phi_1: float,
+        phi_2: float,
+        phi_3: float,
+        phi_4: float,
+    ):
+        return gradients[cell, dim, comp] * _select_phi(
+            comp, phi_0, phi_1, phi_2, phi_3, phi_4
+        )
+
+    @wp.func
+    def _predicted_value(
+        values: wp.array2d(dtype=wp.float32),
+        gradients: wp.array3d(dtype=wp.float32),
+        cell: int,
+        comp: int,
+        dt: float,
+        gamma: float,
+        dims: int,
+        phi_0: float,
+        phi_1: float,
+        phi_2: float,
+        phi_3: float,
+        phi_4: float,
+    ):
+        rho = wp.max(values[cell, 0], 1.0e-30)
+        pressure = values[cell, dims + 1]
+
+        advected = float(0.0)
+        div_velocity = float(0.0)
+        for d in range(dims):
+            velocity_d = values[cell, 1 + d]
+            advected = advected + velocity_d * _limited_grad(
+                gradients, cell, d, comp, phi_0, phi_1, phi_2, phi_3, phi_4
+            )
+            div_velocity = div_velocity + _limited_grad(
+                gradients, cell, d, 1 + d, phi_0, phi_1, phi_2, phi_3, phi_4
+            )
+
+        derivative = -advected
+        if comp == 0:
+            derivative = derivative - rho * div_velocity
+        elif comp == dims + 1:
+            derivative = derivative - gamma * pressure * div_velocity
+        else:
+            velocity_dim = comp - 1
+            derivative = (
+                derivative
+                - _limited_grad(
+                    gradients,
+                    cell,
+                    velocity_dim,
+                    dims + 1,
+                    phi_0,
+                    phi_1,
+                    phi_2,
+                    phi_3,
+                    phi_4,
+                )
+                / rho
+            )
+
+        return values[cell, comp] + 0.5 * dt * derivative
+
+    @wp.func
+    def _reconstruct_predicted(
+        values: wp.array2d(dtype=wp.float32),
+        gradients: wp.array3d(dtype=wp.float32),
+        cell_centroid: wp.vec3,
+        face_centroid: wp.vec3,
+        cell: int,
+        comp: int,
+        dt: float,
+        gamma: float,
+        dims: int,
+        phi_0: float,
+        phi_1: float,
+        phi_2: float,
+        phi_3: float,
+        phi_4: float,
+    ):
+        value = _predicted_value(
+            values,
+            gradients,
+            cell,
+            comp,
+            dt,
+            gamma,
+            dims,
+            phi_0,
+            phi_1,
+            phi_2,
+            phi_3,
+            phi_4,
+        )
+        for d in range(dims):
+            dx = face_centroid[d] - cell_centroid[d]
+            value = value + dx * _limited_grad(
+                gradients, cell, d, comp, phi_0, phi_1, phi_2, phi_3, phi_4
+            )
+        return value
+
+    @wp.func
+    def _primitive_energy(
+        rho: float,
+        u: float,
+        v: float,
+        w: float,
+        pressure: float,
+        gamma: float,
+        dims: int,
+    ):
+        speed2 = u * u + v * v
+        if dims == 3:
+            speed2 = speed2 + w * w
+        return pressure / (gamma - 1.0) + 0.5 * rho * speed2
+
+    @wp.func
+    def _normal_flux_component(
+        comp: int,
+        rho: float,
+        u: float,
+        v: float,
+        w: float,
+        pressure: float,
+        energy: float,
+        un: float,
+        nx: float,
+        ny: float,
+        nz: float,
+        dims: int,
+    ):
+        if comp == 0:
+            return rho * un
+        elif comp == 1:
+            return rho * u * un + pressure * nx
+        elif comp == 2:
+            return rho * v * un + pressure * ny
+        elif comp == 3 and dims == 3:
+            return rho * w * un + pressure * nz
+        else:
+            return (energy + pressure) * un
+
+    @wp.func
+    def _conservative_component(
+        comp: int,
+        rho: float,
+        u: float,
+        v: float,
+        w: float,
+        energy: float,
+        dims: int,
+    ):
+        if comp == 0:
+            return rho
+        elif comp == 1:
+            return rho * u
+        elif comp == 2:
+            return rho * v
+        elif comp == 3 and dims == 3:
+            return rho * w
+        else:
+            return energy
+
+    @wp.kernel
+    def _euler_cfl_cell_kernel(
+        values: wp.array2d(dtype=wp.float32),
+        points: wp.array2d(dtype=wp.float32),
+        cells: wp.array2d(dtype=wp.int32),
+        neighbors: wp.array2d(dtype=wp.int32),
+        boundary_tags: wp.array2d(dtype=wp.int32),
+        inflow: wp.array(dtype=wp.float32),
+        gamma: float,
+        density_floor: float,
+        pressure_floor: float,
+        dims: int,
+        spectral: wp.array(dtype=wp.float32),
+    ):
+        cell = wp.tid()
+        n_cell_faces = dims + 1
+        cell_centroid = _cell_centroid_vec(points, cells, cell, dims)
+        spectral_sum = float(0.0)
+
+        rho_l = wp.max(values[cell, 0], density_floor)
+        u_l = values[cell, 1]
+        v_l = values[cell, 2]
+        w_l = float(0.0)
+        if dims == 3:
+            w_l = values[cell, 3]
+        p_l = wp.max(values[cell, dims + 1], pressure_floor)
+
+        for local_face in range(4):
+            if local_face < n_cell_faces:
+                neighbor = neighbors[cell, local_face]
+                face_centroid = _local_face_centroid_vec(
+                    points, cells, cell, local_face, dims
+                )
+                area_vector = _local_face_area_vector(
+                    points, cells, cell, local_face, cell_centroid, face_centroid, dims
+                )
+                area = wp.max(wp.length(area_vector), 1.0e-30)
+                nx = area_vector[0] / area
+                ny = area_vector[1] / area
+                nz = float(0.0)
+                if dims == 3:
+                    nz = area_vector[2] / area
+
+                rho_r = rho_l
+                u_r = u_l
+                v_r = v_l
+                w_r = w_l
+                p_r = p_l
+                if neighbor >= 0:
+                    rho_r = wp.max(values[neighbor, 0], density_floor)
+                    u_r = values[neighbor, 1]
+                    v_r = values[neighbor, 2]
+                    if dims == 3:
+                        w_r = values[neighbor, 3]
+                    p_r = wp.max(values[neighbor, dims + 1], pressure_floor)
+                elif boundary_tags[cell, local_face] == INFLOW_SUPERSONIC:
+                    rho_r = wp.max(inflow[0], density_floor)
+                    u_r = inflow[1]
+                    v_r = inflow[2]
+                    if dims == 3:
+                        w_r = inflow[3]
+                    p_r = wp.max(inflow[dims + 1], pressure_floor)
+                elif boundary_tags[cell, local_face] == WALL_SLIP:
+                    un_l = u_l * nx + v_l * ny + w_l * nz
+                    u_r = u_l - 2.0 * un_l * nx
+                    v_r = v_l - 2.0 * un_l * ny
+                    if dims == 3:
+                        w_r = w_l - 2.0 * un_l * nz
+
+                un_l_abs = wp.abs(u_l * nx + v_l * ny + w_l * nz)
+                un_r_abs = wp.abs(u_r * nx + v_r * ny + w_r * nz)
+                c_l = wp.sqrt(gamma * p_l / rho_l)
+                c_r = wp.sqrt(gamma * p_r / rho_r)
+                spectral_sum = spectral_sum + area * wp.max(
+                    un_l_abs + c_l, un_r_abs + c_r
+                )
+
+        spectral[cell] = spectral_sum
+
+    @wp.func
+    def _euler_update_cell(
+        cell: int,
+        U_next: wp.array2d(dtype=wp.float32),
+        points: wp.array2d(dtype=wp.float32),
+        cells: wp.array2d(dtype=wp.int32),
+        values: wp.array2d(dtype=wp.float32),
+        gradients: wp.array3d(dtype=wp.float32),
+        neighbors: wp.array2d(dtype=wp.int32),
+        boundary_tags: wp.array2d(dtype=wp.int32),
+        inflow: wp.array(dtype=wp.float32),
+        dt: float,
+        gamma: float,
+        density_floor: float,
+        pressure_floor: float,
+        dims: int,
+        n_cell_faces: int,
+    ):
+        cell_centroid = _cell_centroid_vec(points, cells, cell, dims)
+        cell_volume = _cell_volume(points, cells, cell, dims)
+        phi_0 = _cell_limiter_phi(
+            points,
+            cells,
+            values,
+            gradients,
+            neighbors,
+            cell,
+            0,
+            dims,
+            n_cell_faces,
+            1.0e-12,
+        )
+        phi_1 = _cell_limiter_phi(
+            points,
+            cells,
+            values,
+            gradients,
+            neighbors,
+            cell,
+            1,
+            dims,
+            n_cell_faces,
+            1.0e-12,
+        )
+        phi_2 = _cell_limiter_phi(
+            points,
+            cells,
+            values,
+            gradients,
+            neighbors,
+            cell,
+            2,
+            dims,
+            n_cell_faces,
+            1.0e-12,
+        )
+        phi_3 = _cell_limiter_phi(
+            points,
+            cells,
+            values,
+            gradients,
+            neighbors,
+            cell,
+            3,
+            dims,
+            n_cell_faces,
+            1.0e-12,
+        )
+        phi_4 = float(1.0)
+        if dims == 3:
+            phi_4 = _cell_limiter_phi(
+                points,
+                cells,
+                values,
+                gradients,
+                neighbors,
+                cell,
+                4,
+                dims,
+                n_cell_faces,
+                1.0e-12,
+            )
+
+        for local_face in range(n_cell_faces):
+            other = neighbors[cell, local_face]
+            face_centroid = _local_face_centroid_vec(
+                points, cells, cell, local_face, dims
+            )
+            area_vector = _local_face_area_vector(
+                points, cells, cell, local_face, cell_centroid, face_centroid, dims
+            )
+            area = wp.max(wp.length(area_vector), 1.0e-30)
+            nx = area_vector[0] / area
+            ny = area_vector[1] / area
+            nz = float(0.0)
+            if dims == 3:
+                nz = area_vector[2] / area
+
+            rho_l = _reconstruct_predicted(
+                values,
+                gradients,
+                cell_centroid,
+                face_centroid,
+                cell,
+                0,
+                dt,
+                gamma,
+                dims,
+                phi_0,
+                phi_1,
+                phi_2,
+                phi_3,
+                phi_4,
+            )
+            u_l = _reconstruct_predicted(
+                values,
+                gradients,
+                cell_centroid,
+                face_centroid,
+                cell,
+                1,
+                dt,
+                gamma,
+                dims,
+                phi_0,
+                phi_1,
+                phi_2,
+                phi_3,
+                phi_4,
+            )
+            v_l = _reconstruct_predicted(
+                values,
+                gradients,
+                cell_centroid,
+                face_centroid,
+                cell,
+                2,
+                dt,
+                gamma,
+                dims,
+                phi_0,
+                phi_1,
+                phi_2,
+                phi_3,
+                phi_4,
+            )
+            w_l = float(0.0)
+            if dims == 3:
+                w_l = _reconstruct_predicted(
+                    values,
+                    gradients,
+                    cell_centroid,
+                    face_centroid,
+                    cell,
+                    3,
+                    dt,
+                    gamma,
+                    dims,
+                    phi_0,
+                    phi_1,
+                    phi_2,
+                    phi_3,
+                    phi_4,
+                )
+            p_l = _reconstruct_predicted(
+                values,
+                gradients,
+                cell_centroid,
+                face_centroid,
+                cell,
+                dims + 1,
+                dt,
+                gamma,
+                dims,
+                phi_0,
+                phi_1,
+                phi_2,
+                phi_3,
+                phi_4,
+            )
+            rho_l = wp.max(rho_l, density_floor)
+            p_l = wp.max(p_l, pressure_floor)
+
+            rho_r = rho_l
+            u_r = u_l
+            v_r = v_l
+            w_r = w_l
+            p_r = p_l
+            if other >= 0:
+                other_centroid = _cell_centroid_vec(points, cells, other, dims)
+                other_phi_0 = _cell_limiter_phi(
+                    points,
+                    cells,
+                    values,
+                    gradients,
+                    neighbors,
+                    other,
+                    0,
+                    dims,
+                    n_cell_faces,
+                    1.0e-12,
+                )
+                other_phi_1 = _cell_limiter_phi(
+                    points,
+                    cells,
+                    values,
+                    gradients,
+                    neighbors,
+                    other,
+                    1,
+                    dims,
+                    n_cell_faces,
+                    1.0e-12,
+                )
+                other_phi_2 = _cell_limiter_phi(
+                    points,
+                    cells,
+                    values,
+                    gradients,
+                    neighbors,
+                    other,
+                    2,
+                    dims,
+                    n_cell_faces,
+                    1.0e-12,
+                )
+                other_phi_3 = _cell_limiter_phi(
+                    points,
+                    cells,
+                    values,
+                    gradients,
+                    neighbors,
+                    other,
+                    3,
+                    dims,
+                    n_cell_faces,
+                    1.0e-12,
+                )
+                other_phi_4 = float(1.0)
+                if dims == 3:
+                    other_phi_4 = _cell_limiter_phi(
+                        points,
+                        cells,
+                        values,
+                        gradients,
+                        neighbors,
+                        other,
+                        4,
+                        dims,
+                        n_cell_faces,
+                        1.0e-12,
+                    )
+                rho_r = _reconstruct_predicted(
+                    values,
+                    gradients,
+                    other_centroid,
+                    face_centroid,
+                    other,
+                    0,
+                    dt,
+                    gamma,
+                    dims,
+                    other_phi_0,
+                    other_phi_1,
+                    other_phi_2,
+                    other_phi_3,
+                    other_phi_4,
+                )
+                u_r = _reconstruct_predicted(
+                    values,
+                    gradients,
+                    other_centroid,
+                    face_centroid,
+                    other,
+                    1,
+                    dt,
+                    gamma,
+                    dims,
+                    other_phi_0,
+                    other_phi_1,
+                    other_phi_2,
+                    other_phi_3,
+                    other_phi_4,
+                )
+                v_r = _reconstruct_predicted(
+                    values,
+                    gradients,
+                    other_centroid,
+                    face_centroid,
+                    other,
+                    2,
+                    dt,
+                    gamma,
+                    dims,
+                    other_phi_0,
+                    other_phi_1,
+                    other_phi_2,
+                    other_phi_3,
+                    other_phi_4,
+                )
+                if dims == 3:
+                    w_r = _reconstruct_predicted(
+                        values,
+                        gradients,
+                        other_centroid,
+                        face_centroid,
+                        other,
+                        3,
+                        dt,
+                        gamma,
+                        dims,
+                        other_phi_0,
+                        other_phi_1,
+                        other_phi_2,
+                        other_phi_3,
+                        other_phi_4,
+                    )
+                p_r = _reconstruct_predicted(
+                    values,
+                    gradients,
+                    other_centroid,
+                    face_centroid,
+                    other,
+                    dims + 1,
+                    dt,
+                    gamma,
+                    dims,
+                    other_phi_0,
+                    other_phi_1,
+                    other_phi_2,
+                    other_phi_3,
+                    other_phi_4,
+                )
+                rho_r = wp.max(rho_r, density_floor)
+                p_r = wp.max(p_r, pressure_floor)
+            elif boundary_tags[cell, local_face] == INFLOW_SUPERSONIC:
+                rho_r = wp.max(inflow[0], density_floor)
+                u_r = inflow[1]
+                v_r = inflow[2]
+                if dims == 3:
+                    w_r = inflow[3]
+                p_r = wp.max(inflow[dims + 1], pressure_floor)
+            elif boundary_tags[cell, local_face] == WALL_SLIP:
+                un_l_boundary = u_l * nx + v_l * ny + w_l * nz
+                u_r = u_l - 2.0 * un_l_boundary * nx
+                v_r = v_l - 2.0 * un_l_boundary * ny
+                if dims == 3:
+                    w_r = w_l - 2.0 * un_l_boundary * nz
+
+            energy_l = _primitive_energy(rho_l, u_l, v_l, w_l, p_l, gamma, dims)
+            energy_r = _primitive_energy(rho_r, u_r, v_r, w_r, p_r, gamma, dims)
+            un_l = u_l * nx + v_l * ny + w_l * nz
+            un_r = u_r * nx + v_r * ny + w_r * nz
+            c_l = wp.sqrt(gamma * p_l / rho_l)
+            c_r = wp.sqrt(gamma * p_r / rho_r)
+            speed = wp.max(wp.abs(un_l) + c_l, wp.abs(un_r) + c_r)
+
+            for comp in range(5):
+                if comp < dims + 2:
+                    flux_l = _normal_flux_component(
+                        comp,
+                        rho_l,
+                        u_l,
+                        v_l,
+                        w_l,
+                        p_l,
+                        energy_l,
+                        un_l,
+                        nx,
+                        ny,
+                        nz,
+                        dims,
+                    )
+                    flux_r = _normal_flux_component(
+                        comp,
+                        rho_r,
+                        u_r,
+                        v_r,
+                        w_r,
+                        p_r,
+                        energy_r,
+                        un_r,
+                        nx,
+                        ny,
+                        nz,
+                        dims,
+                    )
+                    conserved_l = _conservative_component(
+                        comp, rho_l, u_l, v_l, w_l, energy_l, dims
+                    )
+                    conserved_r = _conservative_component(
+                        comp, rho_r, u_r, v_r, w_r, energy_r, dims
+                    )
+                    flux = area * (
+                        0.5 * (flux_l + flux_r)
+                        - 0.5 * speed * (conserved_r - conserved_l)
+                    )
+                    U_next[cell, comp] = U_next[cell, comp] - dt * flux / cell_volume
+
+    @wp.kernel
+    def _euler_update_2d_cell_kernel(
+        U_next: wp.array2d(dtype=wp.float32),
+        points: wp.array2d(dtype=wp.float32),
+        cells: wp.array2d(dtype=wp.int32),
+        values: wp.array2d(dtype=wp.float32),
+        gradients: wp.array3d(dtype=wp.float32),
+        neighbors: wp.array2d(dtype=wp.int32),
+        boundary_tags: wp.array2d(dtype=wp.int32),
+        inflow: wp.array(dtype=wp.float32),
+        dt: float,
+        gamma: float,
+        density_floor: float,
+        pressure_floor: float,
+        n_cell_faces: int,
+    ):
+        _euler_update_cell(
+            wp.tid(),
+            U_next,
+            points,
+            cells,
+            values,
+            gradients,
+            neighbors,
+            boundary_tags,
+            inflow,
+            dt,
+            gamma,
+            density_floor,
+            pressure_floor,
+            2,
+            n_cell_faces,
+        )
+
+    @wp.kernel
+    def _euler_update_3d_cell_kernel(
+        U_next: wp.array2d(dtype=wp.float32),
+        points: wp.array2d(dtype=wp.float32),
+        cells: wp.array2d(dtype=wp.int32),
+        values: wp.array2d(dtype=wp.float32),
+        gradients: wp.array3d(dtype=wp.float32),
+        neighbors: wp.array2d(dtype=wp.int32),
+        boundary_tags: wp.array2d(dtype=wp.int32),
+        inflow: wp.array(dtype=wp.float32),
+        dt: float,
+        gamma: float,
+        density_floor: float,
+        pressure_floor: float,
+        n_cell_faces: int,
+    ):
+        _euler_update_cell(
+            wp.tid(),
+            U_next,
+            points,
+            cells,
+            values,
+            gradients,
+            neighbors,
+            boundary_tags,
+            inflow,
+            dt,
+            gamma,
+            density_floor,
+            pressure_floor,
+            3,
+            n_cell_faces,
+        )
+
+
+def euler_conservative_to_primitive_warp(
+    U: torch.Tensor,
+    gamma: float,
+    density_floor: float,
+    pressure_floor: float,
+) -> torch.Tensor:
+    """Convert Euler conserved variables to primitives with one Warp cell kernel."""
+    _require_warp()
+    dims = U.shape[1] - 2
+    U_fp32 = U.to(dtype=torch.float32).contiguous()
+    W = torch.empty_like(U_fp32)
+    wp_device, wp_stream = _launch_context(U_fp32)
+    with wp.ScopedStream(wp_stream):
+        wp.launch(
+            _conservative_to_primitive_kernel,
+            dim=U.shape[0],
+            inputs=[
+                wp.from_torch(U_fp32, dtype=wp.float32),
+                float(gamma),
+                float(density_floor),
+                float(pressure_floor),
+                int(dims),
+                wp.from_torch(W, dtype=wp.float32),
+            ],
+            device=wp_device,
+            stream=wp_stream,
+        )
+    return W.to(dtype=U.dtype)
+
+
+def euler_cfl_timestep_warp(
+    cell_values: torch.Tensor,
+    points: torch.Tensor,
+    cells: torch.Tensor,
+    cell_neighbors: torch.Tensor,
+    boundary_tags: torch.Tensor,
+    inflow_state: torch.Tensor,
+    gamma: float,
+    cfl: float,
+    density_floor: float,
+    pressure_floor: float,
+    eps: float = 1.0e-12,
+) -> torch.Tensor:
+    """Compute the explicit Euler CFL timestep without face-state tensors."""
+    _require_warp()
+    dims = cell_values.shape[1] - 2
+    n_cells = cell_values.shape[0]
+
+    values_fp32 = cell_values.to(dtype=torch.float32).contiguous()
+    points_fp32 = points.to(dtype=torch.float32).contiguous()
+    cells_i32 = cells.to(dtype=torch.int32).contiguous()
+    neighbors_i32 = cell_neighbors.to(dtype=torch.int32).contiguous()
+    tags_i32 = boundary_tags.to(dtype=torch.int32).contiguous()
+    inflow_fp32 = inflow_state.to(
+        device=cell_values.device, dtype=torch.float32
+    ).contiguous()
+    spectral = torch.zeros((n_cells,), device=cell_values.device, dtype=torch.float32)
+
+    wp_device, wp_stream = _launch_context(values_fp32)
+    with wp.ScopedStream(wp_stream):
+        wp.launch(
+            _euler_cfl_cell_kernel,
+            dim=n_cells,
+            inputs=[
+                wp.from_torch(values_fp32, dtype=wp.float32),
+                wp.from_torch(points_fp32, dtype=wp.float32),
+                wp.from_torch(cells_i32, dtype=wp.int32),
+                wp.from_torch(neighbors_i32, dtype=wp.int32),
+                wp.from_torch(tags_i32, dtype=wp.int32),
+                wp.from_torch(inflow_fp32, dtype=wp.float32),
+                float(gamma),
+                float(density_floor),
+                float(pressure_floor),
+                int(dims),
+                wp.from_torch(spectral, dtype=wp.float32),
+            ],
+            device=wp_device,
+            stream=wp_stream,
+        )
+
+    _, volumes = _cell_geometry_torch(points_fp32, cells_i32)
+    dt = float(cfl) * torch.min(volumes / spectral.clamp_min(float(eps)))
+    return dt.to(dtype=cell_values.dtype)
+
+
+def euler_update_warp(
+    U: torch.Tensor,
+    cell_values: torch.Tensor,
+    cell_gradients: torch.Tensor,
+    points: torch.Tensor,
+    cells: torch.Tensor,
+    cell_neighbors: torch.Tensor,
+    boundary_tags: torch.Tensor,
+    inflow_state: torch.Tensor,
+    dt: torch.Tensor,
+    gamma: float,
+    density_floor: float,
+    pressure_floor: float,
+) -> torch.Tensor:
+    """Fuse MUSCL-Hancock reconstruction, boundary states, fluxes, and update."""
+    _require_warp()
+    dims = U.shape[1] - 2
+    if dims not in (2, 3):
+        raise ValueError("Euler update supports only 2D and 3D states")
+    n_cells = U.shape[0]
+    n_cell_faces = cells.shape[1]
+
+    U_next = U.to(dtype=torch.float32).contiguous().clone()
+    points_fp32 = points.to(dtype=torch.float32).contiguous()
+    cells_i32 = cells.to(dtype=torch.int32).contiguous()
+    values_fp32 = cell_values.to(dtype=torch.float32).contiguous()
+    gradients_fp32 = cell_gradients.to(dtype=torch.float32).contiguous()
+    neighbors_i32 = cell_neighbors.to(dtype=torch.int32).contiguous()
+    tags_i32 = boundary_tags.to(dtype=torch.int32).contiguous()
+    inflow_fp32 = inflow_state.to(device=U.device, dtype=torch.float32).contiguous()
+
+    wp_device, wp_stream = _launch_context(U_next)
+    update_kernel = (
+        _euler_update_2d_cell_kernel if dims == 2 else _euler_update_3d_cell_kernel
+    )
+    with wp.ScopedStream(wp_stream):
+        wp.launch(
+            update_kernel,
+            dim=n_cells,
+            inputs=[
+                wp.from_torch(U_next, dtype=wp.float32),
+                wp.from_torch(points_fp32, dtype=wp.float32),
+                wp.from_torch(cells_i32, dtype=wp.int32),
+                wp.from_torch(values_fp32, dtype=wp.float32),
+                wp.from_torch(gradients_fp32, dtype=wp.float32),
+                wp.from_torch(neighbors_i32, dtype=wp.int32),
+                wp.from_torch(tags_i32, dtype=wp.int32),
+                wp.from_torch(inflow_fp32, dtype=wp.float32),
+                float(dt),
+                float(gamma),
+                float(density_floor),
+                float(pressure_floor),
+                int(n_cell_faces),
+            ],
+            device=wp_device,
+            stream=wp_stream,
+        )
+    return U_next.to(dtype=U.dtype)
+
+
+def euler_conservative_to_primitive(
+    U: torch.Tensor,
+    gamma: float,
+    density_floor: float,
+    pressure_floor: float,
+    implementation: str,
+) -> torch.Tensor:
+    """Convert conserved variables to primitives using the selected backend."""
+    if implementation == "torch":
+        return euler_conservative_to_primitive_torch(
+            U, gamma, density_floor, pressure_floor
+        )
+    if implementation == "warp":
+        return euler_conservative_to_primitive_warp(
+            U, gamma, density_floor, pressure_floor
+        )
+    raise ValueError("solver.implementation must be 'torch' or 'warp'")
+
+
+def euler_cfl_timestep(
+    cell_values: torch.Tensor,
+    points: torch.Tensor,
+    cells: torch.Tensor,
+    cell_neighbors: torch.Tensor,
+    boundary_tags: torch.Tensor,
+    inflow_state: torch.Tensor,
+    gamma: float,
+    cfl: float,
+    density_floor: float,
+    pressure_floor: float,
+    implementation: str,
+    eps: float = 1.0e-12,
+) -> torch.Tensor:
+    """Compute the explicit CFL timestep using the selected backend."""
+    if implementation == "torch":
+        return euler_cfl_timestep_torch(
+            cell_values,
+            points,
+            cells,
+            cell_neighbors,
+            boundary_tags,
+            inflow_state,
+            gamma,
+            cfl,
+            density_floor,
+            pressure_floor,
+            eps,
+        )
+    if implementation == "warp":
+        return euler_cfl_timestep_warp(
+            cell_values,
+            points,
+            cells,
+            cell_neighbors,
+            boundary_tags,
+            inflow_state,
+            gamma,
+            cfl,
+            density_floor,
+            pressure_floor,
+            eps,
+        )
+    raise ValueError("solver.implementation must be 'torch' or 'warp'")
+
+
+def euler_update(
+    U: torch.Tensor,
+    cell_values: torch.Tensor,
+    cell_gradients: torch.Tensor,
+    points: torch.Tensor,
+    cells: torch.Tensor,
+    cell_neighbors: torch.Tensor,
+    boundary_tags: torch.Tensor,
+    inflow_state: torch.Tensor,
+    dt: torch.Tensor,
+    gamma: float,
+    density_floor: float,
+    pressure_floor: float,
+    implementation: str,
+) -> torch.Tensor:
+    """Advance conserved variables by one Euler finite-volume step."""
+    if implementation == "torch":
+        return euler_update_torch(
+            U,
+            cell_values,
+            cell_gradients,
+            points,
+            cells,
+            cell_neighbors,
+            boundary_tags,
+            inflow_state,
+            dt,
+            gamma,
+            density_floor,
+            pressure_floor,
+        )
+    if implementation == "warp":
+        return euler_update_warp(
+            U,
+            cell_values,
+            cell_gradients,
+            points,
+            cells,
+            cell_neighbors,
+            boundary_tags,
+            inflow_state,
+            dt,
+            gamma,
+            density_floor,
+            pressure_floor,
+        )
+    raise ValueError("solver.implementation must be 'torch' or 'warp'")
diff --git a/examples/functionals/finite_volume_euler/euler_solver.py b/examples/functionals/finite_volume_euler/euler_solver.py
new file mode 100644
index 0000000000..6521f16ab4
--- /dev/null
+++ b/examples/functionals/finite_volume_euler/euler_solver.py
@@ -0,0 +1,429 @@
+# SPDX-FileCopyrightText: Copyright (c) 2023 - 2026 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Second-order unstructured finite-volume Euler example.
+
+This example intentionally keeps the gas-dynamics pieces local to the example
+while using ``physicsnemo.mesh.Mesh`` for the static simplicial mesh. The time
+integrator is a MUSCL-Hancock scheme:
+
+1. reconstruct primitive-variable gradients on cells,
+2. compute a CFL timestep from Euler wave speeds,
+3. run an Euler update that applies Barth-Jespersen limiting, predicts
+   primitive variables by a half timestep, reconstructs face states, solves a
+   normal Rusanov problem, and updates conserved variables.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import hydra
+import numpy as np
+import torch
+from omegaconf import DictConfig
+
+try:
+    import matplotlib.pyplot as plt
+except ImportError:
+    plt = None
+
+try:
+    import imageio.v2 as imageio
+except ImportError:
+    imageio = None
+
+from physicsnemo.mesh import Mesh
+from physicsnemo.nn.functional import (
+    mesh_green_gauss_gradient,
+)
+from physicsnemo.nn.functional.derivatives.mesh_green_gauss_gradient.utils import (
+    build_neighbors,
+)
+
+try:
+    from euler_finite_volume import (
+        euler_cfl_timestep,
+        euler_conservative_to_primitive,
+        euler_update,
+        primitive_to_conservative_torch,
+    )
+except ModuleNotFoundError:
+    from .euler_finite_volume import (
+        euler_cfl_timestep,
+        euler_conservative_to_primitive,
+        euler_update,
+        primitive_to_conservative_torch,
+    )
+
+INTERIOR = 0
+WALL_SLIP = 1
+INFLOW_SUPERSONIC = 2
+OUTFLOW_ZERO_GRAD = 3
+
+
+def make_step_mesh_2d(cfg, device: torch.device) -> Mesh:
+    """Structured triangular mesh with the forward-facing-step solid removed."""
+    nx, ny = int(cfg.nx), int(cfg.ny)
+    xs = torch.linspace(0.0, float(cfg.length), nx + 1, device=device)
+    ys = torch.linspace(0.0, float(cfg.height), ny + 1, device=device)
+    yy, xx = torch.meshgrid(ys, xs, indexing="ij")
+    points = torch.stack((xx.reshape(-1), yy.reshape(-1)), dim=1)
+
+    def pid(i: int, j: int) -> int:
+        return j * (nx + 1) + i
+
+    cells: list[tuple[int, int, int]] = []
+    for j in range(ny):
+        for i in range(nx):
+            cx = 0.5 * (xs[i] + xs[i + 1])
+            cy = 0.5 * (ys[j] + ys[j + 1])
+            if cx >= float(cfg.step_x) and cy <= float(cfg.step_height):
+                continue
+            p00, p10 = pid(i, j), pid(i + 1, j)
+            p01, p11 = pid(i, j + 1), pid(i + 1, j + 1)
+            cells.append((p00, p10, p11))
+            cells.append((p00, p11, p01))
+    return Mesh(
+        points=points, cells=torch.tensor(cells, dtype=torch.int64, device=device)
+    )
+
+
+def make_step_mesh_3d(cfg, device: torch.device) -> Mesh:
+    """Extruded tetrahedral forward-facing-step mesh."""
+    nx, ny, nz = int(cfg.nx), int(cfg.ny), int(cfg.nz)
+    xs = torch.linspace(0.0, float(cfg.length), nx + 1, device=device)
+    ys = torch.linspace(0.0, float(cfg.height), ny + 1, device=device)
+    zs = torch.linspace(0.0, float(cfg.depth), nz + 1, device=device)
+    zz, yy, xx = torch.meshgrid(zs, ys, xs, indexing="ij")
+    points = torch.stack((xx.reshape(-1), yy.reshape(-1), zz.reshape(-1)), dim=1)
+
+    def pid(i: int, j: int, k: int) -> int:
+        return k * (ny + 1) * (nx + 1) + j * (nx + 1) + i
+
+    cells: list[tuple[int, int, int, int]] = []
+    for k in range(nz):
+        for j in range(ny):
+            for i in range(nx):
+                cx = 0.5 * (xs[i] + xs[i + 1])
+                cy = 0.5 * (ys[j] + ys[j + 1])
+                if cx >= float(cfg.step_x) and cy <= float(cfg.step_height):
+                    continue
+                p000 = pid(i, j, k)
+                p100 = pid(i + 1, j, k)
+                p010 = pid(i, j + 1, k)
+                p110 = pid(i + 1, j + 1, k)
+                p001 = pid(i, j, k + 1)
+                p101 = pid(i + 1, j, k + 1)
+                p011 = pid(i, j + 1, k + 1)
+                p111 = pid(i + 1, j + 1, k + 1)
+                cells.extend(
+                    [
+                        (p000, p001, p011, p111),
+                        (p000, p011, p010, p111),
+                        (p000, p010, p110, p111),
+                        (p000, p110, p100, p111),
+                        (p000, p100, p101, p111),
+                        (p000, p101, p001, p111),
+                    ]
+                )
+    return Mesh(
+        points=points, cells=torch.tensor(cells, dtype=torch.int64, device=device)
+    )
+
+
+def local_face_centroids(mesh: Mesh) -> torch.Tensor:
+    """Compute centroids for each local simplex face opposite a local vertex."""
+    cell_points = mesh.points[mesh.cells.to(dtype=torch.int64)]
+    n_faces = mesh.cells.shape[1]
+    face_centroids = []
+    for face_idx in range(n_faces):
+        face_local = [idx for idx in range(n_faces) if idx != face_idx]
+        face_centroids.append(cell_points[:, face_local].mean(dim=1))
+    return torch.stack(face_centroids, dim=1)
+
+
+def tag_step_boundaries(
+    case_cfg,
+    mesh: Mesh,
+    neighbors: torch.Tensor,
+) -> torch.Tensor:
+    """Assign Euler boundary tags to local cell faces by face-centroid location."""
+    tags = torch.full_like(neighbors, INTERIOR)
+    boundary = neighbors < 0
+    x = local_face_centroids(mesh)[:, :, 0]
+    tol = 1.0e-6
+    tags[boundary] = WALL_SLIP
+    tags[boundary & (x <= tol)] = INFLOW_SUPERSONIC
+    tags[boundary & (x >= float(case_cfg.length) - tol)] = OUTFLOW_ZERO_GRAD
+    return tags
+
+
+def build_case(
+    case_cfg,
+    device: torch.device,
+) -> tuple[Mesh, torch.Tensor, torch.Tensor]:
+    """Build mesh connectivity, local cell adjacency, and boundary tags."""
+    if int(case_cfg.dimension) == 2:
+        mesh = make_step_mesh_2d(case_cfg, device)
+    elif int(case_cfg.dimension) == 3:
+        mesh = make_step_mesh_3d(case_cfg, device)
+    else:
+        raise ValueError("case.dimension must be 2 or 3")
+
+    neighbors = build_neighbors(mesh.cells).to(device=device)
+    boundary_tags = tag_step_boundaries(case_cfg, mesh, neighbors)
+    return mesh, neighbors, boundary_tags
+
+
+def initial_state(
+    solver_cfg,
+    mesh: Mesh,
+) -> torch.Tensor:
+    """Uniform supersonic inflow primitive state over all fluid cells."""
+    dims = mesh.n_spatial_dims
+    W = torch.zeros(
+        (mesh.n_cells, dims + 2),
+        device=mesh.points.device,
+        dtype=mesh.points.dtype,
+    )
+    W[:, 0] = float(solver_cfg.inflow_density)
+    W[:, 1] = (
+        float(solver_cfg.inflow_mach)
+        * (float(solver_cfg.gamma) * float(solver_cfg.inflow_pressure) / W[:, 0]) ** 0.5
+    )
+    W[:, -1] = float(solver_cfg.inflow_pressure)
+    return W
+
+
+def run_case(case_cfg, solver_cfg, output_dir: Path) -> None:
+    """Run one configured finite-volume Euler case."""
+    device = torch.device(str(solver_cfg.device))
+    mesh, cell_neighbors, boundary_tags = build_case(case_cfg, device)
+    gamma = float(solver_cfg.gamma)
+    density_floor = float(solver_cfg.density_floor)
+    pressure_floor = float(solver_cfg.pressure_floor)
+    implementation = str(solver_cfg.implementation)
+    W0_initial = initial_state(solver_cfg, mesh)
+    inflow = W0_initial[0]
+    U = primitive_to_conservative_torch(W0_initial, gamma)
+    frame_paths: list[Path] = []
+    frame_dir = output_dir / f"{case_cfg.name}_frames"
+    plot_every = int(getattr(case_cfg, "plot_every", 0))
+    time = 0.0
+    if plot_every > 0:
+        frame_dir.mkdir(parents=True, exist_ok=True)
+        W0 = euler_conservative_to_primitive(
+            U, gamma, density_floor, pressure_floor, implementation
+        )
+        path = frame_dir / "frame_00000.png"
+        plot_density(
+            case_cfg,
+            mesh,
+            W0,
+            path,
+            title=f"{case_cfg.name} step 0, t=0.000",
+        )
+        frame_paths.append(path)
+
+    for step in range(int(case_cfg.steps)):
+        W = euler_conservative_to_primitive(
+            U, gamma, density_floor, pressure_floor, implementation
+        )
+        dt = euler_cfl_timestep(
+            W,
+            mesh.points,
+            mesh.cells,
+            cell_neighbors,
+            boundary_tags,
+            inflow,
+            gamma,
+            float(solver_cfg.cfl),
+            density_floor,
+            pressure_floor,
+            implementation,
+        )
+        grad_W = mesh_green_gauss_gradient(
+            mesh.points,
+            mesh.cells,
+            cell_neighbors,
+            W,
+            implementation=implementation,
+        )
+        U = euler_update(
+            U,
+            W,
+            grad_W,
+            mesh.points,
+            mesh.cells,
+            cell_neighbors,
+            boundary_tags,
+            inflow,
+            dt,
+            gamma,
+            density_floor,
+            pressure_floor,
+            implementation,
+        )
+        time += float(dt)
+
+        if step % int(solver_cfg.log_every) == 0:
+            print(
+                f"{case_cfg.name}: step={step:04d} t={time:.4f} dt={float(dt):.4e} "
+                f"rho=[{float(W[:, 0].min()):.3f}, {float(W[:, 0].max()):.3f}]"
+            )
+        if plot_every > 0 and (step + 1) % plot_every == 0:
+            W_plot = euler_conservative_to_primitive(
+                U, gamma, density_floor, pressure_floor, implementation
+            )
+            path = frame_dir / f"frame_{step + 1:05d}.png"
+            plot_density(
+                case_cfg,
+                mesh,
+                W_plot,
+                path,
+                title=f"{case_cfg.name} step {step + 1}, t={time:.3f}",
+            )
+            frame_paths.append(path)
+
+    output_dir.mkdir(parents=True, exist_ok=True)
+    W = euler_conservative_to_primitive(
+        U, gamma, density_floor, pressure_floor, implementation
+    )
+    torch.save(
+        {
+            "points": mesh.points.detach().cpu(),
+            "cells": mesh.cells.detach().cpu(),
+            "primitive": W.detach().cpu(),
+            "time": time,
+        },
+        output_dir / f"{case_cfg.name}.pt",
+    )
+    plot_density(
+        case_cfg,
+        mesh,
+        W,
+        output_dir / f"{case_cfg.name}_density.png",
+        title=f"{case_cfg.name} final, t={time:.3f}",
+    )
+    if bool(getattr(case_cfg, "make_animation", False)) and frame_paths:
+        write_animation(frame_paths, output_dir / f"{case_cfg.name}_density.gif")
+        write_animation(frame_paths, output_dir / f"{case_cfg.name}_density.mp4")
+
+
+def plot_density(
+    case_cfg,
+    mesh: Mesh,
+    W: torch.Tensor,
+    path: Path,
+    title: str | None = None,
+) -> None:
+    """Save a compact density plot for 2D and a mid-plane slice for 3D."""
+    if plt is None:
+        print("matplotlib is not installed; skipping density plot output")
+        return
+    centroids = mesh.cell_centroids.detach().cpu()
+    density = W[:, 0].detach().cpu()
+    path.parent.mkdir(parents=True, exist_ok=True)
+    plt.figure(figsize=(9, 3.4))
+    if int(case_cfg.dimension) == 2:
+        mesh_points = mesh.points.detach().cpu()
+        coll = plt.tripcolor(
+            mesh_points[:, 0],
+            mesh_points[:, 1],
+            mesh.cells.detach().cpu()[:, :3],
+            facecolors=density,
+            shading="flat",
+            cmap="viridis",
+            vmin=float(getattr(case_cfg, "density_vmin", 0.6)),
+            vmax=float(getattr(case_cfg, "density_vmax", 6.5)),
+        )
+        plt.gca().set_aspect("equal")
+    else:
+        # The 3D mesh is an extruded tetrahedralization. Plotting raw tet
+        # centroids produces clustered dots because each hexahedral bin is
+        # split into six tetrahedra. Average the tetrahedral cell data back to
+        # the underlying x-y bins for a clean z-averaged cross-section.
+        field, x_edges, y_edges = _xy_bin_average(case_cfg, centroids, density)
+        field = np.ma.masked_invalid(field.numpy())
+        coll = plt.pcolormesh(
+            x_edges.numpy(),
+            y_edges.numpy(),
+            field,
+            shading="flat",
+            cmap="viridis",
+            vmin=float(getattr(case_cfg, "density_vmin", 0.6)),
+            vmax=float(getattr(case_cfg, "density_vmax", 6.5)),
+        )
+        plt.gca().set_aspect("equal")
+    plt.colorbar(coll, label="density")
+    if title is not None:
+        plt.title(title)
+    plt.xlabel("x")
+    plt.ylabel("y")
+    plt.tight_layout()
+    plt.savefig(path, dpi=180)
+    plt.close()
+
+
+def _xy_bin_average(case_cfg, centroids: torch.Tensor, values: torch.Tensor):
+    """Average 3D tetrahedral cell data into x-y bins for cross-section plots."""
+    nx = int(case_cfg.nx)
+    ny = int(case_cfg.ny)
+    length = float(case_cfg.length)
+    height = float(case_cfg.height)
+
+    x_edges = torch.linspace(0.0, length, nx + 1)
+    y_edges = torch.linspace(0.0, height, ny + 1)
+    x = centroids[:, 0].clamp(0.0, length)
+    y = centroids[:, 1].clamp(0.0, height)
+    ix = torch.clamp((x / length * nx).floor().to(torch.int64), 0, nx - 1)
+    iy = torch.clamp((y / height * ny).floor().to(torch.int64), 0, ny - 1)
+
+    flat = iy * nx + ix
+    accum = torch.zeros((ny * nx,), dtype=values.dtype)
+    counts = torch.zeros((ny * nx,), dtype=values.dtype)
+    accum.index_add_(0, flat, values)
+    counts.index_add_(0, flat, torch.ones_like(values))
+    field = accum / counts.clamp_min(1.0)
+    field[counts == 0] = float("nan")
+    return field.reshape(ny, nx), x_edges, y_edges
+
+
+def write_animation(frame_paths: list[Path], output_path: Path) -> None:
+    """Write an animation from saved PNG frames when imageio is installed."""
+    if imageio is None:
+        print("imageio is not installed; skipping animation output")
+        return
+    images = [imageio.imread(path) for path in frame_paths if path.exists()]
+    if images:
+        if output_path.suffix == ".mp4":
+            imageio.mimsave(output_path, images, fps=8)
+        else:
+            imageio.mimsave(output_path, images, duration=0.12)
+        print(f"wrote animation: {output_path}")
+
+
+@hydra.main(version_base="1.3", config_path="conf", config_name="config")
+def main(cfg: DictConfig) -> None:
+    """Run all finite-volume Euler cases from the Hydra config."""
+    output_dir = Path("outputs")
+    for case_cfg in cfg.cases:
+        run_case(case_cfg, cfg.solver, output_dir)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/functionals/finite_volume_euler/requirements.txt b/examples/functionals/finite_volume_euler/requirements.txt
new file mode 100644
index 0000000000..896a4e36ff
--- /dev/null
+++ b/examples/functionals/finite_volume_euler/requirements.txt
@@ -0,0 +1,6 @@
+hydra-core>=1.3.0
+imageio
+imageio-ffmpeg
+matplotlib
+pillow
+torch
diff --git a/test/nn/functional/finite_volume/test_euler_example.py b/test/nn/functional/finite_volume/test_euler_example.py
new file mode 100644
index 0000000000..662c0288e6
--- /dev/null
+++ b/test/nn/functional/finite_volume/test_euler_example.py
@@ -0,0 +1,146 @@
+# SPDX-FileCopyrightText: Copyright (c) 2023 - 2026 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+import pytest
+import torch
+from omegaconf import OmegaConf
+
+from physicsnemo.nn.functional import mesh_green_gauss_gradient
+
+EXAMPLE_DIR = (
+    Path(__file__).resolve().parents[4]
+    / "examples"
+    / "functionals"
+    / "finite_volume_euler"
+)
+sys.path.insert(0, str(EXAMPLE_DIR))
+
+from euler_finite_volume import (  # noqa: E402
+    euler_cfl_timestep,
+    euler_conservative_to_primitive,
+    euler_update,
+    primitive_to_conservative_torch,
+)
+from euler_solver import build_case, initial_state  # noqa: E402
+
+
+def _solver_cfg():
+    return OmegaConf.create(
+        {
+            "device": "cpu",
+            "gamma": 1.4,
+            "density_floor": 1.0e-6,
+            "pressure_floor": 1.0e-6,
+            "inflow_density": 1.4,
+            "inflow_pressure": 1.0,
+            "inflow_mach": 3.0,
+            "cfl": 0.25,
+        }
+    )
+
+
+@pytest.mark.parametrize(
+    "case_cfg",
+    [
+        OmegaConf.create(
+            {
+                "name": "tiny2d",
+                "dimension": 2,
+                "length": 3.0,
+                "height": 1.0,
+                "step_x": 0.6,
+                "step_height": 0.2,
+                "nx": 10,
+                "ny": 5,
+            }
+        ),
+        OmegaConf.create(
+            {
+                "name": "tiny3d",
+                "dimension": 3,
+                "length": 3.0,
+                "height": 1.0,
+                "depth": 0.4,
+                "step_x": 0.6,
+                "step_height": 0.2,
+                "nx": 5,
+                "ny": 5,
+                "nz": 2,
+            }
+        ),
+    ],
+)
+def test_euler_example_warp_matches_torch_one_step(case_cfg):
+    pytest.importorskip("warp")
+    solver_cfg = _solver_cfg()
+    mesh, cell_neighbors, boundary_tags = build_case(case_cfg, torch.device("cpu"))
+    gamma = float(solver_cfg.gamma)
+    density_floor = float(solver_cfg.density_floor)
+    pressure_floor = float(solver_cfg.pressure_floor)
+    U = primitive_to_conservative_torch(initial_state(solver_cfg, mesh), gamma)
+
+    outputs = []
+    dts = []
+    for implementation in ("torch", "warp"):
+        W = euler_conservative_to_primitive(
+            U, gamma, density_floor, pressure_floor, implementation
+        )
+        dt = euler_cfl_timestep(
+            W,
+            mesh.points,
+            mesh.cells,
+            cell_neighbors,
+            boundary_tags,
+            W[0],
+            gamma,
+            float(solver_cfg.cfl),
+            density_floor,
+            pressure_floor,
+            implementation,
+        )
+        grad_W = mesh_green_gauss_gradient(
+            mesh.points,
+            mesh.cells,
+            cell_neighbors,
+            W,
+            implementation=implementation,
+        )
+        outputs.append(
+            euler_update(
+                U,
+                W,
+                grad_W,
+                mesh.points,
+                mesh.cells,
+                cell_neighbors,
+                boundary_tags,
+                W[0],
+                dt,
+                gamma,
+                density_floor,
+                pressure_floor,
+                implementation,
+            )
+        )
+        dts.append(dt)
+
+    torch.testing.assert_close(dts[1], dts[0], rtol=0.0, atol=1.0e-7)
+    torch.testing.assert_close(outputs[1], outputs[0], rtol=1.0e-5, atol=1.0e-5)