diff --git a/examples/additive_manufacturing/sintering_physics/inference.py b/examples/additive_manufacturing/sintering_physics/inference.py index 2a870aa631..fbb5a3df48 100644 --- a/examples/additive_manufacturing/sintering_physics/inference.py +++ b/examples/additive_manufacturing/sintering_physics/inference.py @@ -32,15 +32,6 @@ "Mesh Graph Net Datapipe requires the Tensorflow library. Install the " + "package at: https://www.tensorflow.org/install" ) -physical_devices = tf.config.list_physical_devices("GPU") - -try: - for device_ in physical_devices: - tf.config.experimental.set_memory_growth(device_, True) -except: - # Invalid device or cannot modify virtual devices once initialized. - pass - import hydra import torch from graph_dataset import GraphDataset diff --git a/examples/additive_manufacturing/sintering_physics/requirements.txt b/examples/additive_manufacturing/sintering_physics/requirements.txt index 667a0024d5..58c48332e6 100644 --- a/examples/additive_manufacturing/sintering_physics/requirements.txt +++ b/examples/additive_manufacturing/sintering_physics/requirements.txt @@ -1,3 +1,6 @@ # pyvista is optional, required if need to run data preprocessing from raw simulation # pyvista==0.32.1 -tensorflow>=2.15,<3.0 # generate tfrecord +# CPU-only TF avoids the bundled CUDA 12 runtime that conflicts with the +# PhysicsNeMo container's CUDA 13 (PyTorch raises cudaErrorStubLibrary 302 +# and the process aborts). Only used here as a TFRecord parser. 
+tensorflow-cpu>=2.15,<3.0 diff --git a/examples/additive_manufacturing/sintering_physics/train.py b/examples/additive_manufacturing/sintering_physics/train.py index 4062337f6e..dded633280 100644 --- a/examples/additive_manufacturing/sintering_physics/train.py +++ b/examples/additive_manufacturing/sintering_physics/train.py @@ -58,15 +58,6 @@ ) from physicsnemo.models.vfgn.graph_network_modules import VFGNLearnedSimulator -physical_devices = tf.config.list_physical_devices("GPU") -try: - for device_ in physical_devices: - tf.config.experimental.set_memory_growth(device_, True) -except: - # Invalid device or cannot modify virtual devices once initialized. - pass - - def Train(rank_zero_logger, dist, cfg: DictConfig): """ Trains a graph-based model, evaluating and saving its performance periodically. diff --git a/examples/cfd/external_aerodynamics/xaeronet/README.md b/examples/cfd/external_aerodynamics/xaeronet/README.md index 9f3dd5d54c..30f67e345b 100644 --- a/examples/cfd/external_aerodynamics/xaeronet/README.md +++ b/examples/cfd/external_aerodynamics/xaeronet/README.md @@ -75,13 +75,29 @@ dataset, please refer to their [paper](https://arxiv.org/pdf/2408.11969). ## XAeroNet-S prerequisites -Install the requirements using: +Install the base requirements: ```bash pip install -r requirements.txt -pip install pyg-lib -f https://data.pyg.org/whl/torch-2.8.0+cu129.html ``` +`pyg-lib` and `torch_scatter` ship as compiled CUDA extensions and must be +installed from PyG's pre-built wheel index that matches your installed +`torch` and CUDA versions. 
The snippet below detects both and +constructs the correct URL: + +```bash +TORCH=$(python -c "import torch; print(torch.__version__.split('+')[0])") +CUDA=$(python -c "import torch; v=torch.version.cuda; print('cu' + v.replace('.', '') if v else 'cpu')") +pip install pyg-lib torch_scatter -f https://data.pyg.org/whl/torch-${TORCH}+${CUDA}.html +``` + +If PyG has not published a wheel for your exact torch+CUDA combination +yet, browse <https://data.pyg.org/whl/> to find the closest match, or +build from source with `pip install --no-build-isolation torch_scatter` +(plain `pip install torch_scatter` fails because pip's build isolation +hides the installed `torch` from the build environment). + See `pyg-lib` [installation instructions](https://github.com/pyg-team/pyg-lib?tab=readme-ov-file#installation) for more details. diff --git a/examples/cfd/external_aerodynamics/xaeronet/requirements.txt b/examples/cfd/external_aerodynamics/xaeronet/requirements.txt index 77b1dfda20..3dd86d60c1 100644 --- a/examples/cfd/external_aerodynamics/xaeronet/requirements.txt +++ b/examples/cfd/external_aerodynamics/xaeronet/requirements.txt @@ -1,6 +1,8 @@ trimesh>=4.5.0 torch_geometric>=2.6.1 -torch_scatter>=2.1.2 pyvista vtk wandb +scikit-learn +tabulate +matplotlib diff --git a/physicsnemo/datapipes/gnn/ahmed_body_dataset.py b/physicsnemo/datapipes/gnn/ahmed_body_dataset.py index f30debda1c..6cbfe17c26 100644 --- a/physicsnemo/datapipes/gnn/ahmed_body_dataset.py +++ b/physicsnemo/datapipes/gnn/ahmed_body_dataset.py @@ -47,6 +47,14 @@ logger = logging.getLogger(__name__) +def _init_pool_worker(): + # Use file-system-backed shared memory for tensors returned from this + # pool. The default file-descriptor strategy passes one FD per tensor + # via SCM_RIGHTS and trips "RuntimeError: received 0 items of ancdata" + # when RLIMIT_NOFILE is exhausted (e.g. Ubuntu's default 1024). 
+ torch.multiprocessing.set_sharing_strategy("file_system") + + @dataclass class FileInfo: """VTP file info storage.""" @@ -204,6 +212,7 @@ def get_num_workers(): with cf.ProcessPoolExecutor( max_workers=num_workers, mp_context=torch.multiprocessing.get_context("spawn"), + initializer=_init_pool_worker, ) as executor: for i, graph, coeff, normal, area in executor.map( self.create_graph,