Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions examples/cfd/external_aerodynamics/globe/airfrans/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,13 @@ CPUS_PER_NODE=${SLURM_CPUS_ON_NODE:-$(nproc)}
export OMP_NUM_THREADS=1
echo "OMP_NUM_THREADS=$OMP_NUM_THREADS (process-level parallelism via DataLoader workers; ${CPUS_PER_NODE} CPUs / ${NUM_GPUS_PER_NODE} GPUs)"

### [CUDA Allocator]
# expandable_segments: lets the allocator grow existing segments instead of
# performing the synchronizing cudaMalloc/cudaFree round-trips that the default
# segment allocator incurs when chunked kernel evaluations stress the cache.
export PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True"
echo "PYTORCH_CUDA_ALLOC_CONF=$PYTORCH_CUDA_ALLOC_CONF"

### [Sync Dependencies]
# Select the right CUDA extra based on the detected driver version,
# then install both the project deps and example-specific requirements.
Expand Down
15 changes: 6 additions & 9 deletions examples/cfd/external_aerodynamics/globe/airfrans/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ def main(
n_spherical_harmonics: int = 1,
theta: float = 0.0,
leaf_size: int = 1,
tree_build_device: Literal["cpu", "cuda"] | None = None,
airfrans_task: Literal["full", "scarce", "reynolds", "aoa"] = "full",
patience_steps: int = 1600,
use_profiler: bool = True,
Expand Down Expand Up @@ -140,6 +141,8 @@ def main(
n_spherical_harmonics: Number of Legendre polynomial terms for angle features.
theta: Barnes-Hut opening angle. Larger = more aggressive approximation.
leaf_size: Maximum sources per leaf node in the Barnes-Hut tree.
tree_build_device: Device on which to build cluster trees and run the
dual-tree Barnes-Hut traversal. ``None`` (default) uses the input's device.
airfrans_task: Which AirFRANS dataset task to train on.
patience_steps: ReduceLROnPlateau patience expressed in gradient
steps (world-size independent). Converted to epochs internally.
Expand Down Expand Up @@ -270,6 +273,7 @@ def main(
self_regularization_beta=self_regularization_beta,
latent_compression_scale=latent_compression_scale,
expand_far_targets=expand_far_targets,
tree_build_device=tree_build_device,
).to(device)

logger0.info(f"{output_dir.name=!r}")
Expand Down Expand Up @@ -347,16 +351,13 @@ def main(
min_lr=learning_rate / 64,
threshold=1e-3,
)
scaler = torch.amp.GradScaler(device=device.type, enabled=amp)

Comment thread
peterdsharpe marked this conversation as resolved.
### [Checkpoint Save/Load]
metadata_dict: dict[str, Any] = {}
epoch = load_checkpoint(
checkpoint_dir,
models=base_model,
optimizer=optimizer,
scheduler=scheduler,
scaler=scaler,
metadata_dict=metadata_dict,
device=dist.device,
)
Expand Down Expand Up @@ -430,7 +431,6 @@ def main(
**config_settings,
"optimizer": optimizer.__class__.__name__,
"scheduler": scheduler.__class__.__name__,
"scaler": scaler.__class__.__name__,
"physicsnemo_pkg_info": get_physicsnemo_pkg_info(),
"world_size": dist.world_size,
**{f"n_{split}_samples": len(sample_paths[split]) for split in splits},
Expand Down Expand Up @@ -526,15 +526,13 @@ def prepare_sample(sample: AirFRANSSample) -> AirFRANSSample:
if torch.isnan(batch_loss):
warnings.warn(f"{batch_loss=} at: {dist.rank=}, {epoch=}")
with record_function("backward"):
scaler.scale(batch_loss).backward()
batch_loss.backward()
if gradient_clip_norm is not None:
scaler.unscale_(optimizer)
torch.nn.utils.clip_grad_norm_(
model.parameters(), max_norm=gradient_clip_norm
)
with record_function("optimizer_step"):
scaler.step(optimizer)
scaler.update()
optimizer.step()
all_batch_losses.append(batch_loss.detach().clone())
for k, v in batch_loss_components.items():
all_batch_loss_components[k].append(v.detach().clone())
Expand Down Expand Up @@ -614,7 +612,6 @@ def save_ckpt() -> None:
models=base_model,
optimizer=optimizer,
scheduler=scheduler,
scaler=scaler,
epoch=epoch,
metadata=checkpoint_metadata(),
)
Expand Down
7 changes: 7 additions & 0 deletions examples/cfd/external_aerodynamics/globe/drivaer/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,13 @@ CPUS_PER_NODE=${SLURM_CPUS_ON_NODE:-$(nproc)}
export OMP_NUM_THREADS=1
echo "OMP_NUM_THREADS=$OMP_NUM_THREADS (process-level parallelism via DataLoader workers; ${CPUS_PER_NODE} CPUs / ${NUM_GPUS_PER_NODE} GPUs)"

### [CUDA Allocator]
# expandable_segments: lets the allocator grow existing segments instead of
# performing the synchronizing cudaMalloc/cudaFree round-trips that the default
# segment allocator incurs when chunked kernel evaluations stress the cache.
export PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True"
echo "PYTORCH_CUDA_ALLOC_CONF=$PYTORCH_CUDA_ALLOC_CONF"

### [Sync Dependencies]
# Select the right CUDA extra based on the detected driver version,
# then install both the project deps and example-specific requirements.
Expand Down
15 changes: 6 additions & 9 deletions examples/cfd/external_aerodynamics/globe/drivaer/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ def main(
n_spherical_harmonics: int = 4,
theta: float = 1.0,
leaf_size: int = 1,
tree_build_device: Literal["cpu", "cuda"] | None = None,
n_faces_per_boundary: int = 80_000,
patience_steps: int = 1600,
use_profiler: bool = True,
Expand Down Expand Up @@ -152,6 +153,8 @@ def main(
theta: Barnes-Hut opening angle. Larger values are more
aggressive (more approximation, faster). 0 = exact.
leaf_size: Maximum sources per leaf node in the Barnes-Hut tree.
tree_build_device: Device on which to build cluster trees and run the
dual-tree Barnes-Hut traversal. ``None`` (default) uses the input's device.
n_faces_per_boundary: Target boundary mesh face count after decimation.
patience_steps: ReduceLROnPlateau patience expressed in gradient
steps (world-size independent). Converted to epochs internally.
Expand Down Expand Up @@ -281,6 +284,7 @@ def main(
self_regularization_beta=self_regularization_beta,
latent_compression_scale=latent_compression_scale,
expand_far_targets=expand_far_targets,
tree_build_device=tree_build_device,
).to(device)

logger0.info(f"{output_dir.name=!r}")
Expand Down Expand Up @@ -358,16 +362,13 @@ def main(
min_lr=learning_rate / 64,
threshold=1e-3,
)
scaler = torch.amp.GradScaler(device=device.type, enabled=amp)

### [Checkpoint Save/Load]
metadata_dict: dict[str, Any] = {}
epoch = load_checkpoint(
checkpoint_dir,
models=base_model,
optimizer=optimizer,
scheduler=scheduler,
scaler=scaler,
metadata_dict=metadata_dict,
device=dist.device,
)
Expand Down Expand Up @@ -438,7 +439,6 @@ def main(
**config_settings,
"optimizer": optimizer.__class__.__name__,
"scheduler": scheduler.__class__.__name__,
"scaler": scaler.__class__.__name__,
"physicsnemo_pkg_info": get_physicsnemo_pkg_info(),
"world_size": dist.world_size,
**{f"n_{split}_samples": len(sample_paths[split]) for split in splits},
Expand Down Expand Up @@ -505,15 +505,13 @@ def run_epoch(split: Split) -> tuple[torch.Tensor, dict[str, torch.Tensor]]:
if torch.isnan(batch_loss):
warnings.warn(f"{batch_loss=} at: {dist.rank=}, {epoch=}")
with record_function("backward"):
scaler.scale(batch_loss).backward()
batch_loss.backward()
if gradient_clip_norm is not None:
scaler.unscale_(optimizer)
torch.nn.utils.clip_grad_norm_(
model.parameters(), max_norm=gradient_clip_norm
)
with record_function("optimizer_step"):
scaler.step(optimizer)
scaler.update()
optimizer.step()

all_batch_losses.append(batch_loss.detach().clone())
for k, v in batch_loss_components.items():
Expand Down Expand Up @@ -594,7 +592,6 @@ def save_ckpt() -> None:
models=base_model,
optimizer=optimizer,
scheduler=scheduler,
scaler=scaler,
epoch=epoch,
metadata=checkpoint_metadata(),
)
Expand Down
Loading
Loading