
Commit 203a999

Authored by KumoLiu, pre-commit-ci[bot], and dongyang0122
Fix ValueError in maisi_train_controlnet_tutorial.ipynb (#1839)
Fixes #1838

### Description
- Update to avoid the deprecation warnings that `torch.cuda.amp` `GradScaler` and `autocast` now raise; use the `torch.amp` equivalents instead.
- Update to remove the unclosed-file warnings from `json.load(open(...))` calls.
- Update the dimension handling to avoid the ValueError in the ControlNet tutorial.
- Add a multi-GPU check so single-GPU runs do not emit `torch.distributed` warnings.

(The recurring `torch.amp` migration is sketched right after the file summary below.)

### Checks
- [x] Avoid including large-size files in the PR.
- [x] Clean up long text outputs from code cells in the notebook.
- [x] For security purposes, please check the contents and remove any sensitive info such as user names and private keys.
- [x] Ensure (1) hyperlinks and markdown anchors are working, (2) relative paths are used for tutorial repo files, and (3) figures and graphs are placed in the `./figure` folder.
- [ ] Notebook runs automatically `./runner.sh -t <path to .ipynb file>`

---------

Signed-off-by: YunLiu <55491388+KumoLiu@users.noreply.github.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Dong Yang <don.yang.mech@gmail.com>
1 parent: 00484e0 · commit: 203a999

11 files changed (+264, -223 lines)
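Before the per-file diffs, here is the recurring pattern: the commit swaps the deprecated `torch.cuda.amp` entry points for the device-aware `torch.amp` ones. The snippet below is a minimal, self-contained sketch of that migration, assuming a CUDA device is available; the toy model, shapes, and loss are illustrative only and not taken from the tutorials.

```python
import torch
from torch.amp import GradScaler, autocast  # was: from torch.cuda.amp import GradScaler, autocast

device = torch.device("cuda")
model = torch.nn.Linear(8, 8).to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
scaler = GradScaler("cuda")  # was: GradScaler()

x = torch.randn(4, 8, device=device)
optimizer.zero_grad(set_to_none=True)
with autocast("cuda", enabled=True):  # was: autocast(enabled=True)
    loss = model(x).square().mean()
scaler.scale(loss).backward()  # scale the loss before backward for mixed-precision stability
scaler.step(optimizer)
scaler.update()
```

The same two substitutions appear in `diff_model_train.py`, `diff_model_create_training_data.py`, `diff_model_infer.py`, and `sample.py` below.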

generation/maisi/maisi_diff_unet_training_tutorial.ipynb

Lines changed: 61 additions & 114 deletions
Large diffs are not rendered by default.

generation/maisi/maisi_inference_tutorial.ipynb

Lines changed: 5 additions & 5 deletions
@@ -364,25 +364,25 @@
 "device = torch.device(\"cuda\")\n",
 "\n",
 "autoencoder = define_instance(args, \"autoencoder_def\").to(device)\n",
-"checkpoint_autoencoder = torch.load(args.trained_autoencoder_path)\n",
+"checkpoint_autoencoder = torch.load(args.trained_autoencoder_path, weights_only=True)\n",
 "autoencoder.load_state_dict(checkpoint_autoencoder)\n",
 "\n",
 "diffusion_unet = define_instance(args, \"diffusion_unet_def\").to(device)\n",
-"checkpoint_diffusion_unet = torch.load(args.trained_diffusion_path)\n",
+"checkpoint_diffusion_unet = torch.load(args.trained_diffusion_path, weights_only=False)\n",
 "diffusion_unet.load_state_dict(checkpoint_diffusion_unet[\"unet_state_dict\"], strict=True)\n",
 "scale_factor = checkpoint_diffusion_unet[\"scale_factor\"].to(device)\n",
 "\n",
 "controlnet = define_instance(args, \"controlnet_def\").to(device)\n",
-"checkpoint_controlnet = torch.load(args.trained_controlnet_path)\n",
+"checkpoint_controlnet = torch.load(args.trained_controlnet_path, weights_only=False)\n",
 "monai.networks.utils.copy_model_state(controlnet, diffusion_unet.state_dict())\n",
 "controlnet.load_state_dict(checkpoint_controlnet[\"controlnet_state_dict\"], strict=True)\n",
 "\n",
 "mask_generation_autoencoder = define_instance(args, \"mask_generation_autoencoder_def\").to(device)\n",
-"checkpoint_mask_generation_autoencoder = torch.load(args.trained_mask_generation_autoencoder_path)\n",
+"checkpoint_mask_generation_autoencoder = torch.load(args.trained_mask_generation_autoencoder_path, weights_only=True)\n",
 "mask_generation_autoencoder.load_state_dict(checkpoint_mask_generation_autoencoder)\n",
 "\n",
 "mask_generation_diffusion_unet = define_instance(args, \"mask_generation_diffusion_def\").to(device)\n",
-"checkpoint_mask_generation_diffusion_unet = torch.load(args.trained_mask_generation_diffusion_path)\n",
+"checkpoint_mask_generation_diffusion_unet = torch.load(args.trained_mask_generation_diffusion_path, weights_only=True)\n",
 "mask_generation_diffusion_unet.load_state_dict(checkpoint_mask_generation_diffusion_unet[\"unet_state_dict\"])\n",
 "mask_generation_scale_factor = checkpoint_mask_generation_diffusion_unet[\"scale_factor\"]\n",
 "\n",

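A brief sketch of the `torch.load` pattern introduced above: plain state dicts are loaded with `weights_only=True`, which restricts unpickling to tensors and simple containers, while checkpoints that bundle additional objects alongside the weights keep `weights_only=False` and should only be loaded from trusted files. The paths below are placeholders, not the tutorial's actual arguments; the dictionary keys match those in the diff.

```python
import torch

# state-dict-only file: safe to load with the restricted unpickler
autoencoder_state = torch.load("checkpoints/autoencoder.pt", weights_only=True)

# checkpoint dict carrying extra entries alongside the weights
diffusion_ckpt = torch.load("checkpoints/diff_unet.pt", weights_only=False)
unet_state = diffusion_ckpt["unet_state_dict"]
scale_factor = diffusion_ckpt["scale_factor"]
```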
generation/maisi/maisi_train_controlnet_tutorial.ipynb

Lines changed: 110 additions & 52 deletions
Large diffs are not rendered by default.

generation/maisi/maisi_train_vae_tutorial.ipynb

Lines changed: 33 additions & 25 deletions
Large diffs are not rendered by default.

generation/maisi/scripts/diff_model_create_training_data.py

Lines changed: 6 additions & 2 deletions
@@ -20,6 +20,7 @@
 import nibabel as nib
 import numpy as np
 import torch
+import torch.distributed as dist

 import monai
 from monai.transforms import Compose
@@ -146,7 +147,7 @@ def process_file(
     out_path.parent.mkdir(parents=True, exist_ok=True)
     logger.info(f"out_filename: {out_filename}")

-    with torch.cuda.amp.autocast():
+    with torch.amp.autocast("cuda"):
         pt_nda = torch.from_numpy(nda_image).float().to(device).unsqueeze(0).unsqueeze(0)
         z = autoencoder.encode_stage_2_inputs(pt_nda)
         logger.info(f"z: {z.size()}, {z.dtype}")
@@ -175,7 +176,7 @@ def diff_model_create_training_data(env_config_path: str, model_config_path: str

     autoencoder = define_instance(args, "autoencoder_def").to(device)
     try:
-        checkpoint_autoencoder = torch.load(args.trained_autoencoder_path)
+        checkpoint_autoencoder = torch.load(args.trained_autoencoder_path, weights_only=True)
         autoencoder.load_state_dict(checkpoint_autoencoder)
     except Exception:
         logger.error("The trained_autoencoder_path does not exist!")
@@ -202,6 +203,9 @@ def diff_model_create_training_data(env_config_path: str, model_config_path: str

     process_file(filepath, args, autoencoder, device, plain_transforms, new_transforms, logger)

+    if dist.is_initialized():
+        dist.destroy_process_group()
+

 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Diffusion Model Training Data Creation")
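The addition above guards the process-group teardown so that single-GPU runs do not hit `torch.distributed` warnings. A minimal sketch of the guarded setup/teardown pair follows; the helper names are illustrative and not from the script, and the setup-side multi-GPU check mirrors the one added to `diff_model_setting.py` further below.

```python
import torch
import torch.distributed as dist

def maybe_init_distributed() -> None:
    # only spin up NCCL when more than one GPU is visible
    if torch.cuda.is_available() and torch.cuda.device_count() > 1:
        dist.init_process_group(backend="nccl", init_method="env://")

def maybe_cleanup_distributed() -> None:
    # guard the teardown so single-process runs do not warn or raise
    if dist.is_initialized():
        dist.destroy_process_group()
```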

generation/maisi/scripts/diff_model_infer.py

Lines changed: 9 additions & 4 deletions
@@ -20,6 +20,7 @@
 import nibabel as nib
 import numpy as np
 import torch
+import torch.distributed as dist
 from tqdm import tqdm

 from monai.inferers import sliding_window_inference
@@ -59,13 +60,13 @@ def load_models(args: argparse.Namespace, device: torch.device, logger: logging.
     """
     autoencoder = define_instance(args, "autoencoder_def").to(device)
     try:
-        checkpoint_autoencoder = torch.load(args.trained_autoencoder_path)
+        checkpoint_autoencoder = torch.load(args.trained_autoencoder_path, weights_only=True)
         autoencoder.load_state_dict(checkpoint_autoencoder)
     except Exception:
         logger.error("The trained_autoencoder_path does not exist!")

     unet = define_instance(args, "diffusion_unet_def").to(device)
-    checkpoint = torch.load(f"{args.model_dir}/{args.model_filename}", map_location=device)
+    checkpoint = torch.load(f"{args.model_dir}/{args.model_filename}", map_location=device, weights_only=False)
     unet.load_state_dict(checkpoint["unet_state_dict"], strict=True)
     logger.info(f"checkpoints {args.model_dir}/{args.model_filename} loaded.")

@@ -149,7 +150,7 @@ def run_inference(
     autoencoder.eval()
     unet.eval()

-    with torch.cuda.amp.autocast(enabled=True):
+    with torch.amp.autocast("cuda", enabled=True):
         for t in tqdm(noise_scheduler.timesteps, ncols=110):
             model_output = unet(
                 x=image,
@@ -271,7 +272,7 @@ def diff_model_infer(env_config_path: str, model_config_path: str, model_def_pat
     )

     timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
-    output_path = "{0}/{1}_seed{2}_size{3:d}x{4:d}x{5:d}_spacing{6:.2f}x{7:.2f}x{8:.2f}_{9}.nii.gz".format(
+    output_path = "{0}/{1}_seed{2}_size{3:d}x{4:d}x{5:d}_spacing{6:.2f}x{7:.2f}x{8:.2f}_{9}_rank{10}.nii.gz".format(
         args.output_dir,
         output_prefix,
         random_seed,
@@ -282,9 +283,13 @@
         out_spacing[1],
         out_spacing[2],
         timestamp,
+        local_rank,
     )
     save_image(data, output_size, out_spacing, output_path, logger)

+    if dist.is_initialized():
+        dist.destroy_process_group()
+

 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Diffusion Model Inference")
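Besides the same import and teardown guard, this diff appends the process rank to the inference output filename so concurrent ranks do not overwrite each other's results. A small illustration with placeholder values (not the tutorial's real configuration):

```python
from datetime import datetime

output_dir, output_prefix, random_seed, local_rank = "./outputs", "sample", 0, 0
output_size, out_spacing = (256, 256, 128), (1.0, 1.0, 1.5)

timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
output_path = "{0}/{1}_seed{2}_size{3:d}x{4:d}x{5:d}_spacing{6:.2f}x{7:.2f}x{8:.2f}_{9}_rank{10}.nii.gz".format(
    output_dir, output_prefix, random_seed, *output_size, *out_spacing, timestamp, local_rank
)
print(output_path)  # e.g. ./outputs/sample_seed0_size256x256x128_spacing1.00x1.00x1.50_<timestamp>_rank0.nii.gz
```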

generation/maisi/scripts/diff_model_setting.py

Lines changed: 9 additions & 4 deletions
@@ -32,7 +32,8 @@ def setup_logging(logger_name: str = "") -> logging.Logger:
         logging.Logger: Configured logger.
     """
     logger = logging.getLogger(logger_name)
-    logger.addFilter(RankFilter())
+    if dist.is_initialized():
+        logger.addFilter(RankFilter())
     logging.basicConfig(
         level=logging.INFO,
         format="[%(asctime)s.%(msecs)03d][%(levelname)5s](%(name)s) - %(message)s",
@@ -80,9 +81,13 @@ def initialize_distributed() -> tuple:
     Returns:
         tuple: local_rank, world_size, and device.
     """
-    dist.init_process_group(backend="nccl", init_method="env://")
-    local_rank = dist.get_rank()
-    world_size = dist.get_world_size()
+    if torch.cuda.is_available() and torch.cuda.device_count() > 1:
+        dist.init_process_group(backend="nccl", init_method="env://")
+        local_rank = dist.get_rank()
+        world_size = dist.get_world_size()
+    else:
+        local_rank = 0
+        world_size = 1
     device = torch.device("cuda", local_rank)
     torch.cuda.set_device(device)
     return local_rank, world_size, device
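Restating the effect of the change above: multi-GPU launches (for example via `torchrun --nproc_per_node=<N> ...`) still initialize NCCL, while a plain single-GPU `python ...` invocation now skips `init_process_group` entirely, and the rank-aware log filter is only attached when a process group actually exists. A compact sketch under those assumptions; the `RankFilter` import is assumed to match the script's own.

```python
import logging
import torch
import torch.distributed as dist
from monai.utils import RankFilter  # assumed to match the script's existing import

def setup_logging_sketch(name: str = "maisi") -> logging.Logger:
    logger = logging.getLogger(name)
    if dist.is_initialized():
        logger.addFilter(RankFilter())  # de-duplicate logs across ranks only in distributed runs
    return logger

def initialize_distributed_sketch() -> tuple:
    if torch.cuda.is_available() and torch.cuda.device_count() > 1:
        dist.init_process_group(backend="nccl", init_method="env://")
        local_rank, world_size = dist.get_rank(), dist.get_world_size()
    else:
        local_rank, world_size = 0, 1  # single-GPU fallback, no process group
    device = torch.device("cuda", local_rank)
    torch.cuda.set_device(device)
    return local_rank, world_size, device
```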

generation/maisi/scripts/diff_model_train.py

Lines changed: 14 additions & 6 deletions
@@ -20,7 +20,7 @@

 import torch
 import torch.distributed as dist
-from torch.cuda.amp import GradScaler, autocast
+from torch.amp import GradScaler, autocast
 from torch.nn.parallel import DistributedDataParallel

 import monai
@@ -64,17 +64,22 @@ def prepare_data(
     Returns:
         ThreadDataLoader: Data loader for training.
     """
+
+    def _load_data_from_file(file_path, key):
+        with open(file_path) as f:
+            return torch.FloatTensor(json.load(f)[key])
+
     train_transforms = Compose(
         [
             monai.transforms.LoadImaged(keys=["image"]),
             monai.transforms.EnsureChannelFirstd(keys=["image"]),
             monai.transforms.Lambdad(
-                keys="top_region_index", func=lambda x: torch.FloatTensor(json.load(open(x))["top_region_index"])
+                keys="top_region_index", func=lambda x: _load_data_from_file(x, "top_region_index")
             ),
             monai.transforms.Lambdad(
-                keys="bottom_region_index", func=lambda x: torch.FloatTensor(json.load(open(x))["bottom_region_index"])
+                keys="bottom_region_index", func=lambda x: _load_data_from_file(x, "bottom_region_index")
             ),
-            monai.transforms.Lambdad(keys="spacing", func=lambda x: torch.FloatTensor(json.load(open(x))["spacing"])),
+            monai.transforms.Lambdad(keys="spacing", func=lambda x: _load_data_from_file(x, "spacing")),
             monai.transforms.Lambdad(keys="top_region_index", func=lambda x: x * 1e2),
             monai.transforms.Lambdad(keys="bottom_region_index", func=lambda x: x * 1e2),
             monai.transforms.Lambdad(keys="spacing", func=lambda x: x * 1e2),
@@ -231,7 +236,7 @@ def train_one_epoch(

         optimizer.zero_grad(set_to_none=True)

-        with autocast(enabled=True):
+        with autocast("cuda", enabled=True):
             noise = torch.randn(
                 (num_images_per_batch, 4, images.size(-3), images.size(-2), images.size(-1)), device=device
             )
@@ -365,7 +370,7 @@ def diff_model_train(env_config_path: str, model_config_path: str, model_def_pat
     ]
     lr_scheduler = create_lr_scheduler(optimizer, total_steps)
     loss_pt = torch.nn.L1Loss()
-    scaler = GradScaler()
+    scaler = GradScaler("cuda")

     torch.set_float32_matmul_precision("highest")
     logger.info("torch.set_float32_matmul_precision -> highest.")
@@ -403,6 +408,9 @@ def diff_model_train(env_config_path: str, model_config_path: str, model_def_pat
             args,
         )

+    if dist.is_initialized():
+        dist.destroy_process_group()
+

 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Diffusion Model Training")
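The `_load_data_from_file` helper introduced above replaces `json.load(open(x))` inside the `Lambdad` transforms; the bare `open()` never closed its file handle explicitly, which is the unclosed-file warning the commit message mentions. A standalone sketch of the pattern, assuming a hypothetical JSON layout such as `{"spacing": [1.0, 1.0, 1.5]}`:

```python
import json
import torch
import monai

def _load_data_from_file(file_path, key):
    # open in a context manager so the file is closed even if json.load raises
    with open(file_path) as f:
        return torch.FloatTensor(json.load(f)[key])

# transform applied to a dict whose "spacing" entry is a path to such a JSON file
spacing_transform = monai.transforms.Lambdad(
    keys="spacing", func=lambda x: _load_data_from_file(x, "spacing")
)
```

The same unclosed-file fix appears in `infer_controlnet.py` below, where the config files are read through `with open(...)` blocks.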

generation/maisi/scripts/infer_controlnet.py

Lines changed: 6 additions & 3 deletions
@@ -69,9 +69,12 @@ def main():
     logger.info(f"Number of GPUs: {torch.cuda.device_count()}")
     logger.info(f"World_size: {world_size}")

-    env_dict = json.load(open(args.environment_file, "r"))
-    config_dict = json.load(open(args.config_file, "r"))
-    training_config_dict = json.load(open(args.training_config, "r"))
+    with open(args.environment_file, "r") as env_file:
+        env_dict = json.load(env_file)
+    with open(args.config_file, "r") as config_file:
+        config_dict = json.load(config_file)
+    with open(args.training_config, "r") as training_config_file:
+        training_config_dict = json.load(training_config_file)

     for k, v in env_dict.items():
         setattr(args, k, v)

generation/maisi/scripts/sample.py

Lines changed: 2 additions & 2 deletions
@@ -117,7 +117,7 @@ def ldm_conditional_sample_one_mask(
     """
     recon_model = ReconModel(autoencoder=autoencoder, scale_factor=scale_factor).to(device)

-    with torch.no_grad(), torch.cuda.amp.autocast():
+    with torch.no_grad(), torch.amp.autocast("cuda"):
         # Generate random noise
         latents = initialize_noise_latents(latent_shape, device)
         anatomy_size = torch.FloatTensor(anatomy_size).unsqueeze(0).unsqueeze(0).half().to(device)
@@ -226,7 +226,7 @@ def ldm_conditional_sample_one_image(

     recon_model = ReconModel(autoencoder=autoencoder, scale_factor=scale_factor).to(device)

-    with torch.no_grad(), torch.cuda.amp.autocast():
+    with torch.no_grad(), torch.amp.autocast("cuda"):
         logging.info("---- Start generating latent features... ----")
         start_time = time.time()
         # generate segmentation mask
