Commit b2ddae1

Assert fp32 for rope embeddings, misc test fixes (#1496)
This wouldn't have caught @savitha-eng's `cast_forward_inputs=True` bug (which casts these embeddings right as they enter the TransformerLayer), but it turns out our test suite was actually casting them to bfloat16 with `model.to(bfloat16)` calls 😬. This also fixes a few other miscellaneous test failures I saw locally while making sure the esm2 & llama3 recipe and model tests pass.

Will require #1495 for tests to pass.

---------

Signed-off-by: Peter St. John <pstjohn@nvidia.com>
1 parent 5676160 commit b2ddae1

12 files changed, 41 additions & 32 deletions
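
The underlying failure mode is easy to reproduce outside TransformerEngine. Below is a minimal sketch, assuming a toy module in place of TE's rotary embedding (`ToyRope` is illustrative, not the real API): a blanket `model.to(bfloat16)` downcasts the rotary frequency buffer, which is exactly the condition the new dtype guard flags.

```python
import warnings

import torch


class ToyRope(torch.nn.Module):
    """Illustrative stand-in for a rotary-embedding module (not the TE API)."""

    def __init__(self, dim: int = 64):
        super().__init__()
        inv_freq = 1.0 / (10000.0 ** (torch.arange(0, dim, 2).float() / dim))
        self.register_buffer("inv_freq", inv_freq)  # fp32 frequency table

    def forward(self, max_seq_len: int) -> torch.Tensor:
        t = torch.arange(max_seq_len, dtype=self.inv_freq.dtype)
        return torch.outer(t, self.inv_freq)  # inherits the buffer's dtype


rope = ToyRope()
assert rope(max_seq_len=128).dtype == torch.float32  # fresh module: fp32 as intended

rope.to(torch.bfloat16)  # the cast the test suite was applying via model.to(bfloat16)
emb = rope(max_seq_len=128)
if emb.dtype != torch.float32:  # the guard this commit adds (a warning, not a hard assert)
    warnings.warn("Rotary embeddings should be in float32 for optimal performance.", UserWarning)
```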


bionemo-recipes/models/esm2/modeling_esm_te.py

Lines changed: 4 additions & 1 deletion
@@ -22,6 +22,7 @@
 Adapted from `modeling_esm.py` in huggingface/transformers.
 """

+import warnings
 from typing import ClassVar, Literal, Optional, Unpack

 # TODO: put import guard around transformer_engine here, with an informative error message around
@@ -197,6 +198,8 @@ def forward(
         with torch.autocast(device_type="cuda", enabled=False):
             te_rope_emb = self.rotary_embeddings(max_seq_len=self.config.max_position_embeddings)
             te_rope_emb = te_rope_emb.to(hidden_states.device, non_blocking=True)
+        if te_rope_emb.dtype != torch.float32:
+            warnings.warn("Rotary embeddings should be in float32 for optimal performance.", UserWarning)

         for layer_module in self.layers:
             if kwargs.get("output_hidden_states", False):
@@ -374,7 +377,7 @@ def forward(
         )
         encoder_outputs = self.encoder(
             embedding_output,
-            attention_mask=extended_attention_mask,
+            attention_mask=None if self.config.attn_input_format == "thd" else extended_attention_mask,
             **kwargs,
         )
         sequence_output = encoder_outputs[0]
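
The `attention_mask` change above reflects how the packed THD layout works: sequence boundaries are carried as cumulative sequence lengths rather than as a dense padding mask, so forwarding the extended mask when `attn_input_format == "thd"` is unnecessary. A hedged sketch of that bookkeeping (names are illustrative, not the recipe's API):

```python
import torch

# Three packed sequences of 5, 3, and 7 tokens laid out as one THD "ribbon".
seq_lens = torch.tensor([5, 3, 7])
cu_seqlens = torch.nn.functional.pad(seq_lens.cumsum(0), (1, 0)).to(torch.int32)
print(cu_seqlens)  # tensor([ 0,  5,  8, 15], dtype=torch.int32)
# A kernel that consumes cu_seqlens needs no padding mask, hence attention_mask=None.
```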

bionemo-recipes/models/esm2/tests/common/test_modeling_common.py

Lines changed: 4 additions & 8 deletions
@@ -452,7 +452,6 @@ def test_smoke_forward_pass(self, input_format):
         config = self.create_test_config(attn_input_format=input_format)

         model = model_class(config)
-        model.to(torch.bfloat16)
         model.to("cuda")

         # Prepare input data
@@ -475,7 +474,6 @@ def test_smoke_backward_pass(self, input_format):
         config = self.create_test_config(attn_input_format=input_format)

         model = model_class(config)
-        model.to(torch.bfloat16)
         model.to("cuda")

         # Prepare input data
@@ -498,7 +496,6 @@ def test_smoke_model_with_loss(self, input_format):
         config = self.create_test_config(attn_input_format=input_format)

         model = model_class(config)
-        model.to(torch.bfloat16)
         model.to("cuda")

         # Prepare input data with labels
@@ -522,7 +519,6 @@ def test_forward_and_backward(self, input_format):
         config = self.create_test_config(attn_input_format=input_format)

         model = model_class(config)
-        model.to(torch.bfloat16)
         model.to("cuda")

         # Prepare input data
@@ -1011,7 +1007,7 @@ def test_generate_without_cache(self):
             pytest.skip("Not an autoregressive model")

         config = self.create_test_config(attn_input_format="bshd", self_attn_mask_type="causal")
-        model = self.get_model_class()(config).to("cuda").to(torch.bfloat16)
+        model = self.get_model_class()(config).to("cuda")
         model.eval()

         tokenizer = self.get_tokenizer()
@@ -1030,7 +1026,7 @@ def test_generate_with_cache(self):
             pytest.skip("Not an autoregressive model")

         config = self.create_test_config(attn_input_format="thd", self_attn_mask_type="padding_causal")
-        model = self.get_model_class()(config).to("cuda").to(torch.bfloat16)
+        model = self.get_model_class()(config).to("cuda")
         model.eval()

         tokenizer = self.get_tokenizer()
@@ -1051,7 +1047,7 @@ def test_generate_with_cache_batched(self):
             pytest.skip("Not an autoregressive model")

         config = self.create_test_config(attn_input_format="thd", self_attn_mask_type="padding_causal")
-        model = self.get_model_class()(config).to("cuda").to(torch.bfloat16)
+        model = self.get_model_class()(config).to("cuda")
         model.eval()

         tokenizer = self.get_tokenizer()
@@ -1076,7 +1072,7 @@ def test_generate_with_cache_beam_search(self):
             pytest.skip("Not an autoregressive model")

         config = self.create_test_config(attn_input_format="thd", self_attn_mask_type="padding_causal")
-        model = self.get_model_class()(config).to("cuda").to(torch.bfloat16)
+        model = self.get_model_class()(config).to("cuda")
         model.eval()

         tokenizer = self.get_tokenizer()
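
With the blanket `model.to(torch.bfloat16)` calls removed, these tests now exercise the models with fp32 parameters and buffers. If a test does want bf16 compute, a safer pattern is autocast, which converts activations per-op but leaves parameters and buffers (such as rotary frequency tables) in fp32. A minimal sketch, using `torch.nn.Linear` as a stand-in for the real model:

```python
import torch

model = torch.nn.Linear(8, 8).to("cuda")  # stand-in for model_class(config)
x = torch.randn(2, 8, device="cuda")

with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
    y = model(x)

print(y.dtype)             # torch.bfloat16 -- compute ran in bf16
print(model.weight.dtype)  # torch.float32  -- weights stayed fp32
```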

bionemo-recipes/models/llama3/modeling_llama_te.py

Lines changed: 3 additions & 0 deletions
@@ -15,6 +15,7 @@

 """TransformerEngine-optimized Llama model."""

+import warnings
 from collections import OrderedDict
 from typing import ClassVar, Unpack

@@ -236,6 +237,8 @@ def forward(
         # Ensure that rotary embeddings are computed at a higher precision
         with torch.autocast(device_type="cuda", enabled=False):
             te_rope_emb = self.rotary_emb(max_seq_len=self.config.max_position_embeddings)
+        if te_rope_emb.dtype != torch.float32:
+            warnings.warn("Rotary embeddings should be in float32 for optimal performance.", UserWarning)

         for decoder_layer in self.layers[: self.config.num_hidden_layers]:
             if output_hidden_states:

bionemo-recipes/models/llama3/tests/common/test_modeling_common.py

Lines changed: 4 additions & 8 deletions
@@ -452,7 +452,6 @@ def test_smoke_forward_pass(self, input_format):
         config = self.create_test_config(attn_input_format=input_format)

         model = model_class(config)
-        model.to(torch.bfloat16)
         model.to("cuda")

         # Prepare input data
@@ -475,7 +474,6 @@ def test_smoke_backward_pass(self, input_format):
         config = self.create_test_config(attn_input_format=input_format)

         model = model_class(config)
-        model.to(torch.bfloat16)
         model.to("cuda")

         # Prepare input data
@@ -498,7 +496,6 @@ def test_smoke_model_with_loss(self, input_format):
         config = self.create_test_config(attn_input_format=input_format)

         model = model_class(config)
-        model.to(torch.bfloat16)
         model.to("cuda")

         # Prepare input data with labels
@@ -522,7 +519,6 @@ def test_forward_and_backward(self, input_format):
         config = self.create_test_config(attn_input_format=input_format)

         model = model_class(config)
-        model.to(torch.bfloat16)
         model.to("cuda")

         # Prepare input data
@@ -1011,7 +1007,7 @@ def test_generate_without_cache(self):
             pytest.skip("Not an autoregressive model")

         config = self.create_test_config(attn_input_format="bshd", self_attn_mask_type="causal")
-        model = self.get_model_class()(config).to("cuda").to(torch.bfloat16)
+        model = self.get_model_class()(config).to("cuda")
         model.eval()

         tokenizer = self.get_tokenizer()
@@ -1030,7 +1026,7 @@ def test_generate_with_cache(self):
             pytest.skip("Not an autoregressive model")

         config = self.create_test_config(attn_input_format="thd", self_attn_mask_type="padding_causal")
-        model = self.get_model_class()(config).to("cuda").to(torch.bfloat16)
+        model = self.get_model_class()(config).to("cuda")
         model.eval()

         tokenizer = self.get_tokenizer()
@@ -1051,7 +1047,7 @@ def test_generate_with_cache_batched(self):
             pytest.skip("Not an autoregressive model")

         config = self.create_test_config(attn_input_format="thd", self_attn_mask_type="padding_causal")
-        model = self.get_model_class()(config).to("cuda").to(torch.bfloat16)
+        model = self.get_model_class()(config).to("cuda")
         model.eval()

         tokenizer = self.get_tokenizer()
@@ -1076,7 +1072,7 @@ def test_generate_with_cache_beam_search(self):
             pytest.skip("Not an autoregressive model")

         config = self.create_test_config(attn_input_format="thd", self_attn_mask_type="padding_causal")
-        model = self.get_model_class()(config).to("cuda").to(torch.bfloat16)
+        model = self.get_model_class()(config).to("cuda")
         model.eval()

         tokenizer = self.get_tokenizer()

bionemo-recipes/models/mixtral/tests/common/test_modeling_common.py

Lines changed: 4 additions & 8 deletions
@@ -452,7 +452,6 @@ def test_smoke_forward_pass(self, input_format):
         config = self.create_test_config(attn_input_format=input_format)

         model = model_class(config)
-        model.to(torch.bfloat16)
         model.to("cuda")

         # Prepare input data
@@ -475,7 +474,6 @@ def test_smoke_backward_pass(self, input_format):
         config = self.create_test_config(attn_input_format=input_format)

         model = model_class(config)
-        model.to(torch.bfloat16)
         model.to("cuda")

         # Prepare input data
@@ -498,7 +496,6 @@ def test_smoke_model_with_loss(self, input_format):
         config = self.create_test_config(attn_input_format=input_format)

         model = model_class(config)
-        model.to(torch.bfloat16)
         model.to("cuda")

         # Prepare input data with labels
@@ -522,7 +519,6 @@ def test_forward_and_backward(self, input_format):
         config = self.create_test_config(attn_input_format=input_format)

         model = model_class(config)
-        model.to(torch.bfloat16)
         model.to("cuda")

         # Prepare input data
@@ -1011,7 +1007,7 @@ def test_generate_without_cache(self):
             pytest.skip("Not an autoregressive model")

         config = self.create_test_config(attn_input_format="bshd", self_attn_mask_type="causal")
-        model = self.get_model_class()(config).to("cuda").to(torch.bfloat16)
+        model = self.get_model_class()(config).to("cuda")
         model.eval()

         tokenizer = self.get_tokenizer()
@@ -1030,7 +1026,7 @@ def test_generate_with_cache(self):
             pytest.skip("Not an autoregressive model")

         config = self.create_test_config(attn_input_format="thd", self_attn_mask_type="padding_causal")
-        model = self.get_model_class()(config).to("cuda").to(torch.bfloat16)
+        model = self.get_model_class()(config).to("cuda")
         model.eval()

         tokenizer = self.get_tokenizer()
@@ -1051,7 +1047,7 @@ def test_generate_with_cache_batched(self):
             pytest.skip("Not an autoregressive model")

         config = self.create_test_config(attn_input_format="thd", self_attn_mask_type="padding_causal")
-        model = self.get_model_class()(config).to("cuda").to(torch.bfloat16)
+        model = self.get_model_class()(config).to("cuda")
         model.eval()

         tokenizer = self.get_tokenizer()
@@ -1076,7 +1072,7 @@ def test_generate_with_cache_beam_search(self):
             pytest.skip("Not an autoregressive model")

         config = self.create_test_config(attn_input_format="thd", self_attn_mask_type="padding_causal")
-        model = self.get_model_class()(config).to("cuda").to(torch.bfloat16)
+        model = self.get_model_class()(config).to("cuda")
         model.eval()

         tokenizer = self.get_tokenizer()

bionemo-recipes/recipes/esm2_accelerate_te/example_8m_checkpoint/esm_nv.py

Lines changed: 4 additions & 1 deletion
@@ -22,6 +22,7 @@
 Adapted from `modeling_esm.py` in huggingface/transformers.
 """

+import warnings
 from typing import ClassVar, Literal, Optional, Unpack

 # TODO: put import guard around transformer_engine here, with an informative error message around
@@ -197,6 +198,8 @@ def forward(
         with torch.autocast(device_type="cuda", enabled=False):
             te_rope_emb = self.rotary_embeddings(max_seq_len=self.config.max_position_embeddings)
             te_rope_emb = te_rope_emb.to(hidden_states.device, non_blocking=True)
+        if te_rope_emb.dtype != torch.float32:
+            warnings.warn("Rotary embeddings should be in float32 for optimal performance.", UserWarning)

         for layer_module in self.layers:
             if kwargs.get("output_hidden_states", False):
@@ -374,7 +377,7 @@ def forward(
         )
         encoder_outputs = self.encoder(
             embedding_output,
-            attention_mask=extended_attention_mask,
+            attention_mask=None if self.config.attn_input_format == "thd" else extended_attention_mask,
             **kwargs,
         )
         sequence_output = encoder_outputs[0]

bionemo-recipes/recipes/esm2_native_te/example_8m_checkpoint/esm_nv.py

Lines changed: 4 additions & 1 deletion
@@ -22,6 +22,7 @@
 Adapted from `modeling_esm.py` in huggingface/transformers.
 """

+import warnings
 from typing import ClassVar, Literal, Optional, Unpack

 # TODO: put import guard around transformer_engine here, with an informative error message around
@@ -197,6 +198,8 @@ def forward(
         with torch.autocast(device_type="cuda", enabled=False):
             te_rope_emb = self.rotary_embeddings(max_seq_len=self.config.max_position_embeddings)
             te_rope_emb = te_rope_emb.to(hidden_states.device, non_blocking=True)
+        if te_rope_emb.dtype != torch.float32:
+            warnings.warn("Rotary embeddings should be in float32 for optimal performance.", UserWarning)

         for layer_module in self.layers:
             if kwargs.get("output_hidden_states", False):
@@ -374,7 +377,7 @@ def forward(
         )
         encoder_outputs = self.encoder(
             embedding_output,
-            attention_mask=extended_attention_mask,
+            attention_mask=None if self.config.attn_input_format == "thd" else extended_attention_mask,
             **kwargs,
         )
         sequence_output = encoder_outputs[0]

bionemo-recipes/recipes/esm2_native_te/tests/test_dataset.py

Lines changed: 3 additions & 0 deletions
@@ -964,3 +964,6 @@ def test_cp_dataloader(recipe_path):
         f"Expected at most {expected_tokens_per_rank + 100} tokens, got {actual_shape}"
     )
     assert batch["labels"].shape[1] == actual_shape
+
+    dataloader.close()
+    torch.distributed.destroy_process_group()
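
As a general pattern for distributed dataloader tests, the cleanup is safer in a `finally` block so a failing assertion cannot leak an NCCL process group into later tests. A hedged sketch (`dataloader.close()` comes from this recipe; the rest is standard `torch.distributed`):

```python
import torch


def exercise_cp_dataloader(dataloader):
    try:
        batch = next(iter(dataloader))
        assert batch["labels"].shape[1] == batch["input_ids"].shape[1]
    finally:
        dataloader.close()  # release the recipe's background workers
        if torch.distributed.is_initialized():
            torch.distributed.destroy_process_group()  # avoid NCCL state leaking across tests
```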

bionemo-recipes/recipes/esm2_peft_te/example_8m_checkpoint/esm_nv.py

Lines changed: 4 additions & 1 deletion
@@ -22,6 +22,7 @@
 Adapted from `modeling_esm.py` in huggingface/transformers.
 """

+import warnings
 from typing import ClassVar, Literal, Optional, Unpack

 # TODO: put import guard around transformer_engine here, with an informative error message around
@@ -197,6 +198,8 @@ def forward(
         with torch.autocast(device_type="cuda", enabled=False):
             te_rope_emb = self.rotary_embeddings(max_seq_len=self.config.max_position_embeddings)
             te_rope_emb = te_rope_emb.to(hidden_states.device, non_blocking=True)
+        if te_rope_emb.dtype != torch.float32:
+            warnings.warn("Rotary embeddings should be in float32 for optimal performance.", UserWarning)

         for layer_module in self.layers:
             if kwargs.get("output_hidden_states", False):
@@ -374,7 +377,7 @@ def forward(
         )
         encoder_outputs = self.encoder(
             embedding_output,
-            attention_mask=extended_attention_mask,
+            attention_mask=None if self.config.attn_input_format == "thd" else extended_attention_mask,
             **kwargs,
         )
         sequence_output = encoder_outputs[0]

bionemo-recipes/recipes/esm2_peft_te/tests/test_infer.py

Lines changed: 2 additions & 2 deletions
@@ -33,7 +33,7 @@ def peft_model(recipe_path):
     config.id2label = SS3_ID2LABEL
     config.label2id = SS3_LABEL2ID

-    base_model = AutoModelForTokenClassification.from_config(config, trust_remote_code=True)
+    base_model = AutoModelForTokenClassification.from_config(config, trust_remote_code=True, dtype=torch.bfloat16)

     lora_config = peft.LoraConfig(
         task_type=peft.TaskType.TOKEN_CLS,
@@ -45,7 +45,7 @@ def peft_model(recipe_path):
     )

     model = peft.get_peft_model(base_model, lora_config)
-    model.to(device="cuda", dtype=torch.bfloat16)
+    model.to(device="cuda")
     model.eval()
     return model

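The net effect of this fixture change: the base model is built in bf16 via `from_config(..., dtype=...)`, and `.to()` is reserved for device placement only, instead of casting the assembled PEFT model after the fact. A hedged sketch of the same pattern (the checkpoint path is illustrative):

```python
import torch
from transformers import AutoConfig, AutoModelForTokenClassification

config = AutoConfig.from_pretrained("./example_8m_checkpoint", trust_remote_code=True)  # illustrative path
base_model = AutoModelForTokenClassification.from_config(config, trust_remote_code=True, dtype=torch.bfloat16)
base_model.to(device="cuda")  # move only; dtype was already fixed at construction
```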