@@ -154,8 +154,11 @@ def __call__(self, features, return_tensors=None):
         sequence processing capabilities. When pad_to_multiple_of is used, an additional
         mock sequence is appended to reach the desired total length.
         """
+        if return_tensors is not None and return_tensors != "pt":
+            raise NotImplementedError(f"Only return_tensors='pt' is supported, got '{return_tensors}'")
+
         # Perform the masking with the BSHD collator.
-        bshd_batch = self.collator(features)
+        bshd_batch = self.collator(features, return_tensors=return_tensors)
 
         # Create the flattened batch to get the cu_seq_lens_q and cu_seq_lens_k values.
         packed_batch = _pt_flatten_collate(features, return_position_ids=self.return_position_ids)
@@ -247,29 +250,68 @@ def __iter__(self):
         samples = []
         current_length = 0
         for sample in iter(self.dataset):
-            current_length += len(sample["input_ids"])
+            sample_length = len(sample["input_ids"])
+            current_length += sample_length
+
             if current_length == self.max_tokens_per_batch:
                 yield [*samples, sample]
                 samples = []
                 current_length = 0
 
             elif current_length > self.max_tokens_per_batch:
-                if not self.split_samples:
-                    # If we are not splitting samples, we can just yield the current batch (before this sample) and
-                    # start a new one.
-                    yield samples
-                    samples = [sample]
+                tokens_available = self.max_tokens_per_batch - (current_length - sample_length)
+
+                if tokens_available <= 0:
+                    # Current batch is already full (or over); yield it first, then handle this sample.
+                    if samples:
+                        yield samples
+                    samples = []
+                    current_length = sample_length
+                    tokens_available = self.max_tokens_per_batch
+
+                    # Now handle the incoming sample with a fresh batch.
+                    if sample_length == self.max_tokens_per_batch:
+                        yield [sample]
+                        samples = []
+                        current_length = 0
+                        continue
+                    elif sample_length < self.max_tokens_per_batch:
+                        samples = [sample]
+                        continue
+                    # sample_length > max_tokens_per_batch: fall through to split logic below
 
+                if not self.split_samples:
+                    # Yield the current batch (before this sample) and start a new one with this sample.
+                    if samples:
+                        yield samples
+                    # The sample itself may exceed max_tokens_per_batch; yield it as its own batch.
+                    if sample_length > self.max_tokens_per_batch:
+                        yield [sample]
+                        samples = []
+                        current_length = 0
+                    else:
+                        samples = [sample]
+                        current_length = sample_length
                 else:
-                    # Calculate how many tokens are already in the batch
-                    tokens_in_batch = current_length - len(sample["input_ids"])
-                    # Calculate how many tokens we can fit from this sample
-                    tokens_available = self.max_tokens_per_batch - tokens_in_batch
-                    first_part, remaining_part = _split_sample_by_num_tokens(sample, tokens_available)
-                    yield [*samples, first_part]
-                    samples = [remaining_part]
-
-                    current_length = len(samples[0]["input_ids"])
+                    # Split mode: fill the current batch, then split remaining into chunks.
+                    if tokens_available > 0 and tokens_available < sample_length:
+                        first_part, remaining = _split_sample_by_num_tokens(sample, tokens_available)
+                        yield [*samples, first_part]
+                    else:
+                        # tokens_available >= sample_length shouldn't happen here, but guard anyway
+                        if samples:
+                            yield samples
+                        remaining = sample
+
+                    # Now split the remaining part into chunks of max_tokens_per_batch.
+                    while len(remaining["input_ids"]) > self.max_tokens_per_batch:
+                        chunk, remaining = _split_sample_by_num_tokens(remaining, self.max_tokens_per_batch)
+                        yield [chunk]
+
+                    samples = [remaining]
+                    current_length = len(remaining["input_ids"])
+                    continue
+
             else:
                 samples.append(sample)
 
@@ -345,7 +387,8 @@ def __call__(self, features) -> list[dict[str, Any]]:
             else:
                 raise ValueError(f"Unsupported qvk_format: {self.qkv_format}!")
 
-            batch_shard["max_length_k"] = batch_shard["max_length_q"] = max_length * round(max_length / 64)
+            padded_max = ((max_length + 63) // 64) * 64
+            batch_shard["max_length_k"] = batch_shard["max_length_q"] = padded_max
             combined_batch.append(batch_shard)
 
         return combined_batch