@@ -730,6 +730,144 @@ def _pt_flatten_collate(features: list[dict[str, list[int]]], return_position_id
     return batch
 
 
+def _find_seq_dim(tensor: torch.Tensor, seq_len: int) -> int:
+    """Find which dimension of tensor matches the expected sequence length.
+
+    Args:
+        tensor: The tensor to inspect.
+        seq_len: The expected sequence length to match against tensor dimensions.
+
+    Returns:
+        The dimension index that matches the sequence length.
+
+    Raises:
+        ValueError: If no dimension matches the expected sequence length.
+    """
+    if tensor.ndim == 1:
+        if tensor.shape[0] == seq_len:
+            return 0
+        raise ValueError(f"1D tensor shape {tensor.shape} doesn't match sequence length {seq_len}")
+    elif tensor.ndim >= 2:
+        if tensor.shape[1] == seq_len:
+            return 1
+        elif tensor.shape[0] == seq_len:
+            return 0
+        raise ValueError(f"Tensor shape {tensor.shape} doesn't match sequence length {seq_len} in dim 0 or 1")
+    raise ValueError(f"Unexpected tensor ndim={tensor.ndim}")
+
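+# Illustrative sketch (assuming a 16-token THD batch):
+#   _find_seq_dim(torch.zeros(16), seq_len=16)     # 1D, e.g. position_ids -> 0
+#   _find_seq_dim(torch.zeros(1, 16), seq_len=16)  # batched input_ids     -> 1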
+
+def _process_tensor_thd(
+    val: torch.Tensor | None,
+    seq_len: int,
+    slice_sizes: torch.Tensor,
+    cu_seqlens_padded: torch.Tensor,
+    cp_rank: int,
+    total_slices: int,
+) -> torch.Tensor | None:
+    """Extract the THD context-parallel shard for a single tensor.
+
+    For each sequence in the batch, selects two slices (one from the beginning and one
+    from the end) corresponding to the given CP rank, following the zigzag CP sharding
+    pattern.
+
+    Args:
+        val: The tensor to shard, or None (returned as-is).
+        seq_len: Total sequence length (from cu_seqlens_padded[-1]).
+        slice_sizes: Per-sequence slice sizes, computed as sequence_lengths // total_slices.
+        cu_seqlens_padded: Cumulative sequence lengths including padding.
+        cp_rank: The context parallelism rank index.
+        total_slices: Total number of slices per sequence (2 * cp_world_size).
+
+    Returns:
+        The sharded tensor for the given CP rank, or None if val is None.
+    """
+    if val is None:
+        return val
+
+    seq_dim = _find_seq_dim(val, seq_len)
+
+    cp_rank_slices = []
+    for slice_size, seq_start in zip(slice_sizes, cu_seqlens_padded[:-1]):
+        # 1st segment
+        cp_rank_slices.append(
+            torch.arange(
+                seq_start + (cp_rank * slice_size),
+                seq_start + ((cp_rank + 1) * slice_size),
+                device=val.device,
+            )
+        )
+
+        # 2nd segment
+        cp_rank_slices.append(
+            torch.arange(
+                seq_start + ((total_slices - cp_rank - 1) * slice_size),
+                seq_start + ((total_slices - cp_rank) * slice_size),
+                device=val.device,
+            )
+        )
+
+    return val.index_select(seq_dim, torch.cat(cp_rank_slices))
+
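+# Worked example of the zigzag pattern (a minimal sketch, assuming cp_world_size=2,
+# so total_slices=4): one padded sequence of 8 tokens gives slice_size = 8 // 4 = 2,
+#   rank 0 keeps slices 0 and 3 -> token indices [0, 1, 6, 7]
+#   rank 1 keeps slices 1 and 2 -> token indices [2, 3, 4, 5]
+# so _process_tensor_thd(torch.arange(8), 8, torch.tensor([2]),
+#                        torch.tensor([0, 8]), cp_rank=0, total_slices=4)
+# returns tensor([0, 1, 6, 7]).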
+
+def _process_tensor_bshd(
+    val: torch.Tensor | None,
+    cp_rank: int,
+    cp_world_size: int,
+) -> torch.Tensor | None:
+    """Extract the BSHD context-parallel shard for a single tensor.
+
+    Splits a BSHD-format tensor along the sequence dimension (dim=1) into
+    2 * cp_world_size chunks, then selects the two chunks corresponding to the given
+    CP rank (zigzag pattern).
+
+    Args:
+        val: The tensor to shard, or None (returned as-is).
+        cp_rank: The context parallelism rank index.
+        cp_world_size: Total number of context parallelism ranks.
+
+    Returns:
+        The sharded tensor for the given CP rank, or None if val is None.
+
+    Raises:
+        ValueError: If the tensor has fewer than 2 dimensions or its sequence length
+            is not divisible by 2 * cp_world_size.
+    """
+    if val is None:
+        return val
+
+    if val.ndim < 2:
+        raise ValueError(f"BSHD format requires at least 2D tensors, got {val.ndim}D")
+
+    seq_len = val.shape[1]
+
+    # Calculate chunk size
+    total_chunks = 2 * cp_world_size
+    chunk_size = seq_len // total_chunks
+
+    if seq_len % total_chunks != 0:
+        raise ValueError(
+            f"Sequence length {seq_len} must be divisible by {total_chunks} "
+            f"(2 * cp_world_size) for BSHD context parallelism"
+        )
+
+    # Determine which chunks this rank should get
+    # Rank 0 gets chunks [0, total_chunks-1]
+    # Rank 1 gets chunks [1, total_chunks-2]
+    # Rank k gets chunks [k, total_chunks-k-1]
+    chunk_indices = [cp_rank, total_chunks - cp_rank - 1]
+
+    # Collect slices for this rank
+    rank_slices = []
+    for chunk_idx in chunk_indices:
+        start_idx = chunk_idx * chunk_size
+        end_idx = start_idx + chunk_size
+        rank_slices.append(torch.arange(start_idx, end_idx, device=val.device))
+
+    # Concatenate indices for all chunks this rank should get
+    indices = torch.cat(rank_slices)
+
+    # Select along sequence dimension (dim=1)
+    return val.index_select(1, indices)
+
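+# Worked example (a minimal sketch, assuming cp_world_size=2): for val of shape
+# (batch, 8, ...), total_chunks=4 and chunk_size=2, so
+#   rank 0 keeps sequence positions [0, 1, 6, 7]  (chunks 0 and 3)
+#   rank 1 keeps sequence positions [2, 3, 4, 5]  (chunks 1 and 2)
+# e.g. _process_tensor_bshd(torch.arange(8).view(1, 8), cp_rank=1, cp_world_size=2)
+# returns tensor([[2, 3, 4, 5]]).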
+
 def _pt_pad_to_multiple_of(batch: dict[str, Any], pad_to_multiple_of: int, token_pad: int, label_pad: int):
     """Pad a batch to a multiple of pad_to_multiple_of.
 
@@ -837,110 +975,20 @@ def _split_batch_by_cp_rank(
         total_slices_of_any_sequence = 2 * cp_world_size
         slice_sizes = (cu_seqlens_padded[1:] - cu_seqlens_padded[:-1]) // total_slices_of_any_sequence
 
-        # Process each tensor directly instead of using keys_to_change loop
-        def process_tensor(val):
-            if val is None:
-                return val
-            # Determine which dimension is the sequence dimension
-            # Ensure cu_seqlens_padded[-1] is a Python int, not a 0-dim tensor
-            if isinstance(cu_seqlens_padded[-1], torch.Tensor):
-                seq_len_val = cu_seqlens_padded[-1].item()
-            else:
-                seq_len_val = cu_seqlens_padded[-1]
-
-            # Handle 1D tensors (like position_ids that don't have batch dimension)
-            if val.ndim == 1:
-                if val.shape[0] == seq_len_val:
-                    current_seq_dim = 0
-                else:
-                    raise ValueError(
-                        "1D tensor shape doesn't match expected sequence length. Make sure the"
-                        " inputs are in THD format and padded correctly."
-                    )
-            elif val.ndim >= 2:
-                if val.shape[1] == seq_len_val:
-                    current_seq_dim = 1
-                elif val.shape[0] == seq_len_val:
-                    current_seq_dim = 0
-                else:
-                    raise ValueError("Make sure the inputs are in THD format and padded correctly.")
-            else:
-                raise ValueError("Tensor must be at least 1D")
-
-            # On this particular rank, for each sequence, get two slices, one from the beginning
-            # and one from the end.
-            cp_rank_slices = []
-            for slice_size, seq_start in zip(slice_sizes, cu_seqlens_padded[:-1]):
-                # 1st segment
-                cp_rank_slices.append(
-                    torch.arange(
-                        seq_start + (cp_rank * slice_size),
-                        seq_start + ((cp_rank + 1) * slice_size),
-                        device=val.device,
-                    )
-                )
+        # Ensure cu_seqlens_padded[-1] is a Python int, not a 0-dim tensor
+        last_elem = cu_seqlens_padded[-1]
+        seq_len_val = last_elem.item() if isinstance(last_elem, torch.Tensor) else last_elem
 
-                # 2nd segment
-                cp_rank_slices.append(
-                    torch.arange(
-                        seq_start + ((total_slices_of_any_sequence - cp_rank - 1) * slice_size),
-                        seq_start + ((total_slices_of_any_sequence - cp_rank) * slice_size),
-                        device=val.device,
-                    )
-                )
-
-            return val.index_select(current_seq_dim, torch.cat(cp_rank_slices))
-
-        # Process each tensor directly
-        input_ids_padded = process_tensor(input_ids_padded)
-        labels_padded = process_tensor(labels_padded)
+        input_ids_padded = _process_tensor_thd(
+            input_ids_padded, seq_len_val, slice_sizes, cu_seqlens_padded, cp_rank, total_slices_of_any_sequence
+        )
+        labels_padded = _process_tensor_thd(
+            labels_padded, seq_len_val, slice_sizes, cu_seqlens_padded, cp_rank, total_slices_of_any_sequence
+        )
 
     elif qvk_format == "bshd":
-        # BSHD format: [batch, seq_len, ...]
-        # Split along sequence dimension (dim=1)
-        # Each sequence is split into 2*cp_world_size chunks
-        # Each rank gets chunks at positions: [cp_rank, 2*cp_world_size - cp_rank - 1]
-
-        def process_tensor_bshd(val):
-            if val is None:
-                return val
-
-            if val.ndim < 2:
-                raise ValueError(f"BSHD format requires at least 2D tensors, got {val.ndim}D")
-
-            seq_len = val.shape[1]
-
-            # Calculate chunk size
-            total_chunks = 2 * cp_world_size
-            chunk_size = seq_len // total_chunks
-
-            if chunk_size == 0:
-                raise ValueError(
-                    f"Sequence length {seq_len} must be divisible by {total_chunks} "
-                    f"(2 * cp_world_size) for BSHD context parallelism"
-                )
-
-            # Determine which chunks this rank should get
-            # Rank 0 gets chunks [0, total_chunks-1]
-            # Rank 1 gets chunks [1, total_chunks-2]
-            # Rank k gets chunks [k, total_chunks-k-1]
-            chunk_indices = [cp_rank, total_chunks - cp_rank - 1]
-
-            # Collect slices for this rank
-            rank_slices = []
-            for chunk_idx in chunk_indices:
-                start_idx = chunk_idx * chunk_size
-                end_idx = start_idx + chunk_size
-                rank_slices.append(torch.arange(start_idx, end_idx, device=val.device))
-
-            # Concatenate indices for all chunks this rank should get
-            indices = torch.cat(rank_slices)
-
-            # Select along sequence dimension (dim=1)
-            return val.index_select(1, indices)
-
-        input_ids_padded = process_tensor_bshd(input_ids_padded)
-        labels_padded = process_tensor_bshd(labels_padded)
+        input_ids_padded = _process_tensor_bshd(input_ids_padded, cp_rank, cp_world_size)
+        labels_padded = _process_tensor_bshd(labels_padded, cp_rank, cp_world_size)
 
     else:
         raise ValueError(f"Support not implemented yet for qvk_format: {qvk_format}!")