136 changes: 130 additions & 6 deletions bionemo-recipes/models/esm2/tests/common/test_modeling_common.py
@@ -20,7 +20,7 @@
from abc import ABC, abstractmethod
from dataclasses import dataclass
from pathlib import Path
from typing import Callable, Dict, List, Literal, Type
from typing import Any, Callable, Dict, List, Literal, Type

import pytest
import torch
@@ -82,6 +82,10 @@ class BaseModelTest(ABC):
Subclasses must implement all abstract methods to provide model-specific
configuration, data preparation, and conversion functions.

Set ``is_autoregressive = True`` in subclasses for causal LM models to
enable generation / KV-cache smoke tests. Non-autoregressive models
(e.g. ESM2) leave the default ``False`` and those tests are skipped.

Example:
```python
class ESM2ModelTester(BioNeMoModelTester):
@@ -98,6 +102,8 @@ def get_upstream_model_id(self):
```
"""

is_autoregressive: bool = False

@abstractmethod
def get_model_class(self) -> Type[PreTrainedModel]:
"""Return the TransformerEngine model class to test.
@@ -885,13 +891,15 @@ def test_fp8_forward_and_backward_pass(self, fp8_recipe, input_format):
msg=lambda x: f"FP8 loss differs too much from BF16 loss: {x}",
)

def test_quantized_model_init_forward_and_backward(self, fp8_recipe, input_format):
def test_quantized_model_init_forward_and_backward(self, fp8_recipe, input_format, **config_kwargs):
"""Test that model initialized with FP8 works correctly."""
if input_format == "thd" and not HAS_DATA_CENTER_GPU:
pytest.xfail("Padded sequences are not supported on non-datacenter hardware for THD.")

model_class = self.get_model_class()
config = self.create_test_config(attn_input_format=input_format, self_attn_mask_type="padding_causal")
config = self.create_test_config(
attn_input_format=input_format, self_attn_mask_type="padding_causal", **config_kwargs
)

# Initialize with FP8
with transformer_engine.pytorch.quantized_model_init(recipe=fp8_recipe):
@@ -906,9 +914,8 @@ def test_quantized_model_init_forward_and_backward(self, fp8_recipe, input_forma
input_data["labels"] = input_data["input_ids"].clone()

# Forward and backward pass with FP8
with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
with transformer_engine.pytorch.autocast(recipe=fp8_recipe):
outputs = model(**input_data)
with transformer_engine.pytorch.autocast(recipe=fp8_recipe):
outputs = model(**input_data)

loss = outputs.loss
assert torch.isfinite(loss)
@@ -979,4 +986,121 @@ def test_meta_fp8_init(self, fp8_recipe):
model.init_empty_weights()
self.verify_model_parameters_initialized_correctly(model, should_be_fp8=True)

# ==================== Generation Tests (Autoregressive Models Only) ====================
@abstractmethod
def create_inference_params(self, config, batch_size=1, max_seq_len=256, num_beams=1) -> Any:
"""Create inference params for KV-cache generation tests.

Autoregressive model tests must override this method to provide
model-specific ``HFInferenceParams`` with allocated KV-cache memory.

Args:
config: Model configuration.
batch_size: Batch size.
max_seq_len: Maximum sequence length.
num_beams: Number of beams for beam search.

Returns:
HFInferenceParams instance with allocated memory.
"""
pass

def test_generate_without_cache(self):
"""Test basic generation without KV-cache (BSHD, use_cache=False)."""
if not self.is_autoregressive:
pytest.skip("Not an autoregressive model")

config = self.create_test_config(attn_input_format="bshd", self_attn_mask_type="causal")
model = self.get_model_class()(config).to("cuda").to(torch.bfloat16)
model.eval()

tokenizer = self.get_tokenizer()
prompt = "The quick brown fox jumps over"
inputs = tokenizer(prompt, return_tensors="pt")
inputs = {k: v.to("cuda") for k, v in inputs.items()}

with torch.no_grad():
output_ids = model.generate(**inputs, max_new_tokens=16, use_cache=False)

assert output_ids.shape[1] > inputs["input_ids"].shape[1]

def test_generate_with_cache(self):
"""Test single-prompt generation with KV-cache (THD format)."""
if not self.is_autoregressive:
pytest.skip("Not an autoregressive model")

config = self.create_test_config(attn_input_format="thd", self_attn_mask_type="padding_causal")
model = self.get_model_class()(config).to("cuda").to(torch.bfloat16)
model.eval()

tokenizer = self.get_tokenizer()
prompt = "The quick brown fox jumps over"
inputs = tokenizer(prompt, return_tensors="pt")
inputs = {k: v.to("cuda") for k, v in inputs.items()}

past_key_values = self.create_inference_params(config, batch_size=1)

with torch.no_grad():
output_ids = model.generate(**inputs, max_new_tokens=16, use_cache=True, past_key_values=past_key_values)

assert output_ids.shape[1] > inputs["input_ids"].shape[1]

def test_generate_with_cache_batched(self):
"""Test batched generation with KV-cache (left-padded BSHD converted to THD)."""
if not self.is_autoregressive:
pytest.skip("Not an autoregressive model")

config = self.create_test_config(attn_input_format="thd", self_attn_mask_type="padding_causal")
model = self.get_model_class()(config).to("cuda").to(torch.bfloat16)
model.eval()

tokenizer = self.get_tokenizer()
prompts = (
"The quick brown fox jumps over the lazy dog.",
"Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
)
inputs = tokenizer(prompts, return_tensors="pt", padding=True, padding_side="left")
inputs = {k: v.to("cuda") for k, v in inputs.items()}

past_key_values = self.create_inference_params(config, batch_size=2)

with torch.no_grad():
output_ids = model.generate(**inputs, max_new_tokens=16, use_cache=True, past_key_values=past_key_values)

assert output_ids.shape[0] == 2
assert output_ids.shape[1] > inputs["input_ids"].shape[1]

def test_generate_with_cache_beam_search(self):
"""Test batched generation with KV-cache and beam search."""
if not self.is_autoregressive:
pytest.skip("Not an autoregressive model")

config = self.create_test_config(attn_input_format="thd", self_attn_mask_type="padding_causal")
model = self.get_model_class()(config).to("cuda").to(torch.bfloat16)
model.eval()

tokenizer = self.get_tokenizer()
prompts = (
"The quick brown fox jumps over the lazy dog.",
"Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
)
inputs = tokenizer(prompts, return_tensors="pt", padding=True, padding_side="left")
inputs = {k: v.to("cuda") for k, v in inputs.items()}

num_beams = 2
past_key_values = self.create_inference_params(config, batch_size=2, num_beams=num_beams)

with torch.no_grad():
output_ids = model.generate(
**inputs,
max_new_tokens=16,
use_cache=True,
past_key_values=past_key_values,
num_beams=num_beams,
do_sample=True,
)

assert output_ids.shape[0] == 2
assert output_ids.shape[1] > inputs["input_ids"].shape[1]

# TODO: add multi-GPU tests, e.g., meta-device init after fully_shard, cp tests, etc.
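For reference, a minimal sketch of how a causal LM tester could opt into the new generation / KV-cache smoke tests. The subclass name, import paths, and the ``HFInferenceParams`` constructor arguments below are assumptions for illustration only and are not taken from this diff.

```python
# Hypothetical sketch -- class name, import paths, and constructor arguments are assumptions.
from typing import Any

from tests.common.test_modeling_common import BaseModelTest  # assumed import path


class CausalLMModelTester(BaseModelTest):
    """Opts into the generation / KV-cache smoke tests defined in BaseModelTest."""

    is_autoregressive = True

    # Other abstract methods (get_model_class, create_test_config, ...) omitted for brevity.

    def create_inference_params(self, config, batch_size=1, max_seq_len=256, num_beams=1) -> Any:
        # Beam search expands the effective batch, so the KV-cache must hold
        # batch_size * num_beams sequences of up to max_seq_len tokens each.
        # A real implementation would typically size the cache from `config`
        # (number of KV heads, head dim, dtype).
        from my_model.modeling import HFInferenceParams  # assumed helper location

        return HFInferenceParams(
            max_batch_size=batch_size * num_beams,
            max_sequence_length=max_seq_len,
        )
```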
136 changes: 130 additions & 6 deletions bionemo-recipes/models/llama3/tests/common/test_modeling_common.py
@@ -20,7 +20,7 @@
from abc import ABC, abstractmethod
from dataclasses import dataclass
from pathlib import Path
from typing import Callable, Dict, List, Literal, Type
from typing import Any, Callable, Dict, List, Literal, Type

import pytest
import torch
@@ -82,6 +82,10 @@ class BaseModelTest(ABC):
Subclasses must implement all abstract methods to provide model-specific
configuration, data preparation, and conversion functions.

Set ``is_autoregressive = True`` in subclasses for causal LM models to
enable generation / KV-cache smoke tests. Non-autoregressive models
(e.g. ESM2) leave the default ``False`` and those tests are skipped.

Example:
```python
class ESM2ModelTester(BioNeMoModelTester):
@@ -98,6 +102,8 @@ def get_upstream_model_id(self):
```
"""

is_autoregressive: bool = False

@abstractmethod
def get_model_class(self) -> Type[PreTrainedModel]:
"""Return the TransformerEngine model class to test.
@@ -885,13 +891,15 @@ def test_fp8_forward_and_backward_pass(self, fp8_recipe, input_format):
msg=lambda x: f"FP8 loss differs too much from BF16 loss: {x}",
)

def test_quantized_model_init_forward_and_backward(self, fp8_recipe, input_format):
def test_quantized_model_init_forward_and_backward(self, fp8_recipe, input_format, **config_kwargs):
"""Test that model initialized with FP8 works correctly."""
if input_format == "thd" and not HAS_DATA_CENTER_GPU:
pytest.xfail("Padded sequences are not supported on non-datacenter hardware for THD.")

model_class = self.get_model_class()
config = self.create_test_config(attn_input_format=input_format, self_attn_mask_type="padding_causal")
config = self.create_test_config(
attn_input_format=input_format, self_attn_mask_type="padding_causal", **config_kwargs
)

# Initialize with FP8
with transformer_engine.pytorch.quantized_model_init(recipe=fp8_recipe):
@@ -906,9 +914,8 @@ def test_quantized_model_init_forward_and_backward(self, fp8_recipe, input_forma
input_data["labels"] = input_data["input_ids"].clone()

# Forward and backward pass with FP8
with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
with transformer_engine.pytorch.autocast(recipe=fp8_recipe):
outputs = model(**input_data)
with transformer_engine.pytorch.autocast(recipe=fp8_recipe):
outputs = model(**input_data)

loss = outputs.loss
assert torch.isfinite(loss)
@@ -979,4 +986,121 @@ def test_meta_fp8_init(self, fp8_recipe):
model.init_empty_weights()
self.verify_model_parameters_initialized_correctly(model, should_be_fp8=True)

# ==================== Generation Tests (Autoregressive Models Only) ====================
@abstractmethod
def create_inference_params(self, config, batch_size=1, max_seq_len=256, num_beams=1) -> Any:
"""Create inference params for KV-cache generation tests.

Autoregressive model tests must override this method to provide
model-specific ``HFInferenceParams`` with allocated KV-cache memory.

Args:
config: Model configuration.
batch_size: Batch size.
max_seq_len: Maximum sequence length.
num_beams: Number of beams for beam search.

Returns:
HFInferenceParams instance with allocated memory.
"""
pass

def test_generate_without_cache(self):
"""Test basic generation without KV-cache (BSHD, use_cache=False)."""
if not self.is_autoregressive:
pytest.skip("Not an autoregressive model")

config = self.create_test_config(attn_input_format="bshd", self_attn_mask_type="causal")
model = self.get_model_class()(config).to("cuda").to(torch.bfloat16)
model.eval()

tokenizer = self.get_tokenizer()
prompt = "The quick brown fox jumps over"
inputs = tokenizer(prompt, return_tensors="pt")
inputs = {k: v.to("cuda") for k, v in inputs.items()}

with torch.no_grad():
output_ids = model.generate(**inputs, max_new_tokens=16, use_cache=False)

assert output_ids.shape[1] > inputs["input_ids"].shape[1]

def test_generate_with_cache(self):
"""Test single-prompt generation with KV-cache (THD format)."""
if not self.is_autoregressive:
pytest.skip("Not an autoregressive model")

config = self.create_test_config(attn_input_format="thd", self_attn_mask_type="padding_causal")
model = self.get_model_class()(config).to("cuda").to(torch.bfloat16)
model.eval()

tokenizer = self.get_tokenizer()
prompt = "The quick brown fox jumps over"
inputs = tokenizer(prompt, return_tensors="pt")
inputs = {k: v.to("cuda") for k, v in inputs.items()}

past_key_values = self.create_inference_params(config, batch_size=1)

with torch.no_grad():
output_ids = model.generate(**inputs, max_new_tokens=16, use_cache=True, past_key_values=past_key_values)

assert output_ids.shape[1] > inputs["input_ids"].shape[1]

def test_generate_with_cache_batched(self):
"""Test batched generation with KV-cache (left-padded BSHD converted to THD)."""
if not self.is_autoregressive:
pytest.skip("Not an autoregressive model")

config = self.create_test_config(attn_input_format="thd", self_attn_mask_type="padding_causal")
model = self.get_model_class()(config).to("cuda").to(torch.bfloat16)
model.eval()

tokenizer = self.get_tokenizer()
prompts = (
"The quick brown fox jumps over the lazy dog.",
"Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
)
inputs = tokenizer(prompts, return_tensors="pt", padding=True, padding_side="left")
inputs = {k: v.to("cuda") for k, v in inputs.items()}

past_key_values = self.create_inference_params(config, batch_size=2)

with torch.no_grad():
output_ids = model.generate(**inputs, max_new_tokens=16, use_cache=True, past_key_values=past_key_values)

assert output_ids.shape[0] == 2
assert output_ids.shape[1] > inputs["input_ids"].shape[1]

def test_generate_with_cache_beam_search(self):
"""Test batched generation with KV-cache and beam search."""
if not self.is_autoregressive:
pytest.skip("Not an autoregressive model")

config = self.create_test_config(attn_input_format="thd", self_attn_mask_type="padding_causal")
model = self.get_model_class()(config).to("cuda").to(torch.bfloat16)
model.eval()

tokenizer = self.get_tokenizer()
prompts = (
"The quick brown fox jumps over the lazy dog.",
"Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
)
inputs = tokenizer(prompts, return_tensors="pt", padding=True, padding_side="left")
inputs = {k: v.to("cuda") for k, v in inputs.items()}

num_beams = 2
past_key_values = self.create_inference_params(config, batch_size=2, num_beams=num_beams)

with torch.no_grad():
output_ids = model.generate(
**inputs,
max_new_tokens=16,
use_cache=True,
past_key_values=past_key_values,
num_beams=num_beams,
do_sample=True,
)

assert output_ids.shape[0] == 2
assert output_ids.shape[1] > inputs["input_ids"].shape[1]

# TODO: add multi-GPU tests, e.g., meta-device init after fully_shard, cp tests, etc.