19 changes: 18 additions & 1 deletion src/lighteval/logging/evaluation_tracker.py
@@ -92,6 +92,20 @@ def default(self, o): # noqa : C901
return type(o).__name__


MODEL_CONFIG_CREDENTIAL_FIELDS = {
"api_key",
"inference_server_auth",
}


def _redact_model_config_credentials(model_config: dict) -> dict:
model_config_dict = dict(model_config)
for field in MODEL_CONFIG_CREDENTIAL_FIELDS:
if field in model_config_dict and model_config_dict[field] is not None:
model_config_dict[field] = "REDACTED"
return model_config_dict


class EvaluationTracker:
"""Tracks and manages evaluation results, metrics, and logging for model evaluations.

@@ -211,7 +225,7 @@ def __init__(
@property
def results(self):
config_general = asdict(self.general_config_logger)
-config_general["model_config"] = config_general["model_config"].model_dump()
+config_general["model_config"] = _redact_model_config_credentials(config_general["model_config"].model_dump())
results = {
"config_general": config_general,
"results": self.metrics_logger.metric_aggregated,
@@ -376,6 +390,9 @@ def generate_final_dict(self) -> dict:
"summary_tasks": self.details_logger.compiled_details,
"summary_general": asdict(self.details_logger.compiled_details_over_all_tasks),
}
to_dump["config_general"]["model_config"] = _redact_model_config_credentials(
to_dump["config_general"]["model_config"].model_dump()
)

final_dict = {
k: {eval_name.replace("|", ":"): eval_score for eval_name, eval_score in v.items()}
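For reference, the new `_redact_model_config_credentials` helper makes a shallow copy of the config dict and masks only credential fields that are present and non-`None`; unrelated keys, and credentials that were never set, pass through unchanged. A minimal standalone sketch of that behavior (the helper and field set are copied from the diff above; the sample config dict uses made-up values):

```python
# Helper and field set copied from the diff above; only the demo at the
# bottom is new and uses illustrative values.
MODEL_CONFIG_CREDENTIAL_FIELDS = {
    "api_key",
    "inference_server_auth",
}


def _redact_model_config_credentials(model_config: dict) -> dict:
    # Shallow copy so the caller's dict is never mutated in place.
    model_config_dict = dict(model_config)
    for field in MODEL_CONFIG_CREDENTIAL_FIELDS:
        # Mask only credentials that are present and actually set;
        # absent or None values pass through untouched.
        if field in model_config_dict and model_config_dict[field] is not None:
            model_config_dict[field] = "REDACTED"
    return model_config_dict


config = {"model_name": "test_model", "api_key": "super-secret-key", "inference_server_auth": None}
print(_redact_model_config_credentials(config))
# {'model_name': 'test_model', 'api_key': 'REDACTED', 'inference_server_auth': None}
print(config["api_key"])  # super-secret-key (the original dict is untouched)
```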
74 changes: 74 additions & 0 deletions tests/unit/logging/test_evaluation_tracker.py
@@ -32,6 +32,9 @@

from lighteval.logging.evaluation_tracker import EvaluationTracker
from lighteval.logging.info_loggers import DetailsLogger
from lighteval.models.endpoints.litellm_model import LiteLLMModelConfig
from lighteval.models.endpoints.tgi_model import TGIModelConfig
from lighteval.pipeline import Pipeline

# ruff: noqa
from tests.fixtures import TESTING_EMPTY_HF_ORG_ID
@@ -128,6 +131,77 @@ def test_results_logging_template(self, mock_evaluation_tracker: EvaluationTracker):
assert saved_results["results"] == task_metrics
assert saved_results["config_general"]["model_name"] == "test_model"

def test_results_redacts_litellm_api_key(self, mock_evaluation_tracker: EvaluationTracker):
mock_evaluation_tracker.general_config_logger.log_model_info(
LiteLLMModelConfig(model_name="test_model", api_key="super-secret-key")
)

results = mock_evaluation_tracker.results

assert results["config_general"]["model_config"]["api_key"] == "REDACTED"

mock_evaluation_tracker.save()

results_dir = Path(mock_evaluation_tracker.output_dir) / "results" / "test_model"
result_files = list(results_dir.glob("results_*.json"))
assert len(result_files) == 1

with open(result_files[0], "r") as f:
saved_results = json.load(f)

assert saved_results["config_general"]["model_config"]["api_key"] == "REDACTED"
assert saved_results["config_general"]["model_config"]["model_name"] == "test_model"

def test_results_redacts_tgi_auth(self, mock_evaluation_tracker: EvaluationTracker):
mock_evaluation_tracker.general_config_logger.log_model_info(
TGIModelConfig(
model_name="test_model",
inference_server_address="http://localhost:8080",
inference_server_auth="super-secret-token",
)
)

results = mock_evaluation_tracker.results

assert results["config_general"]["model_config"]["inference_server_auth"] == "REDACTED"
assert results["config_general"]["model_config"]["model_name"] == "test_model"

def test_pipeline_get_results_redacts_litellm_api_key(self, mock_evaluation_tracker: EvaluationTracker):
mock_evaluation_tracker.general_config_logger.log_model_info(
LiteLLMModelConfig(model_name="test_model", api_key="super-secret-key")
)

pipeline = Pipeline.__new__(Pipeline)
pipeline.accelerator = None
pipeline.parallel_context = None
pipeline.final_dict = None
pipeline.evaluation_tracker = mock_evaluation_tracker

results = pipeline.get_results()

assert results["config_general"]["model_config"]["api_key"] == "REDACTED"
assert results["config_general"]["model_config"]["model_name"] == "test_model"

def test_pipeline_get_results_redacts_tgi_auth(self, mock_evaluation_tracker: EvaluationTracker):
mock_evaluation_tracker.general_config_logger.log_model_info(
TGIModelConfig(
model_name="test_model",
inference_server_address="http://localhost:8080",
inference_server_auth="super-secret-token",
)
)

pipeline = Pipeline.__new__(Pipeline)
pipeline.accelerator = None
pipeline.parallel_context = None
pipeline.final_dict = None
pipeline.evaluation_tracker = mock_evaluation_tracker

results = pipeline.get_results()

assert results["config_general"]["model_config"]["inference_server_auth"] == "REDACTED"
assert results["config_general"]["model_config"]["model_name"] == "test_model"

@pytest.mark.evaluation_tracker(save_details=True)
def test_details_logging(self, mock_evaluation_tracker, mock_datetime):
task_details = {
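The pipeline tests above build a `Pipeline` via `Pipeline.__new__`, bypassing `__init__`, so `get_results()` can be exercised without loading a model or accelerator. One way to sanity-check the same guarantee end to end is to serialize the whole results payload and scan it for the plaintext secret; the helper below is a hypothetical extra assertion, not part of this diff:

```python
import json


def assert_no_plaintext_secret(results: dict, secret: str = "super-secret-key") -> None:
    # Serializing the full payload catches a leak anywhere in the nested
    # structure, not just at config_general["model_config"]["api_key"].
    payload = json.dumps(results, default=str)
    assert secret not in payload
    assert "REDACTED" in payload


# Hypothetical usage inside one of the tests above:
# assert_no_plaintext_secret(mock_evaluation_tracker.results)
```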