diff --git a/judgearena/arenas_utils.py b/judgearena/arenas_utils.py
index ab61a2b..e85750f 100644
--- a/judgearena/arenas_utils.py
+++ b/judgearena/arenas_utils.py
@@ -5,6 +5,10 @@
 from fast_langdetect import detect_language
 from huggingface_hub import snapshot_download
+from judgearena.log import get_logger
+
+logger = get_logger(__name__)
+
 
 
 def _extract_instruction_text(turn: dict) -> str:
     """Extract plain instruction text from a conversation first turn.
@@ -157,8 +161,11 @@ def get_winner(
     df = df.loc[df.turns == 1]
     n_dropped = n_before - len(df)
     if n_dropped > 0:
-        print(
-            f"[{arena}] Dropped {n_dropped}/{n_before} multi-turn battles (keeping single-turn only)."
+        logger.info(
+            "[%s] Dropped %d/%d multi-turn battles (keeping single-turn only).",
+            arena,
+            n_dropped,
+            n_before,
         )
     return df
 
@@ -189,13 +196,17 @@ def load_arena_dataframe(
 
 def main():
     for arena in KNOWN_ARENAS:
-        print(f"Loading {arena}")
+        logger.info("Loading %s", arena)
         df = _load_arena_dataframe(arena)
         n_battles = len(df)
         n_models = len(set(df["model_a"]) | set(df["model_b"]))
         n_languages = df["lang"].nunique()
-        print(
-            f"{arena}: {n_battles} battles, {n_models} models, {n_languages} languages"
+        logger.info(
+            "%s: %d battles, %d models, %d languages",
+            arena,
+            n_battles,
+            n_models,
+            n_languages,
         )
 
 
diff --git a/judgearena/cli_common.py b/judgearena/cli_common.py
index cbaa696..58ce78b 100644
--- a/judgearena/cli_common.py
+++ b/judgearena/cli_common.py
@@ -29,6 +29,9 @@ class BaseCliArgs:
     chat_template: str | None = None
     result_folder: str = "results"
     engine_kwargs: dict = field(default_factory=dict)
+    verbosity: int = 0
+    log_file: str | None = None
+    no_log_file: bool = False
 
     def __post_init__(self):
         supported_modes = ["fixed", "both"]
@@ -157,6 +160,38 @@ def add_common_arguments(parser: argparse.ArgumentParser) -> None:
             '\'{"tensor_parallel_size": 2, "gpu_memory_utilization": 0.9}\'.'
         ),
     )
+    parser.add_argument(
+        "-v",
+        "--verbose",
+        action="count",
+        default=0,
+        help="Increase logging verbosity. Use -v for DEBUG output.",
+    )
+    parser.add_argument(
+        "-q",
+        "--quiet",
+        action="store_true",
+        default=False,
+        help="Suppress all output except warnings and errors.",
+    )
+    parser.add_argument(
+        "--log-file",
+        dest="log_file",
+        type=str,
+        default=None,
+        help=(
+            "Write the full DEBUG log to this file in addition to the "
+            "console output. By default a timestamped run-*.log is saved "
+            "automatically in the result folder."
+        ),
+    )
+    parser.add_argument(
+        "--no-log-file",
+        dest="no_log_file",
+        action="store_true",
+        default=False,
+        help="Disable automatic file logging in the result folder.",
+    )
 
 
 def parse_engine_kwargs(raw: str) -> dict:
@@ -168,3 +203,13 @@
     except Exception as e:
         raise SystemExit(f"Failed to parse --engine_kwargs: {e}") from e
     return engine_kwargs
+
+
+def resolve_verbosity(args: argparse.Namespace) -> int:
+    """Derive a single verbosity int from ``-v`` / ``-q`` flags.
+
+    Returns ``-1`` for quiet, ``0`` for default (INFO), ``1+`` for verbose.
+    """
+    if getattr(args, "quiet", False):
+        return -1
+    return getattr(args, "verbose", 0)
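Review note: the flag-to-level contract introduced above can be pinned down with a few asserts. A minimal sketch, not part of the patch; it only exercises resolve_verbosity as defined in cli_common.py:

    from argparse import Namespace

    from judgearena.cli_common import resolve_verbosity

    assert resolve_verbosity(Namespace(verbose=0, quiet=False)) == 0   # default -> INFO
    assert resolve_verbosity(Namespace(verbose=2, quiet=False)) == 2   # -vv -> DEBUG
    assert resolve_verbosity(Namespace(verbose=2, quiet=True)) == -1   # -q wins -> WARNING

The quiet-wins behaviour is also covered by test_resolve_verbosity_quiet_overrides_verbose in tests/test_logging.py below.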
+ """ + if getattr(args, "quiet", False): + return -1 + return getattr(args, "verbose", 0) diff --git a/judgearena/estimate_elo_ratings.py b/judgearena/estimate_elo_ratings.py index d7dfbd7..1ff726e 100644 --- a/judgearena/estimate_elo_ratings.py +++ b/judgearena/estimate_elo_ratings.py @@ -8,11 +8,19 @@ from sklearn.linear_model import LogisticRegression from judgearena.arenas_utils import _extract_instruction_text, load_arena_dataframe -from judgearena.cli_common import BaseCliArgs, add_common_arguments, parse_engine_kwargs +from judgearena.cli_common import ( + BaseCliArgs, + add_common_arguments, + parse_engine_kwargs, + resolve_verbosity, +) from judgearena.evaluate import judge_and_parse_prefs from judgearena.generate import generate_instructions +from judgearena.log import configure_logging, get_logger from judgearena.utils import cache_function_dataframe, compute_pref_summary, make_model +logger = get_logger(__name__) + @dataclass class CliEloArgs(BaseCliArgs): @@ -106,6 +114,9 @@ def parse_args(cls): chat_template=args.chat_template, result_folder=args.result_folder, engine_kwargs=parse_engine_kwargs(args.engine_kwargs), + verbosity=resolve_verbosity(args), + log_file=args.log_file, + no_log_file=args.no_log_file, ) @@ -228,7 +239,7 @@ def main(args: CliEloArgs | None = None) -> dict: rng = np.random.default_rng(args.seed) # Step 1: Load arena battles - print(f"\n=== Step 1: Loading battles from {args.arena} ===") + logger.info("Step 1: Loading battles from %s", args.arena) df_arena_all = load_arena_dataframe(arena=args.arena) # Filter by language if specified @@ -250,7 +261,7 @@ def main(args: CliEloArgs | None = None) -> dict: df_battles = df_battles.reset_index(drop=True) n = len(df_battles) - print(f"Loaded {n} battles.") + logger.info("Loaded %d battles.", n) # Extract user instructions (first turn of conversation_a) instructions = pd.Series( @@ -260,10 +271,10 @@ def main(args: CliEloArgs | None = None) -> dict: ], name="instruction", ) - print(f"\nFirst instruction:\n{instructions.iloc[0][:300]}\n") + logger.debug("First instruction:\n%s", instructions.iloc[0][:300]) # Step 2: Generate completions for the model under evaluation - print(f"=== Step 2: Generating completions with {args.model} ===") + logger.info("Step 2: Generating completions with %s", args.model) # Only pass extra engine kwargs that are not None extra_kwargs = dict(args.engine_kwargs) @@ -297,8 +308,11 @@ def replace_slash(s: str) -> str: ) if len(cache_suffix) > 100: cache_hash = hashlib.sha256(cache_suffix.encode()).hexdigest()[:16] - print( - f"Cache suffix too long ({len(cache_suffix)} chars), using hash: {cache_hash} (full: {cache_suffix})" + logger.debug( + "Cache suffix too long (%d chars), using hash: %s (full: %s)", + len(cache_suffix), + cache_hash, + cache_suffix, ) cache_suffix = cache_hash completions_df = cache_function_dataframe( @@ -308,10 +322,10 @@ def replace_slash(s: str) -> str: ).set_index("instruction_index") completions = completions_df.loc[:, "completion"] - print(f"First completion:\n{completions.iloc[0]}\n") + logger.debug("First completion:\n%s", completions.iloc[0]) # Step 3: Judge evaluation against randomly picked arena opponents - print(f"=== Step 3: Judge evaluation with {args.judge_model} ===") + logger.info("Step 3: Judge evaluation with %s", args.judge_model) # For each battle, randomly pick opponent: model_a or model_b from the arena use_model_a_as_opponent = rng.choice([True, False], size=n) @@ -390,7 +404,7 @@ def run_judge() -> pd.DataFrame: opponent_models = 
df_judge["opponent_model"].tolist() prefs = df_judge["pref"].tolist() - print(f"First judge output:\n{df_judge['judge_completion'].iloc[0][:500]}\n") + logger.debug("First judge output:\n%s", df_judge["judge_completion"].iloc[0][:500]) # Map preferences back to model-name-level battle results model_name = args.model @@ -499,7 +513,9 @@ def run_judge() -> pd.DataFrame: def cli(): - main() + args = CliEloArgs.parse_args() + configure_logging(args.verbosity, log_file=args.log_file) + main(args) if __name__ == "__main__": diff --git a/judgearena/eval_utils.py b/judgearena/eval_utils.py index e56e1e9..a083569 100644 --- a/judgearena/eval_utils.py +++ b/judgearena/eval_utils.py @@ -7,8 +7,11 @@ import pandas as pd from judgearena.evaluate import PairScore, annotate_battles +from judgearena.log import get_logger from judgearena.utils import compute_pref_summary +logger = get_logger(__name__) + def print_results(results): """Print battle results in a readable format.""" @@ -124,8 +127,10 @@ def _make_judge_annotation( combined_metadata = list(metadata) if swap_mode == "both": - print("Correction for judge bias towards a certain model position is set.") - print("Evaluating completions with models reversed.") + logger.info( + "Correction for judge bias towards a certain model position is set." + ) + logger.info("Evaluating completions with models reversed.") annotations_reversed = annotate_battles( judge_chat_model=judge_chat_model, instructions=instructions, diff --git a/judgearena/evaluate.py b/judgearena/evaluate.py index de1c1c1..7eb8599 100644 --- a/judgearena/evaluate.py +++ b/judgearena/evaluate.py @@ -14,6 +14,7 @@ download_arena_hard, is_arena_hard_dataset, ) +from judgearena.log import get_logger from judgearena.repro import _to_jsonable, write_run_metadata from judgearena.utils import ( compute_pref_summary, @@ -24,6 +25,8 @@ truncate, ) +logger = get_logger(__name__) + class PairScore: def __init__(self): @@ -157,13 +160,13 @@ def evaluate_completions( def get_output(df_outputs: pd.DataFrame, dataset: str, method: str): if Path(method).exists(): - print(f"Path {method} exists, loads local model completions.") + logger.info("Path %s exists, loading local model completions.", method) df = read_df(Path(method)).set_index("instruction_index").sort_index() - print(f"Loaded {len(df)} completions.") + logger.info("Loaded %d completions.", len(df)) df.loc[:, "output"] = df.loc[:, "output"].fillna("") return df.loc[:, "output"] else: - print(f"Loading {method} from {dataset} dataset.") + logger.info("Loading %s from %s dataset.", method, dataset) assert method in df_outputs.columns, ( f"Method {method} not present, pick among {df_outputs.columns.tolist()}" ) @@ -186,7 +189,7 @@ def get_output(df_outputs: pd.DataFrame, dataset: str, method: str): unique_string = dataset + "-" + datetime.now().strftime("%Y%m%d_%H%M%S") output_folder = data_root / "judge-evals" / unique_string - print(f"Saving results in {output_folder}") + logger.info("Saving results in %s", output_folder) output_folder.mkdir(parents=True, exist_ok=True) ( judge_system_prompt, @@ -216,7 +219,7 @@ def get_output(df_outputs: pd.DataFrame, dataset: str, method: str): results = {**compute_pref_summary(prefs)} pd.DataFrame(annotations).to_csv(output_folder / "annotations.csv", index=False) - print(f"{method_A} against {method_B}:\n{results}") + logger.info("%s against %s:\n%s", method_A, method_B, results) with open(output_folder / "results.json", "w") as f: json.dump(_to_jsonable(results), f, allow_nan=False) @@ -248,7 +251,7 @@ def 
@@ -248,7 +251,7 @@ def get_output(df_outputs: pd.DataFrame, dataset: str, method: str):
             started_at_utc=run_started_at,
         )
     except OSError as e:
-        print(f"Warning: failed to write run metadata: {e}")
+        logger.warning("Failed to write run metadata: %s", e)
 
 
 @dataclass
@@ -324,7 +327,7 @@ def annotate_battles(
             )
         ]
     )
-    print(f"Start LLM judge annotation ({len(inputs)} annotations).")
+    logger.info("Start LLM judge annotation (%d annotations).", len(inputs))
     judge_completions = do_inference(
         chat_model=judge_chat_model,
         inputs=inputs,
@@ -373,9 +376,12 @@ def judge_and_parse_prefs(
             already combined for swap_mode="both"
     """
     if swap_mode == "both":
-        print("Correction for judge bias towards a certain model position is set.")
-        print(
-            f"Evaluating completions with models reversed with judge {judge_chat_model}."
+        logger.info(
+            "Correction for judge bias towards a certain model position is set."
+        )
+        logger.info(
+            "Evaluating completions with models reversed with judge %s.",
+            judge_chat_model,
         )
 
     annotations = annotate_battles(
diff --git a/judgearena/generate_and_evaluate.py b/judgearena/generate_and_evaluate.py
index ee2062e..7c70253 100644
--- a/judgearena/generate_and_evaluate.py
+++ b/judgearena/generate_and_evaluate.py
@@ -12,7 +12,12 @@
 
 import pandas as pd
 
-from judgearena.cli_common import BaseCliArgs, add_common_arguments, parse_engine_kwargs
+from judgearena.cli_common import (
+    BaseCliArgs,
+    add_common_arguments,
+    parse_engine_kwargs,
+    resolve_verbosity,
+)
 from judgearena.evaluate import judge_and_parse_prefs, resolve_judge_prompts
 from judgearena.generate import generate_base, generate_instructions
 from judgearena.instruction_dataset import load_instructions
@@ -20,6 +25,12 @@
     download_arena_hard,
     is_arena_hard_dataset,
 )
+from judgearena.log import (
+    attach_file_handler,
+    configure_logging,
+    get_logger,
+    make_run_log_path,
+)
 from judgearena.mt_bench.mt_bench_utils import run_mt_bench
 from judgearena.repro import _to_jsonable, write_run_metadata
 from judgearena.utils import (
@@ -31,6 +42,8 @@
     read_df,
 )
 
+logger = get_logger(__name__)
+
 
 def try_load_dataset_completions(
     dataset: str, model: str, n_instructions: int | None
@@ -60,7 +73,9 @@
     ).sort_index()
     if model not in df_outputs.columns:
         return None
-    print(f"Found pre-existing completions for '{model}' in dataset '{dataset}'.")
+    logger.info(
+        "Found pre-existing completions for '%s' in dataset '%s'.", model, dataset
+    )
     completions = df_outputs.loc[:, model]
     if n_instructions is not None:
         completions = completions.head(n_instructions)
@@ -127,6 +142,9 @@ def parse_args(cls):
             chat_template=args.chat_template,
             result_folder=args.result_folder,
             engine_kwargs=parse_engine_kwargs(args.engine_kwargs),
+            verbosity=resolve_verbosity(args),
+            log_file=args.log_file,
+            no_log_file=args.no_log_file,
         )
 
 
@@ -167,8 +185,23 @@ def main(args: CliArgs):
     """
     run_started_at = datetime.now(UTC)
 
-    print(
-        f"Using dataset {args.dataset} and evaluating models {args.model_A} and {args.model_B}."
+
+    # Build the result folder early so the file handler captures the entire run.
+    # Include a timestamp so each run gets its own unique directory.
+ name = f"{args.dataset}-{args.model_A}-{args.model_B}-{args.judge_model}" + name += f"-{args.swap_mode}" + name = name.replace("/", "_") + run_ts = run_started_at.strftime("%Y%m%d_%H%M%S") + res_folder = Path(args.result_folder) / f"{name}-{run_ts}" + res_folder.mkdir(parents=True, exist_ok=True) + if not args.no_log_file: + attach_file_handler(make_run_log_path(res_folder)) + + logger.info( + "Using dataset %s and evaluating models %s and %s.", + args.dataset, + args.model_A, + args.model_B, ) # Not working with vllm, not detecting model changes and serving the same cache for two different models... @@ -177,7 +210,12 @@ def main(args: CliArgs): ignore_cache = args.ignore_cache if args.dataset == "mt-bench": - return run_mt_bench(args, ignore_cache) + return run_mt_bench( + args, + ignore_cache, + res_folder=res_folder, + result_name=name, + ) # Currrently, we run context evaluation is_fluency_task = "fluency" in args.dataset @@ -195,9 +233,12 @@ def main(args: CliArgs): if args.n_instructions is not None: instructions = instructions[:n_instructions] - print( - f"Generating completions for dataset {args.dataset} with model {args.model_A} and " - f"{args.model_B} (or loading them directly if present)" + logger.info( + "Generating completions for dataset %s with model %s and %s " + "(or loading them directly if present)", + args.dataset, + args.model_A, + args.model_B, ) # TODO currently we just support base models for fluency, we could also support instruction-tuned models @@ -259,13 +300,10 @@ def main(args: CliArgs): cache_name=f"{args.dataset}_{args.model_B}_{args.n_instructions}", ).set_index("instruction_index") completions_B = completions_B.loc[:, "completion"] - print(f"\nFirst instruction/context: {instructions.values[0]}") - - print(f"\nFirst completion of {args.model_A}") - print(completions_A.values[0]) - print(f"\nFirst completion of {args.model_B}") - print(completions_B.values[0]) - print(f"Evaluating completions with judge {args.judge_model}.") + logger.debug("First instruction/context: %s", instructions.values[0]) + logger.debug("First completion of %s:\n%s", args.model_A, completions_A.values[0]) + logger.debug("First completion of %s:\n%s", args.model_B, completions_B.values[0]) + logger.info("Evaluating completions with judge %s.", args.judge_model) judge_chat_model = make_model( model=args.judge_model, @@ -275,18 +313,11 @@ def main(args: CliArgs): **args.engine_kwargs, ) - name = f"{args.dataset}-{args.model_A}-{args.model_B}-{args.judge_model}" - name += f"-{args.swap_mode}" - name = name.replace("/", "_") - - res_folder = Path(args.result_folder) / name - res_folder.mkdir(parents=True, exist_ok=True) - # save argument for results analysis with open(res_folder / f"args-{name}.json", "w") as f: json.dump(asdict(args), f, indent=2) - print(f"Saving results to {res_folder}") + logger.info("Saving results to %s", res_folder) if is_fluency_task: system_prompt = """You are a highly efficient assistant, who evaluates and selects the best large language \ model based on the quality of completion of a sentence. 
 model based on the quality of completion of a sentence. You will see a sentence to be completed and two \
@@ -346,7 +377,7 @@
         **summary,
         "preferences": prefs.tolist(),
     }
-    print(f"{args.model_A} vs {args.model_B} judged by {args.judge_model}")
+    logger.info("%s vs %s judged by %s", args.model_A, args.model_B, args.judge_model)
     print_results(results)
 
     with open(res_folder / f"results-{name}.json", "w") as f:
@@ -374,14 +405,15 @@
             started_at_utc=run_started_at,
         )
     except OSError as e:
-        print(f"Warning: failed to write run metadata: {e}")
+        logger.warning("Failed to write run metadata: %s", e)
 
     return prefs
 
 
 def cli():
     args = CliArgs.parse_args()
-    print(f"Running with CLI args: {args.__dict__}")
+    configure_logging(args.verbosity, log_file=args.log_file)
+    logger.debug("Running with CLI args: %s", args.__dict__)
     main(args)
 
 
diff --git a/judgearena/instruction_dataset/__init__.py b/judgearena/instruction_dataset/__init__.py
index 4681f0a..5f5d9f7 100644
--- a/judgearena/instruction_dataset/__init__.py
+++ b/judgearena/instruction_dataset/__init__.py
@@ -5,8 +5,11 @@
     is_arena_hard_dataset,
 )
 from judgearena.instruction_dataset.m_arenahard import load_m_arenahard
+from judgearena.log import get_logger
 from judgearena.utils import data_root, download_hf, read_df
 
+logger = get_logger(__name__)
+
 
 def load_instructions(dataset: str, n_instructions: int | None = None) -> pd.DataFrame:
     if dataset == "mt-bench":
@@ -47,7 +50,9 @@ def load_instructions(dataset: str, n_instructions: int | None = None) -> pd.Dat
             "zh",
             "EU",
         ]
-        print(f"Loading m-arena-hard with language specification set to {language}")
+        logger.info(
+            "Loading m-arena-hard with language specification set to %s", language
+        )
         df_instructions = load_m_arenahard(local_path=data_root, language=language)
 
         # sort by question_id, then language so that we get multiple languages if we truncate
@@ -75,7 +80,7 @@
 
     df_instructions = read_df(local_path_tables / "instructions" / f"{dataset}.csv")
     df_instructions = df_instructions.set_index("instruction_index").sort_index()
-    print(f"Loaded {len(df_instructions)} instructions for {dataset}.")
+    logger.info("Loaded %d instructions for %s.", len(df_instructions), dataset)
     if n_instructions is None:
         n_instructions = len(df_instructions)
     return df_instructions.head(n_instructions)
diff --git a/judgearena/log.py b/judgearena/log.py
new file mode 100644
index 0000000..a2a35c7
--- /dev/null
+++ b/judgearena/log.py
@@ -0,0 +1,156 @@
+"""Structured logging for judgearena.
+
+Provides a thin wrapper around Python's ``logging`` module so that every
+module in the package can do::
+
+    from judgearena.log import get_logger
+    logger = get_logger(__name__)
+    logger.info("Loaded %d instructions", n)
+
+Call :func:`configure_logging` once from the CLI entrypoint to set the
+verbosity level based on ``-v`` / ``-q`` flags. The default level is
+``INFO``, which mirrors the current ``print()`` behaviour.
+""" + +from __future__ import annotations + +import logging +import os +import sys +from datetime import datetime +from pathlib import Path + +_ROOT_LOGGER_NAME = "judgearena" + +_CONSOLE_FORMAT = "%(asctime)s [%(levelname)s] %(name)s: %(message)s" +_FILE_FORMAT = "%(asctime)s [%(levelname)s] %(name)s: %(message)s" +_CONSOLE_DATEFMT = "%H:%M:%S" +_FILE_DATEFMT = "%Y-%m-%d %H:%M:%S" + + +def _resolve_log_path(log_file: str | Path) -> Path: + return Path(log_file).expanduser().resolve() + + +def get_logger(name: str | None = None) -> logging.Logger: + """Return a logger under the ``judgearena`` namespace. + + Typical usage at the top of a module:: + + logger = get_logger(__name__) + + The returned logger inherits its level and handlers from the root + ``judgearena`` logger configured by :func:`configure_logging`. + """ + if name is None: + return logging.getLogger(_ROOT_LOGGER_NAME) + # Strip the common prefix so child loggers are short + # e.g. "judgearena.utils" → child of "judgearena" + if not name.startswith(_ROOT_LOGGER_NAME): + name = f"{_ROOT_LOGGER_NAME}.{name}" + return logging.getLogger(name) + + +def configure_logging( + verbosity: int = 0, + log_file: str | Path | None = None, +) -> None: + """Set up the root ``judgearena`` logger. + + Args: + verbosity: Controls the console log level. + + * ``-1`` (``-q``) → ``WARNING`` – only warnings and errors. + * ``0`` (default) → ``INFO`` – progress messages (mirrors + the old ``print()`` behaviour). + * ``1+`` (``-v``) → ``DEBUG`` – verbose: data previews, + cache hits, first-completion dumps, etc. + + The environment variable ``JUDGEARENA_LOG_LEVEL`` overrides the + CLI flags when set (e.g. ``JUDGEARENA_LOG_LEVEL=DEBUG``). + + log_file: Optional path to a log file. When provided, a + ``FileHandler`` at ``DEBUG`` level is attached so that the full + trace is always available on disk regardless of the console + verbosity. + """ + root = logging.getLogger(_ROOT_LOGGER_NAME) + + # Resolve level: env-var beats CLI flags. + if verbosity < 0: + level = logging.WARNING + elif verbosity == 0: + level = logging.INFO + else: + level = logging.DEBUG + + env_level = os.environ.get("JUDGEARENA_LOG_LEVEL") + if env_level: + level = getattr(logging, env_level.upper(), level) + + root.setLevel(min(level, logging.DEBUG)) # root allows everything; handlers filter + + # --- console handler --- + # Avoid duplicate handlers when configure_logging is called more than once + # (e.g. in tests). + if not root.handlers: + handler = logging.StreamHandler(sys.stderr) + handler.setLevel(level) + handler.setFormatter( + logging.Formatter(_CONSOLE_FORMAT, datefmt=_CONSOLE_DATEFMT) + ) + root.addHandler(handler) + else: + for h in root.handlers: + if isinstance(h, logging.StreamHandler) and not isinstance( + h, logging.FileHandler + ): + h.setLevel(level) + + # --- file handler (explicit --log-file) --- + if log_file is not None: + attach_file_handler(log_file) + + +def attach_file_handler( + log_file: str | Path, + level: int = logging.DEBUG, +) -> logging.FileHandler: + """Attach a file handler to the root ``judgearena`` logger. + + This is called automatically by :func:`configure_logging` when + ``log_file`` is provided, and can also be called later — for example + once the result folder is known — to save a ``run.log`` artifact + alongside evaluation results. + + Returns the handler so the caller can remove it if needed. 
+ """ + root = logging.getLogger(_ROOT_LOGGER_NAME) + resolved_log_file = _resolve_log_path(log_file) + + for handler in root.handlers: + if not isinstance(handler, logging.FileHandler): + continue + handler_path = getattr(handler, "baseFilename", None) + if handler_path is None: + continue + if Path(handler_path).resolve() == resolved_log_file: + handler.setLevel(level) + handler.setFormatter(logging.Formatter(_FILE_FORMAT, datefmt=_FILE_DATEFMT)) + return handler + + resolved_log_file.parent.mkdir(parents=True, exist_ok=True) + fh = logging.FileHandler(resolved_log_file) + fh.setLevel(level) + fh.setFormatter(logging.Formatter(_FILE_FORMAT, datefmt=_FILE_DATEFMT)) + root.addHandler(fh) + return fh + + +def make_run_log_path(folder: str | Path) -> Path: + """Return a timestamped log path like ``folder/run-20260414_123456.log``. + + The timestamp ensures multiple runs never overwrite each other. + """ + ts = datetime.now().strftime("%Y%m%d_%H%M%S") + return Path(folder) / f"run-{ts}.log" diff --git a/judgearena/mt_bench/mt_bench_utils.py b/judgearena/mt_bench/mt_bench_utils.py index 02277e0..b3149f0 100644 --- a/judgearena/mt_bench/mt_bench_utils.py +++ b/judgearena/mt_bench/mt_bench_utils.py @@ -18,6 +18,7 @@ from judgearena.eval_utils import _compute_grouped_stats, print_results from judgearena.generate import generate_multiturn from judgearena.instruction_dataset import load_instructions +from judgearena.log import get_logger from judgearena.mt_bench.fastchat_compat import ( FASTCHAT_TEMPERATURE_CONFIG, judge_mt_bench_pairwise_fastchat, @@ -25,6 +26,8 @@ from judgearena.repro import _to_jsonable from judgearena.utils import cache_function_dataframe, compute_pref_summary, make_model +logger = get_logger(__name__) + if TYPE_CHECKING: from judgearena.generate_and_evaluate import CliArgs @@ -73,26 +76,25 @@ def _build_mt_bench_result_name(args: CliArgs, suffix: str | None = None) -> str def _save_mt_bench_results( *, args: CliArgs, + res_folder: Path, + result_name: str, results: dict[str, object], annotations_df: pd.DataFrame, - name_suffix: str | None = None, ) -> None: - name = _build_mt_bench_result_name(args, suffix=name_suffix) - res_folder = Path(args.result_folder) / name - res_folder.mkdir(parents=True, exist_ok=True) - - with open(res_folder / f"args-{name}.json", "w") as f: + with open(res_folder / f"args-{result_name}.json", "w") as f: json.dump(_to_jsonable(asdict(args)), f, indent=2, allow_nan=False) - annotations_df.to_csv(res_folder / f"{name}-annotations.csv", index=False) + annotations_df.to_csv(res_folder / f"{result_name}-annotations.csv", index=False) - with open(res_folder / f"results-{name}.json", "w") as f: + with open(res_folder / f"results-{result_name}.json", "w") as f: json.dump(_to_jsonable(results), f, indent=2, allow_nan=False) def _run_mt_bench_fastchat( *, args: CliArgs, + res_folder: Path, + result_name: str, questions_df: pd.DataFrame, completions_a: pd.DataFrame, completions_b: pd.DataFrame, @@ -131,18 +133,27 @@ def _run_mt_bench_fastchat( print_results(results) _save_mt_bench_results( args=args, + res_folder=res_folder, + result_name=result_name, results=results, annotations_df=pd.DataFrame(annotations), - name_suffix="mtbench", ) return prefs -def run_mt_bench(args: CliArgs, ignore_cache: bool): +def run_mt_bench( + args: CliArgs, + ignore_cache: bool, + *, + res_folder: Path, + result_name: str, +): """MT-Bench pipeline with FastChat-compatible pairwise judging.""" questions_df = load_instructions("mt-bench", n_instructions=args.n_instructions) - 
diff --git a/judgearena/mt_bench/mt_bench_utils.py b/judgearena/mt_bench/mt_bench_utils.py
index 02277e0..b3149f0 100644
--- a/judgearena/mt_bench/mt_bench_utils.py
+++ b/judgearena/mt_bench/mt_bench_utils.py
@@ -18,6 +18,7 @@
 from judgearena.eval_utils import _compute_grouped_stats, print_results
 from judgearena.generate import generate_multiturn
 from judgearena.instruction_dataset import load_instructions
+from judgearena.log import get_logger
 from judgearena.mt_bench.fastchat_compat import (
     FASTCHAT_TEMPERATURE_CONFIG,
     judge_mt_bench_pairwise_fastchat,
@@ -25,6 +26,8 @@
 from judgearena.repro import _to_jsonable
 from judgearena.utils import cache_function_dataframe, compute_pref_summary, make_model
 
+logger = get_logger(__name__)
+
 if TYPE_CHECKING:
     from judgearena.generate_and_evaluate import CliArgs
 
@@ -73,26 +76,25 @@ def _build_mt_bench_result_name(args: CliArgs, suffix: str | None = None) -> str
 def _save_mt_bench_results(
     *,
     args: CliArgs,
+    res_folder: Path,
+    result_name: str,
     results: dict[str, object],
     annotations_df: pd.DataFrame,
-    name_suffix: str | None = None,
 ) -> None:
-    name = _build_mt_bench_result_name(args, suffix=name_suffix)
-    res_folder = Path(args.result_folder) / name
-    res_folder.mkdir(parents=True, exist_ok=True)
-
-    with open(res_folder / f"args-{name}.json", "w") as f:
+    with open(res_folder / f"args-{result_name}.json", "w") as f:
         json.dump(_to_jsonable(asdict(args)), f, indent=2, allow_nan=False)
 
-    annotations_df.to_csv(res_folder / f"{name}-annotations.csv", index=False)
+    annotations_df.to_csv(res_folder / f"{result_name}-annotations.csv", index=False)
 
-    with open(res_folder / f"results-{name}.json", "w") as f:
+    with open(res_folder / f"results-{result_name}.json", "w") as f:
         json.dump(_to_jsonable(results), f, indent=2, allow_nan=False)
 
 
 def _run_mt_bench_fastchat(
     *,
     args: CliArgs,
+    res_folder: Path,
+    result_name: str,
     questions_df: pd.DataFrame,
     completions_a: pd.DataFrame,
     completions_b: pd.DataFrame,
@@ -131,18 +133,27 @@ def _run_mt_bench_fastchat(
     print_results(results)
     _save_mt_bench_results(
         args=args,
+        res_folder=res_folder,
+        result_name=result_name,
         results=results,
         annotations_df=pd.DataFrame(annotations),
-        name_suffix="mtbench",
     )
     return prefs
 
 
-def run_mt_bench(args: CliArgs, ignore_cache: bool):
+def run_mt_bench(
+    args: CliArgs,
+    ignore_cache: bool,
+    *,
+    res_folder: Path,
+    result_name: str,
+):
     """MT-Bench pipeline with FastChat-compatible pairwise judging."""
     questions_df = load_instructions("mt-bench", n_instructions=args.n_instructions)
-    print(
-        f"Generating multi-turn completions for MT-Bench with {args.model_A} and {args.model_B}."
+    logger.info(
+        "Generating multi-turn completions for MT-Bench with %s and %s.",
+        args.model_A,
+        args.model_B,
     )
     completions_a, completions_b = _generate_mt_bench_completions(
         args=args,
@@ -158,6 +169,8 @@
     )
     return _run_mt_bench_fastchat(
         args=args,
+        res_folder=res_folder,
+        result_name=result_name,
         questions_df=questions_df,
         completions_a=completions_a,
         completions_b=completions_b,
diff --git a/judgearena/utils.py b/judgearena/utils.py
index b6b0b8d..573fd76 100644
--- a/judgearena/utils.py
+++ b/judgearena/utils.py
@@ -12,11 +12,15 @@
 from langchain_core.globals import set_llm_cache
 from langchain_openai import ChatOpenAI
 from tqdm.asyncio import tqdm
+from tqdm.contrib.logging import logging_redirect_tqdm
 
 from judgearena.instruction_dataset.arena_hard import (
     download_arena_hard,
     is_arena_hard_dataset,
 )
+from judgearena.log import get_logger
+
+logger = get_logger(__name__)
 
 
 def _data_root_path() -> Path:
@@ -144,8 +148,12 @@ async def process_single(input_item, max_retries=5, base_delay=1.0):
                 if attempt == max_retries - 1 or not _is_retryable_error(e):
                     raise
                 delay = base_delay * (2**attempt)
-                print(
-                    f"Retry because of a server error, {attempt + 1}/{max_retries}: {e}. Waiting {delay}s..."
+                logger.warning(
+                    "Retry because of a server error, %d/%d: %s. Waiting %ss...",
+                    attempt + 1,
+                    max_retries,
+                    e,
+                    delay,
                 )
                 await asyncio.sleep(delay)
 
@@ -153,7 +161,7 @@
         results = await asyncio.gather(*[process_single(inp) for inp in inputs])
         return results
 
-    with tqdm(total=len(inputs)) as pbar:
+    with logging_redirect_tqdm(), tqdm(total=len(inputs)) as pbar:
         res = asyncio.run(
             process_with_real_progress(
                 chat_model=chat_model, inputs=inputs, pbar=pbar
@@ -179,8 +187,13 @@ def batch_with_retry(batch_inputs, max_retries=5, base_delay=1.0):
                     raise
                 delay = base_delay * (2**attempt)
                 next_chunks = 4 ** (attempt + 1)
-                print(
-                    f"Retry because of a server error, {attempt + 1}/{max_retries}: {e}. Waiting {delay}s, then splitting into {next_chunks} chunks..."
+                logger.warning(
+                    "Retry because of a server error, %d/%d: %s. Waiting %ss, then splitting into %d chunks...",
+                    attempt + 1,
+                    max_retries,
+                    e,
+                    delay,
+                    next_chunks,
                 )
                 time.sleep(delay)
 
@@ -274,7 +287,7 @@ def __init__(
         if chat_template:
             self.chat_template = chat_template
             self._use_generate = False
-            print(f"ChatVLLM: using explicit chat template for '{model}'")
+            logger.info("ChatVLLM: using explicit chat template for '%s'", model)
         else:
             tokenizer = self.llm.get_tokenizer()
             if not getattr(tokenizer, "chat_template", None):
@@ -289,7 +302,7 @@
             else:
                 self.chat_template = None  # let vLLM use the tokenizer's own
                 self._use_generate = False
-                print(f"ChatVLLM: using tokenizer's chat template for '{model}'")
+                logger.info("ChatVLLM: using tokenizer's chat template for '%s'", model)
 
     def _to_messages(self, input_item) -> list[dict]:
         """Convert LangChain prompt input to OpenAI-style messages."""
@@ -399,7 +412,7 @@ def make_model(model: str, max_tokens: int | None = 8192, **engine_kwargs):
         return DummyModel(model)
 
     model_name = "/".join(model.split("/")[1:])
-    print(f"Loading {model_provider}(model={model_name})")
+    logger.info("Loading %s(model=%s)", model_provider, model_name)
 
     # Use our custom ChatVLLM wrapper which properly applies chat templates
    if model_provider == "VLLM":
@@ -434,13 +447,13 @@
         model_classes.append(Together)
     except ImportError as e:
-        print(str(e))
+        logger.debug("Optional provider not available: %s", e)
     try:
         from langchain_openai.llms import OpenAI
 
         model_classes.append(OpenAI)
     except ImportError as e:
-        print(str(e))
+        logger.debug("Optional provider not available: %s", e)
     model_cls_dict = {model_cls.__name__: model_cls for model_cls in model_classes}
     assert model_provider in model_cls_dict, (
         f"{model_provider} not available, choose among {list(model_cls_dict.keys())}"
     )
@@ -449,7 +462,7 @@
 
 
 def download_all():
-    print(f"Downloading all dataset in {data_root}")
+    logger.info("Downloading all datasets in %s", data_root)
     local_path_tables = data_root / "tables"
     for dataset in [
         "alpaca-eval",
@@ -492,7 +505,7 @@ def __exit__(self, *args):
         self.end = time.time()
         self.duration = self.end - self.start
         if self.verbose:
-            print(self)
+            logger.info("%s", self)
 
     def __str__(self):
         name = self.name if self.name else "block"
@@ -524,14 +537,15 @@
     cache_file = cache_path / (cache_name + ".csv.zip")
     cache_file.parent.mkdir(parents=True, exist_ok=True)
     if cache_file.exists() and not ignore_cache:
-        print(f"Loading cache {cache_file}")
+        logger.info("Loading cache %s", cache_file)
         if parquet:
             return pd.read_parquet(cache_file)
         else:
             return pd.read_csv(cache_file)
     else:
-        print(
-            f"Cache {cache_file} not found or ignore_cache set to True, regenerating the file"
+        logger.info(
+            "Cache %s not found or ignore_cache set to True, regenerating the file",
+            cache_file,
         )
         with Timeblock("Evaluate function."):
             df = fun()
diff --git a/pyproject.toml b/pyproject.toml
index c4b20bc..7f90892 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project.scripts]
 judgearena = "judgearena.generate_and_evaluate:cli"
-judgearena-elo = "judgearena.estimate_elo_ratings:main"
+judgearena-elo = "judgearena.estimate_elo_ratings:cli"
 
 [project]
 name = "judgearena"
diff --git a/tests/test_logging.py b/tests/test_logging.py
new file mode 100644
index 0000000..0057e6f
--- /dev/null
+++ b/tests/test_logging.py
@@ -0,0 +1,146 @@
+"""Tests for judgearena.log – structured logging infrastructure.""" + +from __future__ import annotations + +import argparse +import logging + +import pytest + +from judgearena.log import ( + _ROOT_LOGGER_NAME, + attach_file_handler, + configure_logging, + get_logger, + make_run_log_path, +) + + +@pytest.fixture(autouse=True) +def _reset_root_logger(): + """Remove handlers added by configure_logging so tests don't leak state.""" + root = logging.getLogger(_ROOT_LOGGER_NAME) + yield + root.handlers.clear() + root.setLevel(logging.WARNING) + + +def _console_handler_level() -> int: + """Return the level of the console (non-file) handler.""" + root = logging.getLogger(_ROOT_LOGGER_NAME) + for h in root.handlers: + if isinstance(h, logging.StreamHandler) and not isinstance( + h, logging.FileHandler + ): + return h.level + raise AssertionError("No console handler found") + + +# ---------- get_logger ---------- + + +@pytest.mark.parametrize( + "input_name, expected", + [ + (None, _ROOT_LOGGER_NAME), + ("judgearena.utils", "judgearena.utils"), + ("mymodule", "judgearena.mymodule"), # bare names get prefixed + ], +) +def test_get_logger_naming(input_name, expected): + assert get_logger(input_name).name == expected + + +# ---------- configure_logging ---------- + + +@pytest.mark.parametrize( + "verbosity, expected_level", + [(-1, logging.WARNING), (0, logging.INFO), (1, logging.DEBUG), (3, logging.DEBUG)], +) +def test_configure_logging_verbosity(verbosity, expected_level): + configure_logging(verbosity) + assert _console_handler_level() == expected_level + + +def test_configure_logging_no_duplicate_handlers(): + """Calling configure_logging twice must not add a second console handler.""" + configure_logging(0) + configure_logging(1) + root = logging.getLogger(_ROOT_LOGGER_NAME) + console_handlers = [ + h + for h in root.handlers + if isinstance(h, logging.StreamHandler) + and not isinstance(h, logging.FileHandler) + ] + assert len(console_handlers) == 1 + + +def test_env_var_overrides_verbosity(monkeypatch): + """JUDGEARENA_LOG_LEVEL env-var should override the CLI verbosity flag.""" + monkeypatch.setenv("JUDGEARENA_LOG_LEVEL", "warning") + configure_logging(1) # would normally be DEBUG + assert _console_handler_level() == logging.WARNING + + +# ---------- file handler ---------- + + +def test_file_handler_captures_debug_even_when_console_is_info(tmp_path): + """File handler should always capture DEBUG, even if console is INFO.""" + log_file = tmp_path / "debug_capture.log" + configure_logging(0, log_file=log_file) # console = INFO + + logger = get_logger("judgearena.test_debug") + logger.debug("only in file") + + for h in logging.getLogger(_ROOT_LOGGER_NAME).handlers: + h.flush() + + text = log_file.read_text() + assert "only in file" in text + + +def test_attach_file_handler_is_idempotent_for_same_path(tmp_path): + log_file = tmp_path / "run.log" + + first = attach_file_handler(log_file) + second = attach_file_handler(log_file) + + root = logging.getLogger(_ROOT_LOGGER_NAME) + file_handlers = [h for h in root.handlers if isinstance(h, logging.FileHandler)] + + assert first is second + assert len(file_handlers) == 1 + + +def test_attach_file_handler_creates_parent_directory(tmp_path): + log_file = tmp_path / "nested" / "logs" / "run.log" + + attach_file_handler(log_file) + + assert log_file.parent.exists() + + +# ---------- resolve_verbosity ---------- + + +def test_resolve_verbosity_quiet_overrides_verbose(): + """When both -q and -v are set, quiet wins (returns -1).""" + from 
+    from judgearena.cli_common import resolve_verbosity
+
+    ns = argparse.Namespace(verbose=2, quiet=True)
+    assert resolve_verbosity(ns) == -1
+
+
+# ---------- make_run_log_path ----------
+
+
+def test_make_run_log_path_format(tmp_path):
+    path = make_run_log_path(tmp_path)
+    assert path.parent == tmp_path
+    assert path.name.startswith("run-")
+    assert path.suffix == ".log"
+    # Timestamp portion: YYYYMMDD_HHMMSS (15 chars)
+    assert len(path.stem) == len("run-") + 15
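Review note: a small usage sketch, not part of the patch, of the run-log helpers, with a hypothetical result folder:

    from pathlib import Path

    from judgearena.log import attach_file_handler, make_run_log_path

    res_folder = Path("results/demo")  # hypothetical folder
    handler = attach_file_handler(make_run_log_path(res_folder))
    # Creates something like results/demo/run-20260414_123456.log at DEBUG level;
    # parent directories are created as needed.

Re-attaching is idempotent only for an identical resolved path (per the loop in attach_file_handler); since make_run_log_path returns a freshly timestamped name on each call, every run gets its own file, matching test_make_run_log_path_format above.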