From 2b09c192317915196cd219fee07a3290be966f32 Mon Sep 17 00:00:00 2001
From: hjenryin <72179072+hjenryin@users.noreply.github.com>
Date: Sat, 3 Jan 2026 19:07:28 -0800
Subject: [PATCH 1/3] Supporting lm-eval>=0.4.8

---
 eval/eval.py         | 25 ++++++++++++-------------
 eval/eval_tracker.py |  5 ++++-
 pyproject.toml       |  7 +++++--
 3 files changed, 21 insertions(+), 16 deletions(-)

diff --git a/eval/eval.py b/eval/eval.py
index b8563ee5..14ae5dd7 100644
--- a/eval/eval.py
+++ b/eval/eval.py
@@ -15,7 +15,6 @@
 import torch.distributed as dist
 import yaml
 from lm_eval import evaluator as pretrain_evaluator
-from lm_eval import utils
 from lm_eval.__main__ import parse_eval_args, setup_parser
 from lm_eval.api.model import LM
 from lm_eval.loggers import EvaluationTracker, WandbLogger
@@ -31,6 +30,7 @@
 from eval.eval_tracker import DCEvaluationTracker
 from eval.task import TaskManager as InstructTaskManager
 
+eval_logger = logging.getLogger(__name__)
 
 _BIT_CAP = 15_000
 
@@ -170,8 +170,8 @@ def evaluate(
         Dictionary mapping task names to their evaluation results.
         Each result dictionary contains metrics specific to that task.
     """
-    eval_logger = utils.eval_logger
-    eval_logger.setLevel(getattr(logging, f"{verbosity}"))
+    eval_logger = logging.getLogger(__name__)
+    eval_logger.setLevel(getattr(logging, verbosity or "INFO"))
 
     # Split tasks between benchmark and pretrain
     benchmark_tasks = [t for t in task_list if t in task_manager.tasks]
@@ -352,16 +352,16 @@ def cli_evaluate(args: Optional[argparse.Namespace] = None) -> None:
         try:
             model_name = evaluation_tracker.get_model_attribute_from_db(args.model_id, "weights_location")
             args.model_args = update_model_args_with_name(args.model_args or "", model_name)
-            utils.eval_logger.info(f"Retrieved model name from database: {model_name}")
+            eval_logger.info(f"Retrieved model name from database: {model_name}")
         except Exception as e:
-            utils.eval_logger.error(f"Failed to retrieve model name from database: {str(e)}")
+            eval_logger.error(f"Failed to retrieve model name from database: {str(e)}")
             sys.exit(1)
         if not args.overwrite_database:
             task_list = [
                 task for task in task_list if not evaluation_tracker.check_if_already_done(task, args.model_id)
             ]
             if len(task_list) == 0:
-                utils.eval_logger.info("All tasks passed in were found in the database.")
+                eval_logger.info("All tasks passed in were found in the database.")
                 exit()
     elif args.model_name:
         model_name = args.model_name
@@ -376,9 +376,9 @@ def cli_evaluate(args: Optional[argparse.Namespace] = None) -> None:
         task_list=task_list,
         system_instruction=args.system_instruction,
     )
-    pretrain_task_manager = PretrainTaskManager(args.verbosity, include_path=args.include_path)
+    pretrain_task_manager = PretrainTaskManager(args.verbosity or "INFO", include_path=args.include_path)
 
-    utils.eval_logger.info(f"Selected Tasks: {[task for task in task_list]}")
+    eval_logger.info(f"Selected Tasks: {[task for task in task_list]}")
 
     # Only check for OpenAI API keys if at least one task requires an annotator model
     # TODO: Should we just skip the evaluation that requires the annotator model if the annotator model is not set or fail completely?
@@ -401,7 +401,7 @@ def cli_evaluate(args: Optional[argparse.Namespace] = None) -> None:
     try:
         lm = initialize_model(args.model, args.model_args, batch_size=args.batch_size)
     except Exception as e:
-        utils.eval_logger.error(f"Failed to initialize model: {str(e)}")
+        eval_logger.error(f"Failed to initialize model: {str(e)}")
         sys.exit(1)
 
     # Log experiment configuration
@@ -415,8 +415,7 @@ def cli_evaluate(args: Optional[argparse.Namespace] = None) -> None:
     )
 
     # Initialize logging and environment
-    eval_logger = utils.eval_logger
-    eval_logger.setLevel(getattr(logging, f"{args.verbosity}"))
+    eval_logger.setLevel(getattr(logging, args.verbosity or "INFO"))
     os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
     # Setup wandb logging if requested
@@ -614,7 +613,7 @@ def handle_evaluation_output(
             if args.log_samples:
                 wandb_logger.log_eval_samples(samples)
         except Exception as e:
-            utils.eval_logger.info(f"Logging to Weights and Biases failed due to {e}")
+            eval_logger.info(f"Logging to Weights and Biases failed due to {e}")
 
     evaluation_tracker.save_results_aggregated(results=results, samples=samples if args.log_samples else None)
     if args.use_database and not args.debug:
@@ -632,7 +631,7 @@ def handle_evaluation_output(
         for task_name, config in results["configs"].items():
             evaluation_tracker.save_results_samples(task_name=task_name, samples=samples[task_name])
 
-    utils.eval_logger.info(
+    eval_logger.info(
         f"Eval arugments: {args.model} ({args.model_args}), gen_kwargs: ({args.gen_kwargs}), "
         f"limit: {args.limit}, num_fewshot: {args.num_fewshot}, annotator_model: {args.annotator_model}, "
         f"batch_size: {args.batch_size}{f' ({batch_sizes})' if batch_sizes else ''}"
diff --git a/eval/eval_tracker.py b/eval/eval_tracker.py
index a1a879ca..d11126ea 100644
--- a/eval/eval_tracker.py
+++ b/eval/eval_tracker.py
@@ -1,5 +1,6 @@
 import getpass
 import json
+import logging
 import re
 import subprocess
 import time
@@ -13,7 +14,9 @@
 import torch
 from huggingface_hub import model_info
 from lm_eval.loggers.evaluation_tracker import GeneralConfigTracker
-from lm_eval.utils import eval_logger, handle_non_serializable, hash_string, simple_parse_args_string
+from lm_eval.utils import handle_non_serializable, hash_string, simple_parse_args_string
+
+eval_logger = logging.getLogger(__name__)
 
 from database.models import Dataset, EvalResult, EvalSetting, Model
 from database.utils import create_db_engine, create_tables, get_model_from_db, get_or_add_model_by_name, sessionmaker
diff --git a/pyproject.toml b/pyproject.toml
index e39e4043..ca80890c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -74,7 +74,7 @@ dependencies = [
     "reka-api",
     "together",
     "dashscope",
-    "fschat @ file:eval/chat_benchmarks/MTBench", # Use relative path that pip will resolve during installation
+    "fschat", # Resolved from a local path via [tool.uv.sources]; works with uv pip but may not with plain pip
 
     # Cloud & Storage
     "gcsfs",
@@ -154,7 +154,7 @@ dependencies = [
     "swebench>=3.0.4",
 
     # LM Eval
-    "lm-eval[vllm] @ git+https://github.com/EtashGuha/lm-evaluation-harness@etashg/tokenize_fix",
+    "lm-eval[vllm]"
 ]
 
 [project.urls]
@@ -193,3 +193,6 @@ packages = ["evalchemy"]
 
 [tool.setuptools.package-dir]
 evalchemy = "evalchemy"
+
+[tool.uv.sources]
+fschat = { path = "eval/chat_benchmarks/MTBench", editable = true }
\ No newline at end of file

From 76b134a6e3512f553e5a15c3908269a22b52e069 Mon Sep 17 00:00:00 2001
From: hjenryin <72179072+hjenryin@users.noreply.github.com>
Date: Sat, 3 Jan 2026 19:16:23 -0800
Subject: [PATCH 2/3] Improved doc with local
 multi-GPU support; use uv to install.

---
 README.md | 28 ++++++++++++++++++++++++++--
 1 file changed, 26 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 987a2697..a9709584 100644
--- a/README.md
+++ b/README.md
@@ -78,8 +78,9 @@ git clone git@github.com:mlfoundations/evalchemy.git
 cd evalchemy
 
 # Install dependencies
-pip install -e .
-pip install -e eval/chat_benchmarks/alpaca_eval
+pip install uv
+uv pip install -e .
+uv pip install -e eval/chat_benchmarks/alpaca_eval
 
 # Note: On some HPC systems you may need to modify pyproject.toml
 # to use absolute paths for the fschat dependency:
@@ -211,6 +212,28 @@ NOTE: This is configured for specific HPC clusters, but can easily be adapted. F
 
 ### Multi-GPU Evaluation
 
+A. Data-Parallel Evaluation with vLLM
+```bash
+python -m eval.eval \
+    --model vllm \
+    --tasks AIME24 \
+    --model_args "pretrained=Qwen/Qwen2.5-7B-Instruct,data_parallel_size=8" \
+    --batch_size auto \
+    --output_path logs \
+    --apply_chat_template True
+```
+If the model is too large for a single GPU, you can combine tensor parallelism with data parallelism:
+```bash
+python -m eval.eval \
+    --model vllm \
+    --tasks AIME24 \
+    --model_args "pretrained=Qwen/Qwen2.5-7B-Instruct,tensor_parallel_size=2,data_parallel_size=4" \
+    --batch_size auto \
+    --output_path logs \
+    --apply_chat_template True
+```
+
+B. Multi-GPU Evaluation with Accelerate
 NOTE: this is slower than doing fully data parallel evaluation (see previous section)
 
 ```bash
@@ -223,6 +246,7 @@ accelerate launch --num-processes --num-machines \
     --output_path logs
 ```
 
+
 ### Large Model Evaluation
 
 For models that don't fit on a single GPU, use model parallelism:

From 9a83ff1fdb25b0d1b68160b3b1cd5a0034ee36b4 Mon Sep 17 00:00:00 2001
From: hjenryin <72179072+hjenryin@users.noreply.github.com>
Date: Mon, 5 Jan 2026 12:56:32 -0800
Subject: [PATCH 3/3] Use 0.4.9.1 or 0.4.9.2

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index ca80890c..6e66736e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -154,7 +154,7 @@ dependencies = [
     "swebench>=3.0.4",
 
     # LM Eval
-    "lm-eval[vllm]"
+    "lm-eval[vllm]>=0.4.9.1,<0.4.10"
 ]
 
 [project.urls]
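
A minimal sketch of exercising this series locally, assembled from the install and data-parallel commands the README patch adds above; the model, task, and `data_parallel_size=8` are assumptions about the local setup (8 GPUs), not part of the patches.

```bash
# Install with uv, as documented in PATCH 2/3 (assumes a working CUDA environment).
pip install uv
uv pip install -e .
uv pip install -e eval/chat_benchmarks/alpaca_eval

# Smoke-test the data-parallel vLLM path from the README example (assumes 8 GPUs).
python -m eval.eval \
    --model vllm \
    --tasks AIME24 \
    --model_args "pretrained=Qwen/Qwen2.5-7B-Instruct,data_parallel_size=8" \
    --batch_size auto \
    --output_path logs \
    --apply_chat_template True
```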