28 changes: 26 additions & 2 deletions README.md
@@ -78,8 +78,9 @@ git clone git@github.com:mlfoundations/evalchemy.git
cd evalchemy

# Install dependencies
pip install -e .
pip install -e eval/chat_benchmarks/alpaca_eval
pip install uv
uv pip install -e .
uv pip install -e eval/chat_benchmarks/alpaca_eval

# Note: On some HPC systems you may need to modify pyproject.toml
# to use absolute paths for the fschat dependency:
@@ -211,6 +212,28 @@ NOTE: This is configured for specific HPC clusters, but can easily be adapted. F

### Multi-GPU Evaluation

A. Data-Parallel Evaluation with vLLM
```bash
python -m eval.eval \
--model vllm \
--tasks AIME24 \
--model_args "pretrained=Qwen/Qwen2.5-7B-Instruct,data_parallel_size=8" \
--batch_size auto \
--output_path logs \
--apply_chat_template True
```
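In this example, `data_parallel_size=8` launches eight single-GPU vLLM workers and splits the evaluation requests across them.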
If the model is too large to fit on a single GPU, you can combine tensor parallelism with data parallelism:
```bash
python -m eval.eval \
--model vllm \
--tasks AIME24 \
--model_args "pretrained=Qwen/Qwen2.5-7B-Instruct,tensor_parallel_size=2,data_parallel_size=4" \
--batch_size auto \
--output_path logs \
--apply_chat_template True
```
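The two settings multiply: `tensor_parallel_size=2` with `data_parallel_size=4` occupies 8 GPUs in total, so their product should match the number of GPUs available.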

B. Multi-GPU Evaluation with Accelerate
NOTE: this is slower than fully data-parallel evaluation (see option A above)

```bash
@@ -223,6 +246,7 @@ accelerate launch --num-processes <num-gpus> --num-machines <num-nodes> \
--output_path logs
```


### Large Model Evaluation

For models that don't fit on a single GPU, use model parallelism:
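A minimal sketch, assuming the Hugging Face backend accepts lm-eval's `parallelize=True` model argument to shard weights across the available GPUs (model name reused from the examples above):

```bash
python -m eval.eval \
--model hf \
--tasks AIME24 \
--model_args "pretrained=Qwen/Qwen2.5-7B-Instruct,parallelize=True" \
--batch_size auto \
--output_path logs \
--apply_chat_template True
```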
25 changes: 12 additions & 13 deletions eval/eval.py
@@ -15,7 +15,6 @@
import torch.distributed as dist
import yaml
from lm_eval import evaluator as pretrain_evaluator
from lm_eval import utils
from lm_eval.__main__ import parse_eval_args, setup_parser
from lm_eval.api.model import LM
from lm_eval.loggers import EvaluationTracker, WandbLogger
@@ -31,6 +30,7 @@
from eval.eval_tracker import DCEvaluationTracker
from eval.task import TaskManager as InstructTaskManager

eval_logger = logging.getLogger(__name__)

_BIT_CAP = 15_000

@@ -170,8 +170,8 @@ def evaluate(
Dictionary mapping task names to their evaluation results.
Each result dictionary contains metrics specific to that task.
"""
eval_logger = utils.eval_logger
eval_logger.setLevel(getattr(logging, f"{verbosity}"))
eval_logger = logging.getLogger(__name__)
eval_logger.setLevel(getattr(logging, verbosity or "INFO"))

# Split tasks between benchmark and pretrain
benchmark_tasks = [t for t in task_list if t in task_manager.tasks]
@@ -352,16 +352,16 @@ def cli_evaluate(args: Optional[argparse.Namespace] = None) -> None:
try:
model_name = evaluation_tracker.get_model_attribute_from_db(args.model_id, "weights_location")
args.model_args = update_model_args_with_name(args.model_args or "", model_name)
utils.eval_logger.info(f"Retrieved model name from database: {model_name}")
eval_logger.info(f"Retrieved model name from database: {model_name}")
except Exception as e:
utils.eval_logger.error(f"Failed to retrieve model name from database: {str(e)}")
eval_logger.error(f"Failed to retrieve model name from database: {str(e)}")
sys.exit(1)
if not args.overwrite_database:
task_list = [
task for task in task_list if not evaluation_tracker.check_if_already_done(task, args.model_id)
]
if len(task_list) == 0:
utils.eval_logger.info("All tasks passed in were found in the database.")
eval_logger.info("All tasks passed in were found in the database.")
exit()
elif args.model_name:
model_name = args.model_name
@@ -376,9 +376,9 @@
task_list=task_list,
system_instruction=args.system_instruction,
)
pretrain_task_manager = PretrainTaskManager(args.verbosity, include_path=args.include_path)
pretrain_task_manager = PretrainTaskManager(args.verbosity or "INFO", include_path=args.include_path)

utils.eval_logger.info(f"Selected Tasks: {[task for task in task_list]}")
eval_logger.info(f"Selected Tasks: {[task for task in task_list]}")

# Only check for OpenAI API keys if at least one task requires an annotator model
# TODO: Should we just skip the evaluation that requires the annotator model if the annotator model is not set or fail completely?
@@ -401,7 +401,7 @@ def cli_evaluate(args: Optional[argparse.Namespace] = None) -> None:
try:
lm = initialize_model(args.model, args.model_args, batch_size=args.batch_size)
except Exception as e:
utils.eval_logger.error(f"Failed to initialize model: {str(e)}")
eval_logger.error(f"Failed to initialize model: {str(e)}")
sys.exit(1)

# Log experiment configuration
@@ -415,8 +415,7 @@ def cli_evaluate(args: Optional[argparse.Namespace] = None) -> None:
)

# Initialize logging and environment
eval_logger = utils.eval_logger
eval_logger.setLevel(getattr(logging, f"{args.verbosity}"))
eval_logger.setLevel(getattr(logging, args.verbosity or "INFO"))
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Setup wandb logging if requested
@@ -614,7 +613,7 @@ def handle_evaluation_output(
if args.log_samples:
wandb_logger.log_eval_samples(samples)
except Exception as e:
utils.eval_logger.info(f"Logging to Weights and Biases failed due to {e}")
eval_logger.info(f"Logging to Weights and Biases failed due to {e}")

evaluation_tracker.save_results_aggregated(results=results, samples=samples if args.log_samples else None)
if args.use_database and not args.debug:
@@ -632,7 +631,7 @@ def handle_evaluation_output(
for task_name, config in results["configs"].items():
evaluation_tracker.save_results_samples(task_name=task_name, samples=samples[task_name])

utils.eval_logger.info(
eval_logger.info(
f"Eval arugments: {args.model} ({args.model_args}), gen_kwargs: ({args.gen_kwargs}), "
f"limit: {args.limit}, num_fewshot: {args.num_fewshot}, annotator_model: {args.annotator_model}, "
f"batch_size: {args.batch_size}{f' ({batch_sizes})' if batch_sizes else ''}"
5 changes: 4 additions & 1 deletion eval/eval_tracker.py
@@ -1,5 +1,6 @@
import getpass
import json
import logging
import re
import subprocess
import time
@@ -13,7 +14,9 @@
import torch
from huggingface_hub import model_info
from lm_eval.loggers.evaluation_tracker import GeneralConfigTracker
from lm_eval.utils import eval_logger, handle_non_serializable, hash_string, simple_parse_args_string
from lm_eval.utils import handle_non_serializable, hash_string, simple_parse_args_string

eval_logger = logging.getLogger(__name__)

from database.models import Dataset, EvalResult, EvalSetting, Model
from database.utils import create_db_engine, create_tables, get_model_from_db, get_or_add_model_by_name, sessionmaker
7 changes: 5 additions & 2 deletions pyproject.toml
@@ -74,7 +74,7 @@ dependencies = [
"reka-api",
"together",
"dashscope",
"fschat @ file:eval/chat_benchmarks/MTBench", # Use relative path that pip will resolve during installation
"fschat", # Use relative path that pip will resolve during installation, may not work with pip but work with uv pip

# Cloud & Storage
"gcsfs",
@@ -154,7 +154,7 @@ dependencies = [
"swebench>=3.0.4",

# LM Eval
"lm-eval[vllm] @ git+https://github.com/EtashGuha/lm-evaluation-harness@etashg/tokenize_fix",
"lm-eval[vllm]>=0.4.9.1,<0.4.10"
]

[project.urls]
@@ -193,3 +193,6 @@ packages = ["evalchemy"]

[tool.setuptools.package-dir]
evalchemy = "evalchemy"

[tool.uv.sources]
fschat = { path = "eval/chat_benchmarks/MTBench", editable = true }
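With uv, the `[tool.uv.sources]` table overrides where the bare `fschat` dependency is resolved from: here, the local MTBench checkout in editable mode. Plain pip ignores this table, which is consistent with the README's switch to `uv pip install`.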