From 6783aa1865560d5a643bdd1befb0bd47ef70b906 Mon Sep 17 00:00:00 2001
From: jannalulu
Date: Mon, 12 May 2025 23:19:29 +0000
Subject: [PATCH 1/4] update lm-eval-harness version

---
 eval/eval.py         | 24 +++++++++++++-----------
 eval/eval_tracker.py |  3 ++-
 pyproject.toml       |  2 +-
 3 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/eval/eval.py b/eval/eval.py
index ef561e04..f94a9df3 100644
--- a/eval/eval.py
+++ b/eval/eval.py
@@ -128,7 +128,7 @@ def evaluate(
         Dictionary mapping task names to their evaluation results.
         Each result dictionary contains metrics specific to that task.
     """
-    eval_logger = utils.eval_logger
+    eval_logger = logging.getLogger(__name__)
     eval_logger.setLevel(getattr(logging, f"{verbosity}"))
 
     # Split tasks between benchmark and pretrain
@@ -294,6 +294,10 @@ def cli_evaluate(args: Optional[argparse.Namespace] = None) -> None:
             int(args.batch_size) if args.batch_size != "auto" else args.batch_size
             for _ in range(len(args.tasks.split(",")))
         ]
+
+    # Initialize logging
+    eval_logger = logging.getLogger(__name__)
+    eval_logger.setLevel(getattr(logging, f"{args.verbosity}"))
 
     # Initialize evaluation tracker
     if args.output_path:
@@ -309,16 +313,16 @@ def cli_evaluate(args: Optional[argparse.Namespace] = None) -> None:
         try:
             model_name = evaluation_tracker.get_model_attribute_from_db(args.model_id, "weights_location")
             args.model_args = update_model_args_with_name(args.model_args or "", model_name)
-            utils.eval_logger.info(f"Retrieved model name from database: {model_name}")
+            eval_logger.info(f"Retrieved model name from database: {model_name}")
         except Exception as e:
-            utils.eval_logger.error(f"Failed to retrieve model name from database: {str(e)}")
+            eval_logger.error(f"Failed to retrieve model name from database: {str(e)}")
             sys.exit(1)
         if not args.overwrite_database:
             task_list = [
                 task for task in task_list if not evaluation_tracker.check_if_already_done(task, args.model_id)
             ]
             if len(task_list) == 0:
-                utils.eval_logger.info("All tasks passed in were found in the database.")
+                eval_logger.info("All tasks passed in were found in the database.")
                 exit()
     elif args.model_name:
         model_name = args.model_name
@@ -334,7 +338,7 @@ def cli_evaluate(args: Optional[argparse.Namespace] = None) -> None:
     )
     pretrain_task_manager = PretrainTaskManager(args.verbosity, include_path=args.include_path)
 
-    utils.eval_logger.info(f"Selected Tasks: {[task for task in task_list]}")
+    eval_logger.info(f"Selected Tasks: {[task for task in task_list]}")
 
     # Only check for OpenAI API keys if at least one task requires an annotator model
     # TODO: Should we just skip the evaluation that requires the annotator model if the annotator model is not set or fail completely?
@@ -357,7 +361,7 @@ def cli_evaluate(args: Optional[argparse.Namespace] = None) -> None:
     try:
         lm = initialize_model(args.model, args.model_args, batch_size=args.batch_size)
     except Exception as e:
-        utils.eval_logger.error(f"Failed to initialize model: {str(e)}")
+        eval_logger.error(f"Failed to initialize model: {str(e)}")
         sys.exit(1)
 
     # Log experiment configuration
@@ -370,9 +374,7 @@ def cli_evaluate(args: Optional[argparse.Namespace] = None) -> None:
         fewshot_as_multiturn=args.fewshot_as_multiturn,
     )
 
-    # Initialize logging and environment
-    eval_logger = utils.eval_logger
-    eval_logger.setLevel(getattr(logging, f"{args.verbosity}"))
+    # Initialize environment
     os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
     # Setup wandb logging if requested
@@ -562,7 +564,7 @@ def handle_evaluation_output(
             if args.log_samples:
                 wandb_logger.log_eval_samples(samples)
         except Exception as e:
-            utils.eval_logger.info(f"Logging to Weights and Biases failed due to {e}")
+            eval_logger.info(f"Logging to Weights and Biases failed due to {e}")
 
     evaluation_tracker.save_results_aggregated(results=results, samples=samples if args.log_samples else None)
     if args.use_database and not args.debug:
@@ -580,7 +582,7 @@ def handle_evaluation_output(
             for task_name, config in results["configs"].items():
                 evaluation_tracker.save_results_samples(task_name=task_name, samples=samples[task_name])
 
-    utils.eval_logger.info(
+    eval_logger.info(
         f"Eval arugments: {args.model} ({args.model_args}), gen_kwargs: ({args.gen_kwargs}), "
         f"limit: {args.limit}, num_fewshot: {args.num_fewshot}, annotator_model: {args.annotator_model}, "
         f"batch_size: {args.batch_size}{f' ({batch_sizes})' if batch_sizes else ''}"
diff --git a/eval/eval_tracker.py b/eval/eval_tracker.py
index a1a879ca..c6e5edba 100644
--- a/eval/eval_tracker.py
+++ b/eval/eval_tracker.py
@@ -13,11 +13,12 @@
 import torch
 from huggingface_hub import model_info
 from lm_eval.loggers.evaluation_tracker import GeneralConfigTracker
-from lm_eval.utils import eval_logger, handle_non_serializable, hash_string, simple_parse_args_string
+from lm_eval.utils import handle_non_serializable, hash_string, simple_parse_args_string
 
 from database.models import Dataset, EvalResult, EvalSetting, Model
 from database.utils import create_db_engine, create_tables, get_model_from_db, get_or_add_model_by_name, sessionmaker
 
+eval_logger = logging.getLogger(__name__)
 
 def flatten_dict(d: Dict[str, Any], parent_key: str = "", sep: str = "/") -> Dict[str, Any]:
     """
diff --git a/pyproject.toml b/pyproject.toml
index e39e4043..e499cba8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -154,7 +154,7 @@ dependencies = [
     "swebench>=3.0.4",
 
     # LM Eval
-    "lm-eval[vllm] @ git+https://github.com/EtashGuha/lm-evaluation-harness@etashg/tokenize_fix",
+    "lm-eval[vllm] @ git+https://github.com/EleutherAI/lm-evaluation-harness@v0.4.8",
 ]
 
 [project.urls]

From e5de729864561ff21b13e6ffc47f543ee5c228a6 Mon Sep 17 00:00:00 2001
From: jannalulu
Date: Tue, 13 May 2025 00:34:46 +0000
Subject: [PATCH 2/4] move eval_logger out of functions

---
 eval/eval.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/eval/eval.py b/eval/eval.py
index f94a9df3..29ae17ce 100644
--- a/eval/eval.py
+++ b/eval/eval.py
@@ -29,6 +29,7 @@
 from eval.eval_tracker import DCEvaluationTracker
 from eval.task import TaskManager as InstructTaskManager
 
+eval_logger = logging.getLogger(__name__)
 
 def setup_custom_parser():
     """
@@ -128,7 +129,6 @@ def evaluate(
         Dictionary mapping task names to their evaluation results.
         Each result dictionary contains metrics specific to that task.
     """
-    eval_logger = logging.getLogger(__name__)
     eval_logger.setLevel(getattr(logging, f"{verbosity}"))
 
     # Split tasks between benchmark and pretrain
@@ -296,7 +296,6 @@ def cli_evaluate(args: Optional[argparse.Namespace] = None) -> None:
         ]
 
     # Initialize logging
-    eval_logger = logging.getLogger(__name__)
     eval_logger.setLevel(getattr(logging, f"{args.verbosity}"))
 
     # Initialize evaluation tracker

From dab7a278b359572fc6a38d57897d1590350867cd Mon Sep 17 00:00:00 2001
From: Janna <109004049+jannalulu@users.noreply.github.com>
Date: Tue, 13 May 2025 19:00:40 -0400
Subject: [PATCH 3/4] fix logging

---
 eval/eval.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/eval/eval.py b/eval/eval.py
index 29ae17ce..f3e2eda7 100644
--- a/eval/eval.py
+++ b/eval/eval.py
@@ -20,7 +20,7 @@
 from lm_eval.loggers import EvaluationTracker, WandbLogger
 from lm_eval.loggers.utils import add_env_info, add_tokenizer_info, get_git_commit_hash
 from lm_eval.tasks import TaskManager as PretrainTaskManager
-from lm_eval.utils import handle_non_serializable, sanitize_model_name, simple_parse_args_string
+from lm_eval.utils import setup_logging, handle_non_serializable, sanitize_model_name, simple_parse_args_string
 
 from eval.chat_benchmarks.curator_lm import CuratorAPIModel  # register curator model
 from eval.chat_benchmarks.precomputed_hf_lm import PrecomputedHFLM  # register precomputed_hf model
@@ -129,7 +129,8 @@ def evaluate(
         Dictionary mapping task names to their evaluation results.
         Each result dictionary contains metrics specific to that task.
     """
-    eval_logger.setLevel(getattr(logging, f"{verbosity}"))
+    if verbosity is not None:
+        setup_logging(verbosity=verbosity)
 
     # Split tasks between benchmark and pretrain
     benchmark_tasks = [t for t in task_list if t in task_manager.tasks]
@@ -294,9 +295,6 @@ def cli_evaluate(args: Optional[argparse.Namespace] = None) -> None:
             int(args.batch_size) if args.batch_size != "auto" else args.batch_size
             for _ in range(len(args.tasks.split(",")))
         ]
-
-    # Initialize logging
-    eval_logger.setLevel(getattr(logging, f"{args.verbosity}"))
 
     # Initialize evaluation tracker
     if args.output_path:

From fed9468135a21988f3a78639f2a3dd6f62d07eae Mon Sep 17 00:00:00 2001
From: Janna <109004049+jannalulu@users.noreply.github.com>
Date: Tue, 13 May 2025 19:01:22 -0400
Subject: [PATCH 4/4] Update eval_tracker.py

---
 eval/eval_tracker.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/eval/eval_tracker.py b/eval/eval_tracker.py
index c6e5edba..1e883802 100644
--- a/eval/eval_tracker.py
+++ b/eval/eval_tracker.py
@@ -13,7 +13,7 @@
 import torch
 from huggingface_hub import model_info
 from lm_eval.loggers.evaluation_tracker import GeneralConfigTracker
-from lm_eval.utils import handle_non_serializable, hash_string, simple_parse_args_string
+from lm_eval.utils import setup_logging, handle_non_serializable, hash_string, simple_parse_args_string
 
 from database.models import Dataset, EvalResult, EvalSetting, Model
 from database.utils import create_db_engine, create_tables, get_model_from_db, get_or_add_model_by_name, sessionmaker