From 21481c04b84a93608ee7fd67d13b436a98481cee Mon Sep 17 00:00:00 2001
From: Alessandro Sordoni
Date: Sun, 1 Sep 2024 07:47:12 -0700
Subject: [PATCH 01/12] mixeval evaluator

---
 mttl/evaluators/mixeval.py | 182 +++++++++++++++++++++++++++++++++++++
 mttl/models/base_model.py  |  17 ++++
 2 files changed, 199 insertions(+)
 create mode 100644 mttl/evaluators/mixeval.py

diff --git a/mttl/evaluators/mixeval.py b/mttl/evaluators/mixeval.py
new file mode 100644
index 000000000..33eb0ef81
--- /dev/null
+++ b/mttl/evaluators/mixeval.py
@@ -0,0 +1,182 @@
+import json
+import os
+
+try:
+    from mix_eval.api.registry import register_model
+    from mix_eval.evaluate import compute_metrics_p, eval, parse_args
+    from mix_eval.models.base import ChatModel
+
+    mixeval_available = True
+
+except ImportError:
+    mixeval_available = False
+    register_model = lambda x: x
+
+
+from copy import deepcopy
+from dataclasses import dataclass
+
+import torch
+from transformers import AutoTokenizer
+
+from mttl.datamodule.utils import get_tokenizer_with_args
+from mttl.evaluators.base import GenerativeEvaluator
+from mttl.models.expert_model import MultiExpertModel, MultiExpertModelConfig
+from mttl.models.library.expert_library import ExpertLibrary
+
+
+@dataclass
+class MixEvalConfig:
+    batch_size: int = 16
+    model_name: str = "mix_eval_expert_adapter"
+    benchmark: str = "mixeval_hard"
+    data_path: str = None
+    version: str = "2024-08-11"
+    split: str = None
+    output_dir: str = None
+    verbose: bool = False
+    model: MultiExpertModel = None
+
+
+@register_model("mix_eval_expert_adapter")
+class MultiExpertAdapter(ChatModel):
+    def chunk_generate(
+        self,
+        inputs,
+        model,
+        tok,
+        max_tokens: int,
+        sliding_window: int = 128 * 1024,
+        chunk_size: int = 2500,
+        verbose: bool = False,
+        chunked: bool = False,
+        **kwargs,
+    ):
+        if chunked:
+            raise ValueError("Chunked is not supported.")
+
+        with torch.no_grad():
+            input_ids = inputs.input_ids  # (b, n)
+            attention_mask = inputs.attention_mask  # (b, n)
+
+            outputs = model.generate(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                max_new_tokens=max_tokens,
+                **kwargs,
+            )
+            generated_ids = [
+                output_ids[len(in_ids) :] for in_ids, output_ids in zip(input_ids, outputs)
+            ]
+            responses = tok.batch_decode(generated_ids, skip_special_tokens=True)
+        return responses
+
+    def __init__(self, args):
+        self.model = args.model
+        args.model = None
+
+        super().__init__(args)
+
+        self.tokenizer = get_tokenizer_with_args(
+            model_name=self.model.base_model_name_or_path,
+            model_family="gpt",
+            padding_side="left",
+            truncation_side="left",
+            for_generation=True,
+        )
+
+        self.SYSTEM_MESSAGE = {
+            "role": "system",
+            "content": "You are a helpful assistant.",
+        }  # set to None if no system message
+        self.USER_MESSAGE_TEMPLATE = lambda x: {"role": "user", "content": x}
+        self.ASSISTANT_MESSAGE_TEMPLATE = lambda x: {"role": "assistant", "content": x}
+
+        self.model_max_len = self.model.max_position_embeddings
+        self.max_input_length_closeend = (
+            min(self.model_max_len, self.max_input_length)
+            - self.closeended_max_new_tokens
+        )
+        self.max_input_length_openend = (
+            min(self.model_max_len, self.max_input_length)
+            - self.openended_max_new_tokens
+        )
+
+
+class MixEvalEvaluator(GenerativeEvaluator):
+    def __init__(
+        self,
+    ):
+        super().__init__(config=MixEvalConfig())
+
+        if not mixeval_available:
+            raise ValueError(
+                "MixEval is not installed. Please install it using `pip install mix-eval`."
+            )
+
+        self.download_data()
+
+    def download_data(self):
+        import shutil
+        import subprocess
+
+        import mix_eval
+
+        repo_url = "https://github.com/Psycoy/MixEval.git"
+        data_folder = "mix_eval/data"
+        temp_dir = "/tmp/mixeval_repo"
+        target_dir = os.path.join(os.path.dirname(mix_eval.__file__), "data")
+
+        self.config.data_path = target_dir
+
+        if os.path.exists(target_dir):
+            return
+
+        # Clone the repository
+        subprocess.run(["git", "clone", repo_url, temp_dir], check=True)
+
+        # Copy the data folder to the target directory
+        shutil.copytree(
+            os.path.join(temp_dir, data_folder), target_dir, dirs_exist_ok=True
+        )
+
+        # Clean up the temporary directory
+        shutil.rmtree(temp_dir)
+
+    def evaluate(
+        self,
+        model,
+        split=None,
+        shuffle=False,
+        subsample=-1,
+        output_path=None,
+        verbose=False,
+        **kwargs,
+    ):
+        # inject model into config
+        self.config.model = model
+        self.config.verbose = verbose
+
+        if split is not None:
+            self.config.split = split
+
+        if output_path is not None:
+            self.config.output_dir = output_path
+        else:
+            raise ValueError("Output path is required for evaluation.")
+
+        eval(self.config)
+        compute_metrics_p(self.config)
+
+        with open(os.path.join(self.config.output_dir, "score.json"), "r") as f:
+            score = json.load(f)
+        return score[self.config.model_name]["overall"]
+
+
+if __name__ == "__main__":
+    evaluator = MixEvalEvaluator()
+    model = MultiExpertModel(
+        MultiExpertModelConfig(base_model="microsoft/Phi-3-mini-4k-instruct"),
+        device_map="cuda:0",
+    )
+    evaluator.evaluate(model, output_path="/tmp/mixeval/")
diff --git a/mttl/models/base_model.py b/mttl/models/base_model.py
index 999335bbd..7b936e8fe 100644
--- a/mttl/models/base_model.py
+++ b/mttl/models/base_model.py
@@ -5,6 +5,7 @@
 import torch
 from huggingface_hub import hf_hub_download
+from transformers import PreTrainedModel
 from transformers.modeling_outputs import CausalLMOutput

 from mttl.logging import logger
@@ -64,6 +65,10 @@ def __init__(
             if model_object is None
             else model_object
         )
+        if not isinstance(self.model, PreTrainedModel):
+            raise ValueError(
+                f"Model is not an instance of PreTrainedModel. Got {type(self.model)}."
+            )

         if model_object:
             logger.warning(
@@ -73,6 +78,18 @@ def __init__(
         self.config = config
         self.loading_kwargs = loading_kwargs

+    @property
+    def base_model_name_or_path(self) -> str:
+        return self.config.base_model
+
+    @property
+    def max_position_embeddings(self) -> int:
+        return self.base_model.config.max_position_embeddings
+
+    @property
+    def base_model(self) -> PreTrainedModel:
+        return self.model
+
     def _delete_non_trainable_params(
         self, state_dict: Dict[str, torch.Tensor]
     ) -> Dict[str, torch.Tensor]:
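
One remark on the import guard at the top of mixeval.py: `register_model` is used below as a parameterized decorator, so the single-level fallback `lambda x: x` would hand the class a string instead of a decorator if mix-eval were missing — and the class statement would already have failed earlier, since no fallback is bound for `ChatModel`. A sketch of a fallback that keeps the module importable without mix-eval (illustrative only, not part of the patch):

    try:
        from mix_eval.api.registry import register_model
        from mix_eval.models.base import ChatModel

        mixeval_available = True
    except ImportError:
        mixeval_available = False
        ChatModel = object  # placeholder base so the adapter class still defines

        def register_model(name):  # no-op decorator factory
            def decorator(cls):
                return cls

            return decorator

In practice the guard only needs to keep the import of this module from crashing; MixEvalEvaluator still raises a clear error at construction time when mix-eval is absent.
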
From 54faf2ced8d06dd8c69aa271d7483a4ccc20900c Mon Sep 17 00:00:00 2001
From: Alessandro Sordoni
Date: Sun, 1 Sep 2024 07:55:48 -0700
Subject: [PATCH 02/12] inject model

---
 mttl/evaluators/mixeval.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/mttl/evaluators/mixeval.py b/mttl/evaluators/mixeval.py
index 33eb0ef81..3340fdbc7 100644
--- a/mttl/evaluators/mixeval.py
+++ b/mttl/evaluators/mixeval.py
@@ -35,7 +35,6 @@ class MixEvalConfig:
     split: str = None
     output_dir: str = None
     verbose: bool = False
-    model: MultiExpertModel = None


 @register_model("mix_eval_expert_adapter")
@@ -73,7 +72,6 @@ def chunk_generate(

     def __init__(self, args):
         self.model = args.model
-        args.model = None

         super().__init__(args)

From e7b28824b2602a70ef3c6ab18dd785256b7f8f43 Mon Sep 17 00:00:00 2001
From: Alessandro Sordoni
Date: Sun, 1 Sep 2024 07:59:33 -0700
Subject: [PATCH 03/12] inject model by threading.local instead of using the args

---
 mttl/evaluators/mixeval.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/mttl/evaluators/mixeval.py b/mttl/evaluators/mixeval.py
index 3340fdbc7..fda021abc 100644
--- a/mttl/evaluators/mixeval.py
+++ b/mttl/evaluators/mixeval.py
@@ -1,5 +1,6 @@
 import json
 import os
+import threading

 try:
     from mix_eval.api.registry import register_model
@@ -39,6 +40,9 @@ class MixEvalConfig:

 @register_model("mix_eval_expert_adapter")
 class MultiExpertAdapter(ChatModel):
+    # model context is used to inject model into the class
+    model_context = threading.local()
+
     def chunk_generate(
         self,
         inputs,
@@ -71,10 +75,9 @@ def chunk_generate(
         return responses

     def __init__(self, args):
-        self.model = args.model
-
         super().__init__(args)

+        self.model = self.model_context.model
         self.tokenizer = get_tokenizer_with_args(
             model_name=self.model.base_model_name_or_path,
             model_family="gpt",
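
The threading.local injection above replaces the earlier approach of smuggling the model through mix-eval's args namespace. The pattern in isolation (illustrative names, not the patch's code):

    import threading

    class Adapter:
        model_context = threading.local()  # one shared object; attributes are per-thread

        def __init__(self):
            # read the handle the caller injected from this same thread
            self.model = self.model_context.model

    my_model = object()  # stand-in for the real MultiExpertModel
    Adapter.model_context.model = my_model
    assert Adapter().model is my_model

The caveat is thread affinity: attributes set on a `threading.local` are only visible to the thread that set them, so if mix-eval ever instantiated the registered model class from a worker thread, reading `self.model_context.model` would raise AttributeError.
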
From 530e22ab8aa2ebcac8adee6a100de5100363d311 Mon Sep 17 00:00:00 2001
From: Alessandro Sordoni
Date: Sun, 1 Sep 2024 08:00:30 -0700
Subject: [PATCH 04/12] provide mixevalconfig

---
 mttl/evaluators/mixeval.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/mttl/evaluators/mixeval.py b/mttl/evaluators/mixeval.py
index fda021abc..c568ca2a4 100644
--- a/mttl/evaluators/mixeval.py
+++ b/mttl/evaluators/mixeval.py
@@ -105,10 +105,8 @@ def __init__(self, args):


 class MixEvalEvaluator(GenerativeEvaluator):
-    def __init__(
-        self,
-    ):
-        super().__init__(config=MixEvalConfig())
+    def __init__(self, config: MixEvalConfig = None):
+        super().__init__(config=config or MixEvalConfig())

     if not mixeval_available:
         raise ValueError(
From dce1483a32544714f2d4d0f691b85dae2f5900d3 Mon Sep 17 00:00:00 2001
From: Alessandro Sordoni
Date: Sun, 1 Sep 2024 08:20:35 -0700
Subject: [PATCH 05/12] mixeval config

---
 mttl/evaluators/mixeval.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/mttl/evaluators/mixeval.py b/mttl/evaluators/mixeval.py
index c568ca2a4..a10232549 100644
--- a/mttl/evaluators/mixeval.py
+++ b/mttl/evaluators/mixeval.py
@@ -32,6 +32,12 @@ class MixEvalConfig:
     model_name: str = "mix_eval_expert_adapter"
     benchmark: str = "mixeval_hard"
     data_path: str = None
+    free_form_parser: str = "model"
+    multi_choice_parser: str = "model"
+    multichoice_judge: str = "gpt-3.5-turbo-0125"
+    freeform_judge: str = "gpt-3.5-turbo-0125"
+    extract_base_model_response: bool = False
+    compute_score_from_judged_file: bool = False
     version: str = "2024-08-11"
     split: str = None
     output_dir: str = None
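
Together, patches 04 and 05 make the evaluator externally configurable: the parser and judge fields mirror arguments that mix-eval's eval/compute_metrics_p entry points read. A hypothetical customization (values are illustrative):

    config = MixEvalConfig()
    config.benchmark = "mixeval"  # the standard subset instead of mixeval_hard
    config.batch_size = 32
    evaluator = MixEvalEvaluator(config)

Here `free_form_parser`/`multi_choice_parser` set to "model" select model-based answer parsing, and `freeform_judge`/`multichoice_judge` name the judge model used for grading.
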
From bdd05625e5cd5cfc7fc10a8a330af5445fa6678a Mon Sep 17 00:00:00 2001
From: Alessandro Sordoni
Date: Sun, 1 Sep 2024 09:38:41 -0700
Subject: [PATCH 06/12] gpt-4o by default

---
 mttl/evaluators/mixeval.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/mttl/evaluators/mixeval.py b/mttl/evaluators/mixeval.py
index a10232549..f3ab4fc51 100644
--- a/mttl/evaluators/mixeval.py
+++ b/mttl/evaluators/mixeval.py
@@ -34,14 +34,15 @@ class MixEvalConfig:
     data_path: str = None
     free_form_parser: str = "model"
     multi_choice_parser: str = "model"
-    multichoice_judge: str = "gpt-3.5-turbo-0125"
-    freeform_judge: str = "gpt-3.5-turbo-0125"
+    multichoice_judge: str = "gpt-4o"
+    freeform_judge: str = "gpt-4o"
     extract_base_model_response: bool = False
     compute_score_from_judged_file: bool = False
     version: str = "2024-08-11"
     split: str = None
     output_dir: str = None
     verbose: bool = False
+    api_parallel_num: int = 10


 @register_model("mix_eval_expert_adapter")
@@ -158,6 +159,8 @@ def evaluate(
         verbose=False,
         **kwargs,
     ):
+        from mix_eval.compute_metrics import AVAILABLE_MODELS
+
         # inject model into MultiExpertAdapter
         MultiExpertAdapter.model_context.model = model

@@ -173,6 +176,8 @@ def evaluate(
             raise ValueError("Output path is required for evaluation.")

         eval(self.config)
+
+        AVAILABLE_MODELS[self.config.model_name] = "MultiExpertAdapter"
         compute_metrics_p(self.config)
From de5173b338b6c267aab3175a33c3082c2cdb4594 Mon Sep 17 00:00:00 2001
From: Alessandro Sordoni
Date: Sun, 1 Sep 2024 09:40:56 -0700
Subject: [PATCH 07/12] gpt-4o-mini by default

---
 mttl/evaluators/mixeval.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mttl/evaluators/mixeval.py b/mttl/evaluators/mixeval.py
index f3ab4fc51..7c5d9adf2 100644
--- a/mttl/evaluators/mixeval.py
+++ b/mttl/evaluators/mixeval.py
@@ -34,8 +34,8 @@ class MixEvalConfig:
     data_path: str = None
     free_form_parser: str = "model"
     multi_choice_parser: str = "model"
-    multichoice_judge: str = "gpt-4o"
-    freeform_judge: str = "gpt-4o"
+    multichoice_judge: str = "gpt-4o-mini"
+    freeform_judge: str = "gpt-4o-mini"
     extract_base_model_response: bool = False
     compute_score_from_judged_file: bool = False
     version: str = "2024-08-11"
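
Since the parsers are model-based, every scored sample triggers judge API calls, fanned out `api_parallel_num` ways; mix-eval reads the key for those calls from the `MODEL_PARSER_API` environment variable (patch 12 below adds an explicit guard for it). Something along these lines is needed before a run (placeholder key):

    import os

    # OpenAI-compatible key used by mix-eval's model parsers / judges
    os.environ.setdefault("MODEL_PARSER_API", "sk-...")

Moving the default judge from gpt-4o to gpt-4o-mini trades some judge accuracy for a much cheaper grading step; the local generation step is unaffected.
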
From a71c7b0c8720425d35261e08dac428221047932f Mon Sep 17 00:00:00 2001
From: Alessandro Sordoni
Date: Sun, 1 Sep 2024 09:44:17 -0700
Subject: [PATCH 08/12] return value fix

---
 mttl/evaluators/mixeval.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/mttl/evaluators/mixeval.py b/mttl/evaluators/mixeval.py
index 7c5d9adf2..da3e372e5 100644
--- a/mttl/evaluators/mixeval.py
+++ b/mttl/evaluators/mixeval.py
@@ -153,8 +153,6 @@ def evaluate(
         self,
         model,
         split=None,
-        shuffle=False,
-        subsample=-1,
         output_path=None,
         verbose=False,
         **kwargs,
@@ -177,12 +175,13 @@ def evaluate(

         eval(self.config)

+        # for some reason, available models is filled by hand rather than by the decorator, /shrug
         AVAILABLE_MODELS[self.config.model_name] = "MultiExpertAdapter"
         compute_metrics_p(self.config)

         with open(os.path.join(self.config.output_dir, "score.json"), "r") as f:
             score = json.load(f)
-        return score[self.config.model_name]["overall"]
+        return score[self.config.model_name]["overall score (final score)"]
From 25987edd6c509311f1ec8968a3555f597fc83394 Mon Sep 17 00:00:00 2001
From: Alessandro Sordoni
Date: Sun, 1 Sep 2024 12:30:45 -0700
Subject: [PATCH 09/12] guard task-names

---
 .../containers/selectors/per_token_selector.py | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/mttl/models/containers/selectors/per_token_selector.py b/mttl/models/containers/selectors/per_token_selector.py
index babe76be6..43078d1b9 100644
--- a/mttl/models/containers/selectors/per_token_selector.py
+++ b/mttl/models/containers/selectors/per_token_selector.py
@@ -106,7 +106,11 @@ def _log_angle(self, angle):
         else:
             mean_angle = angle.mean()

-        task = self.routing_infos.task_names[0]
+        task_names = self.routing_infos.task_names
+        if task_names is None:
+            return
+
+        task = task_names[0]

         to_store = {"angle": mean_angle.item()}
         self.metric_logger.update(prefix=f"task_{task}", value_dict=to_store)
@@ -126,7 +130,11 @@ def _log_entropy(self, logits):
         else:
             mean_entropy = entropy.mean()

-        task = self.routing_infos.task_names[0]
+        task_names = self.routing_infos.task_names
+        if task_names is None:
+            return
+
+        task = task_names[0]

         to_store = {"ent_routing": mean_entropy.item()}
         self.metric_logger.update(prefix=f"task_{task}", value_dict=to_store)
@@ -139,7 +147,10 @@ def _log_entropy(self, logits):
     def _maybe_log_in_dist(self, logits):
         probs = F.softmax(logits, dim=-1)
         bs, seq_len, _ = probs.size()
+
         task_names = self.routing_infos.task_names
+        if task_names is None:
+            return

         if all([t in self.task_to_expert_name for t in task_names]):
             expert_names = [self.task_to_expert_name[t] for t in task_names]
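
The return-value fix in patch 08 matches the key that mix-eval actually writes to score.json: the file maps the registered model name to a dict keyed by human-readable metric names. Roughly this shape, with illustrative numbers:

    # inferred from the lookup above, not the literal file contents
    score = {"mix_eval_expert_adapter": {"overall score (final score)": 0.412}}
    overall = score["mix_eval_expert_adapter"]["overall score (final score)"]

Patch 09 applies the same guard three times: when the routing metadata carries no task names, per-task metric logging is skipped instead of crashing on `None[0]`.
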
From 88888036422f53a342f20fed43f39998dae5cd4f Mon Sep 17 00:00:00 2001
From: Alessandro Sordoni
Date: Sun, 1 Sep 2024 12:42:23 -0700
Subject: [PATCH 10/12] raise only if context is defined

---
 mttl/models/packed_attention_monkey_patch.py | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/mttl/models/packed_attention_monkey_patch.py b/mttl/models/packed_attention_monkey_patch.py
index 188cc9faf..51bf6feb6 100644
--- a/mttl/models/packed_attention_monkey_patch.py
+++ b/mttl/models/packed_attention_monkey_patch.py
@@ -48,11 +48,11 @@ def flash_attn_varlen_func_wrapper(
     causal,
     **flash_kwargs,
 ):
-    if query_states.shape != key_states.shape:
-        raise ValueError("q and k must have the same shape")
-
     context = InfoContainer.get()
     if context is not None and context.routing_infos.packed_seq_lens is not None:
+        if query_states.shape != key_states.shape:
+            raise ValueError("q and k must have the same shape")
+
         warn_once(
             "\n\n\n\nUsing the Flash Attention 2 Sequence Packing Wrapper\n\n\n\n"
         )
@@ -89,15 +89,14 @@ def flash_attn_func_wrapper(
     deterministic=False,
     return_attn_probs=False,
 ):
-
-    if q.shape != k.shape:
-        raise ValueError("q and k must have the same shape")
-
     # assert there are no padding tokens if we get here
     context = InfoContainer.get()
     assert (context.routing_infos.attention_mask == 1).all()  # no padding tokens

     if context.routing_infos.packed_seq_lens is not None:
+        if q.shape != k.shape:
+            raise ValueError("q and k must have the same shape")
+
         cu_seqlens_q = cu_seqlens_k = context.routing_infos.packed_seq_lens
         max_seqlen_q = max_seqlen_k = context.routing_infos.seq_lens.max().item()
         q, k, v = q.flatten(0, 1), k.flatten(0, 1), v.flatten(0, 1)
From 09a5688dd2174752d9ad9033534162d3e6778937 Mon Sep 17 00:00:00 2001
From: Alessandro Sordoni
Date: Tue, 3 Sep 2024 12:10:05 -0700
Subject: [PATCH 11/12] recompute flag

---
 mttl/evaluators/mixeval.py | 26 +++++++++++++++++++-------
 1 file changed, 19 insertions(+), 7 deletions(-)

diff --git a/mttl/evaluators/mixeval.py b/mttl/evaluators/mixeval.py
index da3e372e5..5da0bb9d9 100644
--- a/mttl/evaluators/mixeval.py
+++ b/mttl/evaluators/mixeval.py
@@ -1,7 +1,10 @@
 import json
 import os
+import shutil
 import threading

+from mttl.models.base_model import BaseExpertModel
+
 try:
     from mix_eval.api.registry import register_model
     from mix_eval.evaluate import compute_metrics_p, eval, parse_args
@@ -28,7 +31,7 @@

 @dataclass
 class MixEvalConfig:
-    batch_size: int = 16
+    batch_size: int = 8
     model_name: str = "mix_eval_expert_adapter"
     benchmark: str = "mixeval_hard"
     data_path: str = None
@@ -84,7 +87,7 @@ def chunk_generate(
     def __init__(self, args):
         super().__init__(args)

-        self.model = self.model_context.model
+        self.model: BaseExpertModel = self.model_context.model
         self.tokenizer = get_tokenizer_with_args(
             model_name=self.model.base_model_name_or_path,
             model_family="gpt",
@@ -123,7 +126,6 @@ def __init__(self, config: MixEvalConfig = None):
         self.download_data()

     def download_data(self):
-        import shutil
         import subprocess

         import mix_eval
@@ -155,6 +157,7 @@ def evaluate(
         split=None,
         output_path=None,
         verbose=False,
+        recompute=False,
         **kwargs,
     ):
@@ -173,6 +176,9 @@ def evaluate(
             raise ValueError("Output path is required for evaluation.")

+        if recompute:
+            shutil.rmtree(self.config.output_dir, ignore_errors=True)
+
         eval(self.config)

         # for some reason, available models is filled by hand rather than by the decorator, /shrug
         AVAILABLE_MODELS[self.config.model_name] = "MultiExpertAdapter"
         compute_metrics_p(self.config)
@@ -185,9 +191,15 @@


 if __name__ == "__main__":
-    evaluator = MixEvalEvaluator()
-    model = MultiExpertModel(
-        MultiExpertModelConfig(base_model="microsoft/Phi-3-mini-4k-instruct"),
+    from mttl.models.containers.selectors import ArrowSelector, ArrowSelectorConfig
+    from mttl.models.library.library_transforms import ArrowConfig, ArrowTransform
+
+    model = MultiExpertModel.from_pretrained_library(
+        "sordonia/Phi-3.5-mini-instruct-28Aug",
         device_map="cuda:0",
+        attn_implementation="flash_attention_2",
+        selector_config=ArrowSelectorConfig(top_k=2),
+    )
+    MixEvalEvaluator().evaluate(
+        model, output_path="/tmp/mixeval_phi_3.5_arrow/", verbose=True, recompute=True
     )
-    evaluator.evaluate(model, output_path="/tmp/mixeval/")
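
For context on patch 10: flash-attention's varlen kernels take the packed batch as one long row plus cumulative sequence boundaries, which is why the wrapper forwards `packed_seq_lens` as `cu_seqlens_q`/`cu_seqlens_k`. Assuming `packed_seq_lens` stores those cumulative offsets, they relate to per-sequence lengths like this:

    import torch

    # three packed sequences of lengths 3, 5 and 2, concatenated into one row
    seq_lens = torch.tensor([3, 5, 2], dtype=torch.int32)
    cu_seqlens = torch.nn.functional.pad(seq_lens.cumsum(0), (1, 0)).to(torch.int32)
    # cu_seqlens is now tensor([0, 3, 8, 10], dtype=torch.int32)
    max_seqlen = int(seq_lens.max())  # 5

Moving the q/k shape check under the packed branch also means it now runs only when a packing context is actually present, since the strict shape requirement applies only on the varlen path.
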
From 07b0e653ecf88df83ef46ad2b2e16eadfb991f99 Mon Sep 17 00:00:00 2001
From: zhansu
Date: Mon, 23 Sep 2024 02:06:27 -0400
Subject: [PATCH 12/12] add api_base_url

---
 mttl/evaluators/mixeval.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/mttl/evaluators/mixeval.py b/mttl/evaluators/mixeval.py
index 5da0bb9d9..27e44a14d 100644
--- a/mttl/evaluators/mixeval.py
+++ b/mttl/evaluators/mixeval.py
@@ -194,12 +194,17 @@
     from mttl.models.containers.selectors import ArrowSelector, ArrowSelectorConfig
     from mttl.models.library.library_transforms import ArrowConfig, ArrowTransform

+    if not os.getenv("MODEL_PARSER_API"):
+        raise RuntimeError("MODEL_PARSER_API is not set")
+
+    mix_config = MixEvalConfig()
+    mix_config.api_base_url = "https://api.ai-gaochao.cn/v1"
     model = MultiExpertModel.from_pretrained_library(
-        "sordonia/Phi-3.5-mini-instruct-28Aug",
+        "zhan1993/private_library_phi3_flan_embedding_cluster10",
         device_map="cuda:0",
         attn_implementation="flash_attention_2",
         selector_config=ArrowSelectorConfig(top_k=2),
     )
-    MixEvalEvaluator().evaluate(
+    MixEvalEvaluator(mix_config).evaluate(
         model, output_path="/tmp/mixeval_phi_3.5_arrow/", verbose=True, recompute=True
     )
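
A final detail: MixEvalConfig never declares an `api_base_url` field, so the assignment above relies on plain (non-slotted) dataclass instances accepting new attributes at runtime:

    from dataclasses import dataclass

    @dataclass
    class C:
        x: int = 1

    c = C()
    c.api_base_url = "https://example.com/v1"  # fine: instances have a __dict__

Declaring the field in MixEvalConfig (for example, `api_base_url: str = None`) would make the knob visible next to the judge settings it affects.
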