From 0e5bd07382d811209aa4345d7cb263c82acf84f2 Mon Sep 17 00:00:00 2001
From: jean-mercat <jean.mercat@tri.global>
Date: Mon, 2 Jun 2025 18:43:25 -0700
Subject: [PATCH 1/6] fix a few benchmarks so that importing any of them works
 properly

---
 eval/chat_benchmarks/LiveCodeBench/eval_instruct.py | 13 ++++++-------
 eval/chat_benchmarks/MBPP/eval_instruct.py          |  4 ++--
 eval/chat_benchmarks/MTBench/eval_instruct.py       |  4 ++--
 eval/chat_benchmarks/MultiPLE/eval_instruct.py      |  2 +-
 4 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/eval/chat_benchmarks/LiveCodeBench/eval_instruct.py b/eval/chat_benchmarks/LiveCodeBench/eval_instruct.py
index 4ec23369..ac07773c 100644
--- a/eval/chat_benchmarks/LiveCodeBench/eval_instruct.py
+++ b/eval/chat_benchmarks/LiveCodeBench/eval_instruct.py
@@ -31,11 +31,11 @@ def has_code(response):
 
 # Calculate mean and standard error for all metrics
 def calc_stats(values):
-    arr    = np.asarray(values, dtype=float)
-    mask   = ~np.isnan(arr)
-    if mask.sum() == 0:          # all NaNs → undefined; return 0,0
+    arr = np.asarray(values, dtype=float)
+    mask = ~np.isnan(arr)
+    if mask.sum() == 0:  # all NaNs → undefined; return 0,0
         return 0.0, 0.0
-    mean   = arr[mask].mean()
+    mean = arr[mask].mean()
     stderr = np.std(arr[mask], ddof=1) / np.sqrt(mask.sum())
     return mean, stderr
 
@@ -51,8 +51,7 @@ def __init__(
         self,
         debug: bool = False,
         seed: List[int] = [0, 1234, 1234, 1234],
-        self.max_new_tokens = max_tokens,
-        max_tokens: Optional[int] = None,
+        max_tokens: Optional[int] = 32768,
         logger: Optional[logging.Logger] = None,
         system_instruction: Optional[str] = None,
     ):
@@ -67,7 +66,7 @@ def __init__(
         """
         super().__init__(logger=logger, system_instruction=system_instruction)
         self.debug = debug
-        self.max_new_tokens = max_tokens if max_tokens is not None else 32768 # set higher to avoid truncation for reasoning models
+        self.max_new_tokens = max_tokens
         self.seed = seed
         self.n_repeat = 6
 
diff --git a/eval/chat_benchmarks/MBPP/eval_instruct.py b/eval/chat_benchmarks/MBPP/eval_instruct.py
index e2d99209..882f2a69 100644
--- a/eval/chat_benchmarks/MBPP/eval_instruct.py
+++ b/eval/chat_benchmarks/MBPP/eval_instruct.py
@@ -25,7 +25,7 @@ def __init__(
         num_examples: int = 3,
         start_idx: int = 10,
         end_idx: int = 510,
-        debug: bool = False,        
+        debug: bool = False,
         max_tokens: Optional[int] = 512,
         logger: Optional[logging.Logger] = None,
         system_instruction: Optional[str] = None,
@@ -45,7 +45,7 @@ def __init__(
         """
         super().__init__(logger=logger, system_instruction=system_instruction)
         self.data_dir = data_dir
-        self.max_tokens = if max_tokens is not None else max_tokens
+        self.max_tokens = max_tokens
         self.num_examples = num_examples
         self.start_idx = start_idx
         self.end_idx = end_idx
diff --git a/eval/chat_benchmarks/MTBench/eval_instruct.py b/eval/chat_benchmarks/MTBench/eval_instruct.py
index f9820077..d075c900 100644
--- a/eval/chat_benchmarks/MTBench/eval_instruct.py
+++ b/eval/chat_benchmarks/MTBench/eval_instruct.py
@@ -71,7 +71,7 @@ def __init__(
         config: Optional[MTBenchConfig] = None,
         debug: bool = False,
         annotator_model: str = "gpt-4o-mini-2024-07-18",
-        max_tokens: Optional[int] = 1024
+        max_tokens: Optional[int] = 1024,
         logger: Optional[logging.Logger] = None,
         system_instruction: Optional[str] = None,
     ):
@@ -87,13 +87,13 @@ def __init__(
         """
         super().__init__(logger=logger, system_instruction=system_instruction)
         self.base_path = Path(base_path)
-        self.config.max_new_token = max_tokens if max_tokens is not None else 1024
         if annotator_model == "auto":
             annotator_model = "gpt-4"
         if config:
             print(f"Warning: Overwriting config.judge_model = {annotator_model} ")
             config.judge_model = annotator_model
         self.config = config or MTBenchConfig(judge_model=annotator_model)
+        self.config.max_new_token = max_tokens
         self.debug = debug
 
         # Setup paths
diff --git a/eval/chat_benchmarks/MultiPLE/eval_instruct.py b/eval/chat_benchmarks/MultiPLE/eval_instruct.py
index 3debe222..744fae11 100644
--- a/eval/chat_benchmarks/MultiPLE/eval_instruct.py
+++ b/eval/chat_benchmarks/MultiPLE/eval_instruct.py
@@ -9,7 +9,7 @@
 from lm_eval.api.instance import Instance
 from lm_eval.api.model import LM
 from multiple.evaluation import evaluate_functional_correctness
-from utils import extract_generation_code
+from .utils import extract_generation_code
 from eval.task import BaseBenchmark
 import traceback
 

From f3fe3619bd465997614d1ffba0c4067e577ea88f Mon Sep 17 00:00:00 2001
From: jean-mercat <jean.mercat@tri.global>
Date: Mon, 2 Jun 2025 18:45:58 -0700
Subject: [PATCH 2/6] ran black formatter

---
 eval/chat_benchmarks/AIME24/eval_instruct.py  |  4 +-
 eval/chat_benchmarks/AIME25/eval_instruct.py  |  2 +-
 eval/chat_benchmarks/AIW/eval_instruct.py     |  2 +-
 eval/chat_benchmarks/AMC23/eval_instruct.py   |  2 +-
 eval/chat_benchmarks/CodeElo/codeelo_utils.py | 22 ++++---
 eval/chat_benchmarks/CodeElo/eval_instruct.py |  6 +-
 .../CodeForces/codeforces_utils.py            | 17 ++---
 .../CodeForces/eval_instruct.py               |  4 +-
 eval/chat_benchmarks/HLE/eval_instruct.py     |  4 +-
 eval/chat_benchmarks/HLE/testing_utils.py     |  6 +-
 eval/chat_benchmarks/HMMT/eval_instruct.py    |  4 +-
 .../chat_benchmarks/JEEBench/eval_instruct.py |  4 +-
 .../LiveCodeBenchv5/eval_instruct.py          |  4 +-
 eval/chat_benchmarks/MATH500/eval_instruct.py |  4 +-
 eval/chat_benchmarks/MMLUPro/eval_instruct.py | 65 +++++++++----------
 .../RepoBench/eval_instruct.py                |  7 +-
 eval/distributed/benchmark_plot.py            |  2 +-
 eval/distributed/launch_local.py              | 63 ++++++++++--------
 eval/eval.py                                  | 12 +++-
 19 files changed, 131 insertions(+), 103 deletions(-)

diff --git a/eval/chat_benchmarks/AIME24/eval_instruct.py b/eval/chat_benchmarks/AIME24/eval_instruct.py
index 3fbd9b82..7cbe5701 100644
--- a/eval/chat_benchmarks/AIME24/eval_instruct.py
+++ b/eval/chat_benchmarks/AIME24/eval_instruct.py
@@ -44,7 +44,9 @@ def __init__(
         super().__init__(logger=logger, system_instruction=system_instruction)
         self.data_file = data_file
         self.debug = debug
-        self.max_new_tokens = max_tokens if max_tokens is not None else 32768    # set higher to avoid truncation for reasoning models
+        self.max_new_tokens = (
+            max_tokens if max_tokens is not None else 32768
+        )  # set higher to avoid truncation for reasoning models
         self.seed = seed
         self.n_repeat = 10
 
diff --git a/eval/chat_benchmarks/AIME25/eval_instruct.py b/eval/chat_benchmarks/AIME25/eval_instruct.py
index d6ed5bd7..08d339bd 100644
--- a/eval/chat_benchmarks/AIME25/eval_instruct.py
+++ b/eval/chat_benchmarks/AIME25/eval_instruct.py
@@ -43,7 +43,7 @@ def __init__(
         super().__init__(logger=logger, system_instruction=system_instruction)
         self.data_file = data_file
         self.debug = debug
-        self.max_new_tokens = max_tokens if max_tokens is not None else 32768  
+        self.max_new_tokens = max_tokens if max_tokens is not None else 32768
         self.seed = seed
         self.n_repeat = 10
 
diff --git a/eval/chat_benchmarks/AIW/eval_instruct.py b/eval/chat_benchmarks/AIW/eval_instruct.py
index 4a8d234f..96d8b04d 100644
--- a/eval/chat_benchmarks/AIW/eval_instruct.py
+++ b/eval/chat_benchmarks/AIW/eval_instruct.py
@@ -41,7 +41,7 @@ def __init__(
         super().__init__(logger=logger, system_instruction=system_instruction)
         self.data_file = data_file
         self.debug = debug
-        self.max_new_tokens = max_tokens if max_tokens is not None else 32768  
+        self.max_new_tokens = max_tokens if max_tokens is not None else 32768
         self.seed = seed
         self.n_trials = n_trials
 
diff --git a/eval/chat_benchmarks/AMC23/eval_instruct.py b/eval/chat_benchmarks/AMC23/eval_instruct.py
index 1a78e0bf..24f88e21 100644
--- a/eval/chat_benchmarks/AMC23/eval_instruct.py
+++ b/eval/chat_benchmarks/AMC23/eval_instruct.py
@@ -47,7 +47,7 @@ def __init__(
         self.data_file = data_file
         self.debug = debug
         self.seed = seed
-        self.max_new_tokens = max_tokens if max_tokens is not None else 32768  
+        self.max_new_tokens = max_tokens if max_tokens is not None else 32768
         self.n_repeat = 10
 
     def generate_responses(self, model: LM) -> Dict[str, Any]:
diff --git a/eval/chat_benchmarks/CodeElo/codeelo_utils.py b/eval/chat_benchmarks/CodeElo/codeelo_utils.py
index 7c35fe2f..054e5387 100644
--- a/eval/chat_benchmarks/CodeElo/codeelo_utils.py
+++ b/eval/chat_benchmarks/CodeElo/codeelo_utils.py
@@ -18,15 +18,17 @@
 
 import scipy.stats as stats
 
+
 def rating_to_difficulty(rating):
     if rating < 1000:
-        return 'Easy'
+        return "Easy"
     if rating < 1300:
-        return 'Medium'
+        return "Medium"
     if rating <= 3500:
-        return 'Hard'
+        return "Hard"
+
+    return "Easy"
 
-    return 'Easy'
 
 def reliability_guard(maximum_memory_bytes: Optional[int] = None):
     """
@@ -250,11 +252,13 @@ def codeelo_run(problem, completion, timeout, is_extracted):
         outs = tc[1]
         testtype = "stdin"
 
-        test_cases.append({
-            "input": ins,
-            "output": outs,
-            "testtype": testtype,
-        })
+        test_cases.append(
+            {
+                "input": ins,
+                "output": outs,
+                "testtype": testtype,
+            }
+        )
 
     manager = multiprocessing.Manager()
     result = manager.list()
diff --git a/eval/chat_benchmarks/CodeElo/eval_instruct.py b/eval/chat_benchmarks/CodeElo/eval_instruct.py
index 9faf2377..a561e836 100644
--- a/eval/chat_benchmarks/CodeElo/eval_instruct.py
+++ b/eval/chat_benchmarks/CodeElo/eval_instruct.py
@@ -48,7 +48,7 @@ def __init__(
         self,
         debug: bool = False,
         seed: List[int] = [0, 1234, 1234, 1234],
-        max_tokens: Optional[int]  = None,
+        max_tokens: Optional[int] = None,
         logger: Optional[logging.Logger] = None,
         system_instruction: Optional[str] = None,
     ):
@@ -63,7 +63,9 @@ def __init__(
         """
         super().__init__(logger=logger, system_instruction=system_instruction)
         self.debug = debug
-        self.max_new_tokens = max_tokens if max_tokens is not None else 32768  # set higher to avoid truncation for reasoning models
+        self.max_new_tokens = (
+            max_tokens if max_tokens is not None else 32768
+        )  # set higher to avoid truncation for reasoning models
         self.seed = seed
         self.n_repeat = 3
         self.filter_interaction_questions = True
diff --git a/eval/chat_benchmarks/CodeForces/codeforces_utils.py b/eval/chat_benchmarks/CodeForces/codeforces_utils.py
index 359eb84e..eb82a721 100644
--- a/eval/chat_benchmarks/CodeForces/codeforces_utils.py
+++ b/eval/chat_benchmarks/CodeForces/codeforces_utils.py
@@ -18,17 +18,19 @@
 
 import scipy.stats as stats
 
+
 def rating_to_difficulty(rating):
     if not rating:
-        return 'Easy'
+        return "Easy"
     if rating < 1000:
-        return 'Easy'
+        return "Easy"
     if rating < 1300:
-        return 'Medium'
+        return "Medium"
     if rating <= 3500:
-        return 'Hard'
+        return "Hard"
+
+    return "Easy"
 
-    return 'Easy'
 
 def reliability_guard(maximum_memory_bytes: Optional[int] = None):
     """
@@ -155,8 +157,8 @@ def run_test_std(completion, test_input, test_output):
         sys.stdin = io.StringIO(test_input)
         try:
             exec(f'__name__ = "__main__"\n{completion}' if '__name__ == "__main__"' in completion else completion, {})
-            out = output.getvalue().strip().replace('\n',' ').replace('\r', '')
-            expected = test_output.strip().replace('\n', ' ').replace('\r', '')
+            out = output.getvalue().strip().replace("\n", " ").replace("\r", "")
+            expected = test_output.strip().replace("\n", " ").replace("\r", "")
 
             return out == expected, output.getvalue().strip()
         finally:
@@ -247,7 +249,6 @@ def run_tests_for_one_example(test_cases, completion, result_list, is_extracted)
             return
 
 
-
 def codeforces_run(problem, completion, timeout, is_extracted):
     test_cases = problem["official_tests"]
     test_cases = [{**x, "testtype": "stdin"} for x in test_cases]
diff --git a/eval/chat_benchmarks/CodeForces/eval_instruct.py b/eval/chat_benchmarks/CodeForces/eval_instruct.py
index 75a77d75..30243392 100644
--- a/eval/chat_benchmarks/CodeForces/eval_instruct.py
+++ b/eval/chat_benchmarks/CodeForces/eval_instruct.py
@@ -62,9 +62,7 @@ def __init__(
         """
         super().__init__(logger=logger, system_instruction=system_instruction)
         self.debug = debug
-        self.max_new_tokens = (
-            max_tokens if max_tokens is not None else 32768
-        )
+        self.max_new_tokens = max_tokens if max_tokens is not None else 32768
         self.seed = seed
         self.n_repeat = 3
         self.filter_interaction_questions = True
diff --git a/eval/chat_benchmarks/HLE/eval_instruct.py b/eval/chat_benchmarks/HLE/eval_instruct.py
index 30b30885..8df8d9a6 100644
--- a/eval/chat_benchmarks/HLE/eval_instruct.py
+++ b/eval/chat_benchmarks/HLE/eval_instruct.py
@@ -77,7 +77,9 @@ def __init__(
         """
         super().__init__(logger=logger, system_instruction=system_instruction)
         self.debug = debug
-        self.max_new_tokens = max_tokens if max_tokens is not None else 32768  # set higher to avoid truncation for reasoning models
+        self.max_new_tokens = (
+            max_tokens if max_tokens is not None else 32768
+        )  # set higher to avoid truncation for reasoning models
         self.seed = seed
         self.n_repeat = 3
 
diff --git a/eval/chat_benchmarks/HLE/testing_utils.py b/eval/chat_benchmarks/HLE/testing_utils.py
index 1aff96e7..2ffd516e 100644
--- a/eval/chat_benchmarks/HLE/testing_utils.py
+++ b/eval/chat_benchmarks/HLE/testing_utils.py
@@ -7,11 +7,7 @@
 
 def get_multiple_choice_answer(pred: str):
     # Try to pull out “Answer: X”, “Answer: {X}” or “Answer: \boxed{X}”
-    m = re.search(
-        r"(?:Exact\s+)?Answer:\s*(?:\\boxed)?\{?([A-Z])\}?",
-        pred, 
-        re.IGNORECASE
-    )
+    m = re.search(r"(?:Exact\s+)?Answer:\s*(?:\\boxed)?\{?([A-Z])\}?", pred, re.IGNORECASE)
     if m:
         return m.group(1).upper()
 
diff --git a/eval/chat_benchmarks/HMMT/eval_instruct.py b/eval/chat_benchmarks/HMMT/eval_instruct.py
index 7302dbed..32b46dfb 100644
--- a/eval/chat_benchmarks/HMMT/eval_instruct.py
+++ b/eval/chat_benchmarks/HMMT/eval_instruct.py
@@ -47,7 +47,9 @@ def __init__(
         super().__init__(logger=logger, system_instruction=system_instruction)
         self.dataset_name = dataset_name
         self.debug = debug
-        self.max_new_tokens = max_tokens if max_tokens is not None else 32768  # set higher to avoid truncation for reasoning models
+        self.max_new_tokens = (
+            max_tokens if max_tokens is not None else 32768
+        )  # set higher to avoid truncation for reasoning models
         self.seed = seed
         self.n_repeat = 10
 
diff --git a/eval/chat_benchmarks/JEEBench/eval_instruct.py b/eval/chat_benchmarks/JEEBench/eval_instruct.py
index df1c514d..5ba53541 100644
--- a/eval/chat_benchmarks/JEEBench/eval_instruct.py
+++ b/eval/chat_benchmarks/JEEBench/eval_instruct.py
@@ -92,7 +92,9 @@ def __init__(
         """
         super().__init__(logger=logger, system_instruction=system_instruction)
         self.debug = debug
-        self.max_new_tokens = max_tokens if max_tokens is not None else 32768  # set higher to avoid truncation for reasoning models
+        self.max_new_tokens = (
+            max_tokens if max_tokens is not None else 32768
+        )  # set higher to avoid truncation for reasoning models
         self.seed = seed
         self.n_repeat = 3
 
diff --git a/eval/chat_benchmarks/LiveCodeBenchv5/eval_instruct.py b/eval/chat_benchmarks/LiveCodeBenchv5/eval_instruct.py
index 24d1ea4e..9ae5ee56 100644
--- a/eval/chat_benchmarks/LiveCodeBenchv5/eval_instruct.py
+++ b/eval/chat_benchmarks/LiveCodeBenchv5/eval_instruct.py
@@ -62,7 +62,9 @@ def __init__(
         """
         super().__init__(logger=logger, system_instruction=system_instruction)
         self.debug = debug
-        self.max_new_tokens = max_tokens if max_tokens is not None else 32768  # set higher to avoid truncation for reasoning models
+        self.max_new_tokens = (
+            max_tokens if max_tokens is not None else 32768
+        )  # set higher to avoid truncation for reasoning models
         self.seed = seed
         self.n_repeat = 3
 
diff --git a/eval/chat_benchmarks/MATH500/eval_instruct.py b/eval/chat_benchmarks/MATH500/eval_instruct.py
index 1eb64130..f082ce0f 100644
--- a/eval/chat_benchmarks/MATH500/eval_instruct.py
+++ b/eval/chat_benchmarks/MATH500/eval_instruct.py
@@ -45,7 +45,9 @@ def __init__(
         self.data_file = data_file
         self.debug = debug
         self.seed = seed
-        self.max_new_tokens = max_tokens if max_tokens is not None else 32768  # set higher to avoid truncation for reasoning models
+        self.max_new_tokens = (
+            max_tokens if max_tokens is not None else 32768
+        )  # set higher to avoid truncation for reasoning models
 
     def generate_responses(self, model: LM) -> Dict[str, Any]:
         """
diff --git a/eval/chat_benchmarks/MMLUPro/eval_instruct.py b/eval/chat_benchmarks/MMLUPro/eval_instruct.py
index 0b3d335f..8fdf68f7 100644
--- a/eval/chat_benchmarks/MMLUPro/eval_instruct.py
+++ b/eval/chat_benchmarks/MMLUPro/eval_instruct.py
@@ -16,6 +16,7 @@
 
 # --- Extraction helpers from https://github.com/TIGER-AI-Lab/MMLU-Pro/blob/main/evaluate_from_local.py ---
 
+
 def extract_answer(text: str) -> Optional[str]:
     pattern = r"answer is \(?([A-J])\)?"
     match = re.search(pattern, text, re.IGNORECASE)
@@ -41,20 +42,20 @@ def extract_final(text: str) -> Optional[str]:
 
 # --- Prompt construction from Script 1 ---
 
-choices = [chr(ord('A') + i) for i in range(16)]
+choices = [chr(ord("A") + i) for i in range(16)]
 
 
-def select_by_category(df: List[Dict[str, Any]] , subject: str) -> List[Dict[str, Any]]:
-    return [ex for ex in df if ex['category'] == subject]
+def select_by_category(df: List[Dict[str, Any]], subject: str) -> List[Dict[str, Any]]:
+    return [ex for ex in df if ex["category"] == subject]
 
 
 def format_cot_example(example: Dict[str, Any], including_answer: bool = True) -> str:
-    prompt = "Question:\n" + example['question'] + "\n"
+    prompt = "Question:\n" + example["question"] + "\n"
     prompt += "Options:\n"
-    for i, opt in enumerate(example['options']):
+    for i, opt in enumerate(example["options"]):
         prompt += f"{choices[i]}. {opt}\n"
     if including_answer:
-        cot = example['cot_content'].replace("A: Let's think step by step.", "Answer: Let's think step by step.")
+        cot = example["cot_content"].replace("A: Let's think step by step.", "Answer: Let's think step by step.")
         prompt += cot + "\n\n"
     else:
         prompt += "Answer: Let's think step by step."
@@ -65,7 +66,7 @@ def generate_cot_prompt(val_df: List[Dict[str, Any]], curr: Dict[str, Any], k: i
     # Load base template
     with open("./eval/chat_benchmarks/MMLUPro/initial_prompt.txt") as f:
         base = f.read()
-    subject = curr['category']
+    subject = curr["category"]
     support = select_by_category(val_df, subject)[:k]
     prompt = base.replace("{$}", subject) + "\n"
     for ex in support:
@@ -77,19 +78,21 @@ def generate_cot_prompt(val_df: List[Dict[str, Any]], curr: Dict[str, Any], k: i
 def preprocess(df: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
     out = []
     for ex in df:
-        opts = [o for o in ex['options'] if o != 'N/A']
-        ex['options'] = opts
+        opts = [o for o in ex["options"] if o != "N/A"]
+        ex["options"] = opts
         out.append(ex)
     return out
 
 
 # --- MMLUPro Benchmark with CoT prompting ---
 
+
 class MMLUProBenchmark(BaseBenchmark):
     """
     MMLU-Pro CoT Benchmark: harness-style but with dynamic few-shot CoT prompts
     and multi-stage regex answer extraction, reporting both overall and per-area accuracy.
     """
+
     def __init__(
         self,
         ntrain: int = 5,
@@ -109,8 +112,8 @@ def __init__(
         self.seed = seed
 
         ds = load_dataset(self.dataset_name)
-        self.test_examples = preprocess(ds['test'])
-        self.val_examples = preprocess(ds['validation'])
+        self.test_examples = preprocess(ds["test"])
+        self.val_examples = preprocess(ds["validation"])
 
         # prepare tokenizer for dynamic prompt length checks
         # model name will be set later in generate_responses
@@ -120,7 +123,8 @@ def generate_responses(self, model: LM) -> Dict[str, Any]:
         # initialize tokenizer on first use
         if self.tokenizer is None:
             from transformers import AutoTokenizer
-            model_name = getattr(model, 'pretrained', getattr(model, 'model_args', {}).get('model'))
+
+            model_name = getattr(model, "pretrained", getattr(model, "model_args", {}).get("model"))
             self.tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B-Instruct", trust_remote_code=True)
 
         instances = []
@@ -132,8 +136,8 @@ def generate_responses(self, model: LM) -> Dict[str, Any]:
             k = self.ntrain
             while k > 0:
                 prompt = generate_cot_prompt(self.val_examples, ex, k)
-                toks = self.tokenizer(prompt, return_tensors='pt')
-                length = toks['input_ids'].shape[1]
+                toks = self.tokenizer(prompt, return_tensors="pt")
+                length = toks["input_ids"].shape[1]
                 if length < self.max_model_length - self.max_new_tokens:
                     break
                 k -= 1
@@ -142,12 +146,7 @@ def generate_responses(self, model: LM) -> Dict[str, Any]:
             messages = [{"role": "user", "content": prompt}]
             templated = self._prepare_messages(messages, model)
             params = {"temperature": 0.0, "max_new_tokens": self.max_new_tokens, "seed": self.seed}
-            inst = Instance(
-                "generate_until",
-                ex,
-                (templated, params),
-                idx
-            )
+            inst = Instance("generate_until", ex, (templated, params), idx)
             instances.append(inst)
 
         outputs = self.compute(model, instances)
@@ -156,30 +155,29 @@ def generate_responses(self, model: LM) -> Dict[str, Any]:
             # unwrap different output types
             if isinstance(out, str):
                 text = out
-            elif hasattr(out, 'outputs') and out.outputs:
+            elif hasattr(out, "outputs") and out.outputs:
                 text = out.outputs[0].text
-            elif hasattr(out, 'text'):
+            elif hasattr(out, "text"):
                 text = out.text
             else:
                 text = str(out)
 
             pred = extract_answer(text)
             ex_copy = ex.copy()
-            ex_copy['model_outputs'] = text
-            ex_copy['pred'] = pred
+            ex_copy["model_outputs"] = text
+            ex_copy["pred"] = pred
             examples.append(ex_copy)
 
         return {"examples": examples}
 
-
     def evaluate_responses(self, results: Dict[str, Any]) -> Dict[str, float]:
         if results is None:
             return None
-    
+
         examples: List[Dict[str, Any]] = results["examples"]
         area_stats = defaultdict(lambda: {"corr": 0, "total": 0})
-        correct_flags: List[int] = []          # collect 1/0 for each example
-    
+        correct_flags: List[int] = []  # collect 1/0 for each example
+
         # accumulate per‑example correctness
         for ex in examples:
             cat = ex["category"]
@@ -187,20 +185,20 @@ def evaluate_responses(self, results: Dict[str, Any]) -> Dict[str, float]:
             area_stats[cat]["total"] += 1
             area_stats[cat]["corr"] += correct
             correct_flags.append(correct)
-    
+
         n = len(correct_flags)
         flags_arr = np.asarray(correct_flags, dtype=float)
-    
+
         # micro accuracy and its **empirical** standard error
         overall_accuracy = float(flags_arr.mean())
         overall_accuracy_stderr = float(flags_arr.std(ddof=1) / math.sqrt(n))
-    
+
         out: Dict[str, float] = {
             "accuracy_avg": overall_accuracy,
             "accuracy_std_err": overall_accuracy_stderr,
             "total_examples": n,
         }
-    
+
         # per‑category stats (needed for macro‑averages)
         per_area_acc: List[float] = []
         for cat, vals in area_stats.items():
@@ -208,6 +206,5 @@ def evaluate_responses(self, results: Dict[str, Any]) -> Dict[str, float]:
             out[f"accuracy_{cat}"] = acc
             out[f"count_{cat}"] = vals["total"]
             per_area_acc.append(acc)
-    
-        return out
 
+        return out
diff --git a/eval/chat_benchmarks/RepoBench/eval_instruct.py b/eval/chat_benchmarks/RepoBench/eval_instruct.py
index d55f4379..bba271c9 100644
--- a/eval/chat_benchmarks/RepoBench/eval_instruct.py
+++ b/eval/chat_benchmarks/RepoBench/eval_instruct.py
@@ -94,7 +94,12 @@ def generate_responses(self, model: LM) -> Dict[str, Any]:
                             example,
                             (
                                 prompt,
-                                {"max_new_tokens": self.max_tokens, "temperature": 0.2, "top_p": 0.95, "do_sample": True},
+                                {
+                                    "max_new_tokens": self.max_tokens,
+                                    "temperature": 0.2,
+                                    "top_p": 0.95,
+                                    "do_sample": True,
+                                },
                             ),
                             idx,
                         )
diff --git a/eval/distributed/benchmark_plot.py b/eval/distributed/benchmark_plot.py
index b373f42e..8942c0e3 100644
--- a/eval/distributed/benchmark_plot.py
+++ b/eval/distributed/benchmark_plot.py
@@ -1,5 +1,5 @@
 """
-SHARDS=N && cd $EVALCHEMY && source /leonardo_work/EUHPC_E03_068/DCFT_shared/mamba/bin/activate /leonardo_work/EUHPC_E03_068/DCFT_shared/evalchemy/env/cpu-evalchemy && python eval/distributed/launch.py --model_name open-thoughts/OpenThinker-7B --tasks LiveCodeBench,AIME24,AIME25,AMC23,GPQADiamond,MATH500 --num_shards $SHARDS --watchdog 
+SHARDS=N && cd $EVALCHEMY && source /leonardo_work/EUHPC_E03_068/DCFT_shared/mamba/bin/activate /leonardo_work/EUHPC_E03_068/DCFT_shared/evalchemy/env/cpu-evalchemy && python eval/distributed/launch.py --model_name open-thoughts/OpenThinker-7B --tasks LiveCodeBench,AIME24,AIME25,AMC23,GPQADiamond,MATH500 --num_shards $SHARDS --watchdog
 """
 
 import matplotlib.pyplot as plt
diff --git a/eval/distributed/launch_local.py b/eval/distributed/launch_local.py
index 704a9800..b2a7a25c 100644
--- a/eval/distributed/launch_local.py
+++ b/eval/distributed/launch_local.py
@@ -82,8 +82,8 @@ def check_required_env_vars(mode="auto"):
         hostname, _, _ = execute_command(cmd, verbose=False)
         is_local = not ("c1" in hostname or "leonardo" in hostname)
     else:
-        is_local = (mode == "local")
-    
+        is_local = mode == "local"
+
     required_vars = ["HF_TOKEN", "DB_PASSWORD", "DB_HOST", "DB_PORT", "DB_NAME", "DB_USER"]
     missing_vars = []
     for var in required_vars:
@@ -110,7 +110,7 @@ def check_required_env_vars(mode="auto"):
         print_info(f"No specific cluster detected, using default local HF_HUB_CACHE: {hf_hub_cache}")
         # Ensure the directory exists
         os.makedirs(hf_hub_cache, exist_ok=True)
-    
+
     current_hub_cache = os.environ.get("HF_HUB_CACHE")
     if current_hub_cache is not None and current_hub_cache != hf_hub_cache:
         print_warning(f"Overwriting existing HF_HUB_CACHE value '{current_hub_cache}' with '{hf_hub_cache}'")
@@ -144,7 +144,7 @@ def check_conda_env(mode="auto", watchdog=False):
         hostname, _, _ = execute_command(cmd, verbose=False)
         is_local = not ("c1" in hostname or "leonardo" in hostname)
     else:
-        is_local = (mode == "local")
+        is_local = mode == "local"
 
     # Check hostname to determine which conda environment we should be in
     cmd = "echo $HOSTNAME"
@@ -165,7 +165,7 @@ def check_conda_env(mode="auto", watchdog=False):
     else:
         # For local environments, we don't enforce a specific Python path
         print_info(f"Local environment detected, not enforcing specific conda environment")
-        
+
         # If watchdog is enabled, just check if Python is accessible
         if watchdog and is_local:
             cmd = "which python"
@@ -174,7 +174,7 @@ def check_conda_env(mode="auto", watchdog=False):
                 print_error("Python not found. Please make sure Python is installed and available in your PATH.")
                 return False
             print_info(f"Using Python from: {stdout}")
-        
+
         return True
 
     # we'll check if the environment exists
@@ -293,10 +293,12 @@ def launch_local(
         script_content = f.read()
 
     # Replace parameters in the script using regex pattern matching
-    script_content = re.sub(r"export GLOBAL_SIZE=.*", f'export GLOBAL_SIZE={num_shards}', script_content)
+    script_content = re.sub(r"export GLOBAL_SIZE=.*", f"export GLOBAL_SIZE={num_shards}", script_content)
     script_content = re.sub(r"export MODEL_NAME=.*", f'export MODEL_NAME="{model_path}"', script_content)
     script_content = re.sub(r"export INPUT_DATASET=.*", f'export INPUT_DATASET="{dataset_path}"', script_content)
-    script_content = re.sub(r"export OUTPUT_DATASET=.*", f'export OUTPUT_DATASET="{output_dataset_dir}"', script_content)
+    script_content = re.sub(
+        r"export OUTPUT_DATASET=.*", f'export OUTPUT_DATASET="{output_dataset_dir}"', script_content
+    )
 
     # Update the GPU range in the for loop based on num_shards
     gpu_range_pattern = r"for RANK in \{.*\}; do"
@@ -339,7 +341,7 @@ def launch_eval_sbatch(cmd, logs_dir):
     """Launch the sbatch job for evaluation step."""
     print_header("Launching SBATCH Job")
 
-    sbatch_script = "eval/distributed/run_evaluations_tacc.sbatch" 
+    sbatch_script = "eval/distributed/run_evaluations_tacc.sbatch"
     # Create a temporary sbatch script with the correct parameters
     temp_sbatch_file = os.path.join(logs_dir, "job.sbatch")
     with open(sbatch_script, "r") as f:
@@ -377,6 +379,7 @@ def launch_eval_sbatch(cmd, logs_dir):
 
     return job_id
 
+
 def launch_sbatch(
     model_path,
     dataset_path,
@@ -473,7 +476,7 @@ def monitor_local_job(job_id, logs_dir, num_shards, watchdog_interval_min=1):
 
     # Determine the log file pattern based on the job ID
     log_file = f"{logs_dir}/{job_id}.out"
-    
+
     # Make sure there's enough time for jobs to start
     time.sleep(5)
 
@@ -484,7 +487,7 @@ def monitor_local_job(job_id, logs_dir, num_shards, watchdog_interval_min=1):
             cmd = "ps aux | grep process_shard.py | grep -v grep | wc -l"
             stdout, _, _ = execute_command(cmd, verbose=False)
             running_count = int(stdout.strip())
-            
+
             # Count various progress indicators
             progress_metrics = [
                 ("Shards started", f'grep -c "processing shard" {log_file}'),
@@ -492,27 +495,29 @@ def monitor_local_job(job_id, logs_dir, num_shards, watchdog_interval_min=1):
                 ("Engines initialized", f'grep -c "init engine" {log_file}'),
                 ("Completed shards", f'grep -c "Shard successfully processed" {log_file}'),
             ]
-            
+
             results = {}
             for label, cmd in progress_metrics:
                 stdout, _, _ = execute_command(cmd, verbose=False)
                 count = int(stdout.strip()) if stdout.strip().isdigit() else 0
                 results[label] = count
-                
-            print_info(f"({counter*watchdog_interval_min}m) Local Job Status: {results['Completed shards']} completed, {running_count} processes running")
-            
+
+            print_info(
+                f"({counter*watchdog_interval_min}m) Local Job Status: {results['Completed shards']} completed, {running_count} processes running"
+            )
+
             for label, count in results.items():
                 percentage = (count / num_shards) * 100
                 print(f"  {label}: {count}/{num_shards} ({percentage:.1f}%)")
-                
+
             # Check if all shards are completed
-            if results['Completed shards'] >= num_shards or running_count == 0:
-                if results['Completed shards'] >= num_shards:
+            if results["Completed shards"] >= num_shards or running_count == 0:
+                if results["Completed shards"] >= num_shards:
                     print_success("All shards have been processed")
                 else:
                     print_warning("No processes running but not all shards completed")
                 break
-                
+
             # Wait before checking again
             time.sleep(watchdog_interval_min * 60)
             counter += 1
@@ -526,7 +531,7 @@ def monitor_job(job_id, logs_dir, num_shards, watchdog_interval_min=1):
     # Check if this is a local job
     if job_id.startswith("local_"):
         return monitor_local_job(job_id, logs_dir, num_shards, watchdog_interval_min)
-        
+
     print_header("Monitoring Job Progress")
 
     # Determine the log file pattern based on the job ID
@@ -616,20 +621,20 @@ def monitor_job(job_id, logs_dir, num_shards, watchdog_interval_min=1):
 def check_local_job_completion(job_id, output_dir=None):
     """Check if a local job completed successfully."""
     print_header("Checking Local Job Completion")
-    
+
     # Check if the output directory contains parquet files
     if output_dir:
         cmd = f"ls -1 {output_dir}/*.parquet 2>/dev/null | wc -l"
         stdout, _, _ = execute_command(cmd)
         file_count = int(stdout.strip())
         print_info(f"Found {file_count} parquet files in {output_dir}")
-        
+
         # Check the log file for errors
         log_file = f"logs/{output_dir.split('/')[-1]}/{job_id}.out"
         cmd = f"grep -c 'ERROR' {log_file}"
         stdout, _, _ = execute_command(cmd)
         error_count = int(stdout.strip()) if stdout.strip().isdigit() else 0
-        
+
         if error_count > 0:
             print_warning(f"Found {error_count} errors in log file")
             # Show a sample of errors
@@ -639,11 +644,11 @@ def check_local_job_completion(job_id, output_dir=None):
                 print_warning("Sample errors:")
                 for line in stdout.strip().split("\n"):
                     print_warning(f"  {line}")
-        
+
         # Return true if we have a reasonable number of parquet files
         # This is a heuristic - adjust based on your needs
         return file_count > 0
-    
+
     return False
 
 
@@ -652,7 +657,7 @@ def check_job_completion(job_id, output_dir=None):
     # Check if this is a local job
     if job_id.startswith("local_"):
         return check_local_job_completion(job_id, output_dir)
-    
+
     print_header("Checking Job Completion")
 
     # Define job states
@@ -957,7 +962,9 @@ def main():
     local_num_shards = args.num_shards
     if processing_mode == "local":
         if local_num_shards > args.num_gpus:
-            print_warning(f"Limiting number of shards to {args.num_gpus} for local processing (number of available GPUs)")
+            print_warning(
+                f"Limiting number of shards to {args.num_gpus} for local processing (number of available GPUs)"
+            )
             local_num_shards = args.num_gpus
 
     # Launch job with the dataset repo but save to output repo
@@ -981,7 +988,7 @@ def main():
             args.max_job_duration,
             args.tp4,
         )
-    
+
     if not job_id:
         sys.exit(1)
 
diff --git a/eval/eval.py b/eval/eval.py
index 4a26153d..2e703480 100644
--- a/eval/eval.py
+++ b/eval/eval.py
@@ -32,7 +32,9 @@
 from eval.task import TaskManager as InstructTaskManager
 
 
-_BIT_CAP = 15_000 
+_BIT_CAP = 15_000
+
+
 def handle_non_serializable_extended(o):
     """
     Delegates to the stock helper, but for gigantic SymPy Integer /
@@ -40,11 +42,12 @@ def handle_non_serializable_extended(o):
     """
     try:
         from sympy import Integer, Rational
+
         if isinstance(o, Integer):
             if o.p.bit_length() > _BIT_CAP:
                 digits = int(o.p.bit_length() * math.log10(2)) + 1
                 return f"<Integer ~{digits} digits>"
-            return str(int(o)) # safe: fits under the guard
+            return str(int(o))  # safe: fits under the guard
 
         if isinstance(o, Rational):
             num_bits = o.p.bit_length()
@@ -60,6 +63,7 @@ def handle_non_serializable_extended(o):
     # Everything else: NumPy ints, sets, etc.
     return _orig_handle(o)
 
+
 def setup_custom_parser():
     """
     Create a custom argument parser that extends lm-eval-harness parser.
@@ -118,7 +122,9 @@ def setup_custom_parser():
     )
 
     parser.add_argument(
-        "--config", type=str, help="Path to config yaml. Overwrites --batch_size, --tasks, --annotator_model, and --max_tokens"
+        "--config",
+        type=str,
+        help="Path to config yaml. Overwrites --batch_size, --tasks, --annotator_model, and --max_tokens",
     )
     parser.add_argument(
         "--debug",

From 61f42c97c37fbc855068c4680d183bc05b367847 Mon Sep 17 00:00:00 2001
From: jean-mercat <jean.mercat@tri.global>
Date: Mon, 2 Jun 2025 18:47:39 -0700
Subject: [PATCH 3/6] ran black[colorama]==23.1.0 formatting

---
 eval/chat_benchmarks/BigCodeBench/execution.py              | 2 --
 eval/chat_benchmarks/CruxEval/evaluation.py                 | 1 -
 eval/chat_benchmarks/CruxEval/execution.py                  | 1 -
 eval/chat_benchmarks/HMMT/matharena/api.py                  | 1 -
 eval/chat_benchmarks/HMMT/matharena/parser.py               | 1 -
 eval/chat_benchmarks/HumanEval/human_eval/evaluation.py     | 1 -
 eval/chat_benchmarks/HumanEval/human_eval/execution.py      | 2 --
 eval/chat_benchmarks/HumanEval/utils/dataset.py             | 1 -
 .../HumanEvalPlus/human_eval_plus/evaluation.py             | 1 -
 .../HumanEvalPlus/human_eval_plus/execution.py              | 2 --
 eval/chat_benchmarks/HumanEvalPlus/utils/dataset.py         | 1 -
 eval/chat_benchmarks/IFEval/instructions_test.py            | 1 -
 .../LiveBench/livebench/download_leaderboard.py             | 2 --
 eval/chat_benchmarks/LiveBench/livebench/gen_api_answer.py  | 1 -
 .../instruction_following_eval/instructions_test.py         | 1 -
 .../instruction_following_eval/instructions_util_test.py    | 1 -
 .../livebench/lcb_runner/evaluation/testing_util.py         | 3 ---
 .../LiveBench/livebench/model/completions.py                | 1 -
 .../LiveBench/livebench/process_results/coding/utils.py     | 1 -
 .../livebench/process_results/data_analysis/cta/utils.py    | 1 -
 .../livebench/process_results/math/AMPS_Hard/utils.py       | 2 --
 .../process_results/math/math_competitions/utils.py         | 1 -
 .../livebench/process_results/reasoning/spatial/utils.py    | 1 -
 .../process_results/reasoning/web_of_lies_v2/utils.py       | 1 -
 .../livebench/process_results/writing/connections/utils.py  | 3 ---
 .../livebench/process_results/writing/typos/utils.py        | 2 --
 .../LiveBench/livebench/scripts/code_question_to_csv.py     | 1 -
 .../LiveBench/livebench/show_livebench_result.py            | 1 -
 eval/chat_benchmarks/MBPP/human_eval/evaluation.py          | 1 -
 eval/chat_benchmarks/MBPP/human_eval/execution.py           | 2 --
 eval/chat_benchmarks/MBPP/utils/dataset.py                  | 1 -
 eval/chat_benchmarks/MBPPPlus/mbpp_plus/evaluation.py       | 1 -
 eval/chat_benchmarks/MBPPPlus/mbpp_plus/execution.py        | 2 --
 eval/chat_benchmarks/MBPPPlus/utils/dataset.py              | 1 -
 .../MixEval/mix_eval/prompts/evaluation_prompts.py          | 1 -
 .../chat_benchmarks/MultiPLE/multiple/containerized_eval.py | 1 -
 eval/chat_benchmarks/MultiPLE/multiple/eval_java.py         | 1 -
 eval/chat_benchmarks/MultiPLE/multiple/evaluation.py        | 1 -
 eval/chat_benchmarks/MultiPLE/utils.py                      | 1 -
 eval/chat_benchmarks/RepoBench/data/data/utils.py           | 1 -
 eval/chat_benchmarks/RepoBench/data/utils.py                | 1 -
 eval/chat_benchmarks/RepoBench/eval.py                      | 2 --
 eval/chat_benchmarks/RepoBench/run.py                       | 1 -
 .../WildBench/leaderboard/data_dir/_create_tables.py        | 1 -
 eval/chat_benchmarks/WildBench/src/eval.py                  | 1 -
 eval/chat_benchmarks/WildBench/src/fastchat_conversation.py | 1 -
 eval/chat_benchmarks/WildBench/src/hf_models.py             | 2 --
 eval/chat_benchmarks/WildBench/src/view_wb_eval.py          | 1 -
 .../alpaca_eval/docs/format_export_leaderboards.py          | 6 +++---
 eval/chat_benchmarks/zeroeval/src/evaluation/crux_eval.py   | 1 -
 eval/chat_benchmarks/zeroeval/src/evaluation/math_eval.py   | 1 -
 eval/chat_benchmarks/zeroeval/src/fastchat_conversation.py  | 1 -
 eval/chat_benchmarks/zeroeval/src/hf_models.py              | 4 ----
 eval/chat_benchmarks/zeroeval/src/unified_infer.py          | 1 -
 eval/eval.py                                                | 4 +++-
 55 files changed, 6 insertions(+), 74 deletions(-)

diff --git a/eval/chat_benchmarks/BigCodeBench/execution.py b/eval/chat_benchmarks/BigCodeBench/execution.py
index 87669153..4fde763d 100644
--- a/eval/chat_benchmarks/BigCodeBench/execution.py
+++ b/eval/chat_benchmarks/BigCodeBench/execution.py
@@ -60,9 +60,7 @@ def check_correctness(
     def unsafe_execute(tmp_dir):
         random_id = random.randint(1, 100000)
         if "python" in language_type.lower():
-
             with create_tempdir():
-
                 # These system calls are needed when cleaning up tempdir.
                 import os
                 import shutil
diff --git a/eval/chat_benchmarks/CruxEval/evaluation.py b/eval/chat_benchmarks/CruxEval/evaluation.py
index 89ff5412..45eb2ce9 100644
--- a/eval/chat_benchmarks/CruxEval/evaluation.py
+++ b/eval/chat_benchmarks/CruxEval/evaluation.py
@@ -72,7 +72,6 @@ def evaluate_generations(
     sample_jsonl = stream_jsonl_all(input_file)
 
     with ThreadPoolExecutor(max_workers=n_workers) as executor:
-
         futures = []
         completion_id = Counter()
         n_samples = 0
diff --git a/eval/chat_benchmarks/CruxEval/execution.py b/eval/chat_benchmarks/CruxEval/execution.py
index 5ff992f7..6ca398f7 100644
--- a/eval/chat_benchmarks/CruxEval/execution.py
+++ b/eval/chat_benchmarks/CruxEval/execution.py
@@ -38,7 +38,6 @@ def unsafe_execute(tmp_dir):
         random_id = random.randint(1, 100000)
         if "python" in language_type.lower():
             with create_tempdir():
-
                 # These system calls are needed when cleaning up tempdir.
                 import os
                 import shutil
diff --git a/eval/chat_benchmarks/HMMT/matharena/api.py b/eval/chat_benchmarks/HMMT/matharena/api.py
index 70612f88..00870241 100644
--- a/eval/chat_benchmarks/HMMT/matharena/api.py
+++ b/eval/chat_benchmarks/HMMT/matharena/api.py
@@ -48,7 +48,6 @@ def __init__(
         openai_responses=False,
         **kwargs,
     ):
-
         # if "think" in model and api == "google":
         #     logger.info("Google Think model does not allow chat.")
         #     is_chat = False # think model cannot handle chat
diff --git a/eval/chat_benchmarks/HMMT/matharena/parser.py b/eval/chat_benchmarks/HMMT/matharena/parser.py
index 28800120..059d9a5c 100644
--- a/eval/chat_benchmarks/HMMT/matharena/parser.py
+++ b/eval/chat_benchmarks/HMMT/matharena/parser.py
@@ -366,7 +366,6 @@ def parse(cls, string, primitive_type):
                     break
 
             for _ in range(5):
-
                 init_str = latex_str
                 latex_str = re.sub(r"\{(\d+)\}", r"(\1)", latex_str)
                 latex_str = re.sub(r"\\*(?:dfrac|tfrac|frac)\{([^{}]*)\}\{([^{}]*)\}", r"(\1)/(\2)", latex_str)
diff --git a/eval/chat_benchmarks/HumanEval/human_eval/evaluation.py b/eval/chat_benchmarks/HumanEval/human_eval/evaluation.py
index 148b4ea5..7d9e36fe 100644
--- a/eval/chat_benchmarks/HumanEval/human_eval/evaluation.py
+++ b/eval/chat_benchmarks/HumanEval/human_eval/evaluation.py
@@ -226,7 +226,6 @@ def evaluate_functional_correctness(
     sample_jsonl = stream_jsonl_all(input_file)
 
     with ThreadPoolExecutor(max_workers=n_workers) as executor:
-
         futures = []
         completion_id = Counter()
         n_samples = 0
diff --git a/eval/chat_benchmarks/HumanEval/human_eval/execution.py b/eval/chat_benchmarks/HumanEval/human_eval/execution.py
index 2898c759..46040375 100644
--- a/eval/chat_benchmarks/HumanEval/human_eval/execution.py
+++ b/eval/chat_benchmarks/HumanEval/human_eval/execution.py
@@ -38,7 +38,6 @@ def unsafe_execute(tmp_dir):
         random_id = random.randint(1, 100000)
         if "python" in language_type.lower():
             with create_tempdir():
-
                 # These system calls are needed when cleaning up tempdir.
                 import os
                 import shutil
@@ -493,7 +492,6 @@ def unsafe_execute(tmp_dir):
 
             # 0 means success
             if returned_val_compilation == 0:
-
                 # Execution pipeline
                 cargo_test: str = "cargo test --bin " + file_prefix + " --message-format json >> " + log_path
                 returned_val_execution = os.system(cargo_test)
diff --git a/eval/chat_benchmarks/HumanEval/utils/dataset.py b/eval/chat_benchmarks/HumanEval/utils/dataset.py
index 4ddd453e..1ee21d61 100644
--- a/eval/chat_benchmarks/HumanEval/utils/dataset.py
+++ b/eval/chat_benchmarks/HumanEval/utils/dataset.py
@@ -4,7 +4,6 @@
 
 
 class HumanEvalDataset:
-
     def __init__(self, root, sample_num=1, language="python", issft=False):
         """
         root: the path to the HumanEval dataset
diff --git a/eval/chat_benchmarks/HumanEvalPlus/human_eval_plus/evaluation.py b/eval/chat_benchmarks/HumanEvalPlus/human_eval_plus/evaluation.py
index 148b4ea5..7d9e36fe 100644
--- a/eval/chat_benchmarks/HumanEvalPlus/human_eval_plus/evaluation.py
+++ b/eval/chat_benchmarks/HumanEvalPlus/human_eval_plus/evaluation.py
@@ -226,7 +226,6 @@ def evaluate_functional_correctness(
     sample_jsonl = stream_jsonl_all(input_file)
 
     with ThreadPoolExecutor(max_workers=n_workers) as executor:
-
         futures = []
         completion_id = Counter()
         n_samples = 0
diff --git a/eval/chat_benchmarks/HumanEvalPlus/human_eval_plus/execution.py b/eval/chat_benchmarks/HumanEvalPlus/human_eval_plus/execution.py
index 2898c759..46040375 100644
--- a/eval/chat_benchmarks/HumanEvalPlus/human_eval_plus/execution.py
+++ b/eval/chat_benchmarks/HumanEvalPlus/human_eval_plus/execution.py
@@ -38,7 +38,6 @@ def unsafe_execute(tmp_dir):
         random_id = random.randint(1, 100000)
         if "python" in language_type.lower():
             with create_tempdir():
-
                 # These system calls are needed when cleaning up tempdir.
                 import os
                 import shutil
@@ -493,7 +492,6 @@ def unsafe_execute(tmp_dir):
 
             # 0 means success
             if returned_val_compilation == 0:
-
                 # Execution pipeline
                 cargo_test: str = "cargo test --bin " + file_prefix + " --message-format json >> " + log_path
                 returned_val_execution = os.system(cargo_test)
diff --git a/eval/chat_benchmarks/HumanEvalPlus/utils/dataset.py b/eval/chat_benchmarks/HumanEvalPlus/utils/dataset.py
index 71fe326f..2b6d1922 100644
--- a/eval/chat_benchmarks/HumanEvalPlus/utils/dataset.py
+++ b/eval/chat_benchmarks/HumanEvalPlus/utils/dataset.py
@@ -4,7 +4,6 @@
 
 
 class HumanEvalPlusDataset:
-
     def __init__(self, root, sample_num=1, language="python", issft=False):
         """
         root: the path to the HumanEvalPlus dataset
diff --git a/eval/chat_benchmarks/IFEval/instructions_test.py b/eval/chat_benchmarks/IFEval/instructions_test.py
index 6e9a11b9..d2760608 100644
--- a/eval/chat_benchmarks/IFEval/instructions_test.py
+++ b/eval/chat_benchmarks/IFEval/instructions_test.py
@@ -24,7 +24,6 @@
 
 # pylint:disable=g-complex-comprehension
 class InstructionsTest(parameterized.TestCase):
-
     @parameterized.named_parameters(
         [
             {
diff --git a/eval/chat_benchmarks/LiveBench/livebench/download_leaderboard.py b/eval/chat_benchmarks/LiveBench/livebench/download_leaderboard.py
index 554ee5cd..7063e55a 100644
--- a/eval/chat_benchmarks/LiveBench/livebench/download_leaderboard.py
+++ b/eval/chat_benchmarks/LiveBench/livebench/download_leaderboard.py
@@ -11,7 +11,6 @@
 model_answer, model_judgment = load_answers_judgments()
 
 for dir_name, dataset in [("model_answer", model_answer), ("model_judgment", model_judgment)]:
-
     categories, tasks = get_categories_tasks(LIVE_BENCH_DATA_SUPER_PATH)
 
     for category_name, task_names in tqdm(tasks.items()):
@@ -19,7 +18,6 @@
         for task_name in task_names:
             rows_task = [r for r in rows if r["task"] == task_name]
             if dir_name == "model_judgment":
-
                 task_path = f"data/{LIVE_BENCH_DATA_SUPER_PATH}/{category_name}/{task_name}/{dir_name}"
                 file_path = f"{task_path}/ground_truth_judgment.jsonl"
 
diff --git a/eval/chat_benchmarks/LiveBench/livebench/gen_api_answer.py b/eval/chat_benchmarks/LiveBench/livebench/gen_api_answer.py
index 9cb34f34..c96d56e8 100644
--- a/eval/chat_benchmarks/LiveBench/livebench/gen_api_answer.py
+++ b/eval/chat_benchmarks/LiveBench/livebench/gen_api_answer.py
@@ -125,7 +125,6 @@ def run_questions(
         if len(questions) > 0:
             reorg_answer_file(answer_file)
     else:
-
         with concurrent.futures.ThreadPoolExecutor(max_workers=parallel) as executor:
             futures = []
             for question in questions:
diff --git a/eval/chat_benchmarks/LiveBench/livebench/if_runner/instruction_following_eval/instructions_test.py b/eval/chat_benchmarks/LiveBench/livebench/if_runner/instruction_following_eval/instructions_test.py
index 189737f6..157e16dc 100644
--- a/eval/chat_benchmarks/LiveBench/livebench/if_runner/instruction_following_eval/instructions_test.py
+++ b/eval/chat_benchmarks/LiveBench/livebench/if_runner/instruction_following_eval/instructions_test.py
@@ -22,7 +22,6 @@
 
 # pylint:disable=g-complex-comprehension
 class InstructionsTest(parameterized.TestCase):
-
     @parameterized.named_parameters(
         [
             {
diff --git a/eval/chat_benchmarks/LiveBench/livebench/if_runner/instruction_following_eval/instructions_util_test.py b/eval/chat_benchmarks/LiveBench/livebench/if_runner/instruction_following_eval/instructions_util_test.py
index 58907b8e..bc3875c5 100644
--- a/eval/chat_benchmarks/LiveBench/livebench/if_runner/instruction_following_eval/instructions_util_test.py
+++ b/eval/chat_benchmarks/LiveBench/livebench/if_runner/instruction_following_eval/instructions_util_test.py
@@ -21,7 +21,6 @@
 
 
 class InstructionsUtilTest(parameterized.TestCase):
-
     TEST_WORD_COUNT_CASE_1 = ("word1, word2, word3, word4.", 4)
 
     TEST_WORD_COUNT_CASE_2 = (
diff --git a/eval/chat_benchmarks/LiveBench/livebench/lcb_runner/evaluation/testing_util.py b/eval/chat_benchmarks/LiveBench/livebench/lcb_runner/evaluation/testing_util.py
index a3c7d965..bdaff329 100644
--- a/eval/chat_benchmarks/LiveBench/livebench/lcb_runner/evaluation/testing_util.py
+++ b/eval/chat_benchmarks/LiveBench/livebench/lcb_runner/evaluation/testing_util.py
@@ -116,7 +116,6 @@ def run_test(sample, test=None, debug=False, timeout=6):
             print(f"loading test code = {datetime.now().time()}")
 
         if which_type == CODE_TYPE.call_based:
-
             sol += test
             if debug:
                 print(f"sol = {sol}")
@@ -594,7 +593,6 @@ def run_test(sample, test=None, debug=False, timeout=6):
 
 
 def custom_compare_(output, ground_truth):
-
     if isinstance(output, list):
         output_1 = "\n".join(output)
         if stripped_string_compare(output_1, ground_truth):
@@ -616,7 +614,6 @@ def stripped_string_compare(s1, s2):
 
 
 def call_method(method, inputs):
-
     if isinstance(inputs, list):
         inputs = "\n".join(inputs)
 
diff --git a/eval/chat_benchmarks/LiveBench/livebench/model/completions.py b/eval/chat_benchmarks/LiveBench/livebench/model/completions.py
index 931a8cb9..949c15f5 100644
--- a/eval/chat_benchmarks/LiveBench/livebench/model/completions.py
+++ b/eval/chat_benchmarks/LiveBench/livebench/model/completions.py
@@ -53,7 +53,6 @@ def chat_completion_openai(model: "Model", conv, temperature, max_tokens, api_di
             if message["role"] == "system":
                 message["role"] = "developer"
     try:
-
         response = client.chat.completions.create(
             model=model.api_name,
             messages=messages,
diff --git a/eval/chat_benchmarks/LiveBench/livebench/process_results/coding/utils.py b/eval/chat_benchmarks/LiveBench/livebench/process_results/coding/utils.py
index aac9766b..8342611d 100644
--- a/eval/chat_benchmarks/LiveBench/livebench/process_results/coding/utils.py
+++ b/eval/chat_benchmarks/LiveBench/livebench/process_results/coding/utils.py
@@ -34,7 +34,6 @@ def __post_init__(self):
 
 
 def LCB_generation_process_results(question: dict, llm_answer: str, debug=False) -> int:
-
     llm_answer = extract_code(
         model_output=llm_answer, lmstyle=None
     )  # Missing out only on some slightly different handling for CodeLlamaInstruct from the original LiveCodeBench
diff --git a/eval/chat_benchmarks/LiveBench/livebench/process_results/data_analysis/cta/utils.py b/eval/chat_benchmarks/LiveBench/livebench/process_results/data_analysis/cta/utils.py
index 707cb8f6..6a8ac8cb 100644
--- a/eval/chat_benchmarks/LiveBench/livebench/process_results/data_analysis/cta/utils.py
+++ b/eval/chat_benchmarks/LiveBench/livebench/process_results/data_analysis/cta/utils.py
@@ -9,7 +9,6 @@ def clean_text(text):
 
 
 def cta_process_results(ground_truth: str, llm_answer: str, debug=False) -> int:
-
     parsed_answer = llm_answer
 
     if "\\boxed{" in parsed_answer:
diff --git a/eval/chat_benchmarks/LiveBench/livebench/process_results/math/AMPS_Hard/utils.py b/eval/chat_benchmarks/LiveBench/livebench/process_results/math/AMPS_Hard/utils.py
index 05cee705..b50cb6f8 100644
--- a/eval/chat_benchmarks/LiveBench/livebench/process_results/math/AMPS_Hard/utils.py
+++ b/eval/chat_benchmarks/LiveBench/livebench/process_results/math/AMPS_Hard/utils.py
@@ -158,7 +158,6 @@ def is_equiv(x1: str, x2: str) -> bool:
     x1 and x2 are normalized latex string
     """
     try:
-
         parsed_x1s = parse(x1)
         parsed_x2s = parse(x2)
 
@@ -168,7 +167,6 @@ def is_equiv(x1: str, x2: str) -> bool:
         errors = []
         for parsed_x1 in parsed_x1s:
             for parsed_x2 in parsed_x2s:
-
                 try:
                     diff = parsed_x1 - parsed_x2
                 except Exception as e:
diff --git a/eval/chat_benchmarks/LiveBench/livebench/process_results/math/math_competitions/utils.py b/eval/chat_benchmarks/LiveBench/livebench/process_results/math/math_competitions/utils.py
index 9762581f..fdedcb1e 100644
--- a/eval/chat_benchmarks/LiveBench/livebench/process_results/math/math_competitions/utils.py
+++ b/eval/chat_benchmarks/LiveBench/livebench/process_results/math/math_competitions/utils.py
@@ -46,7 +46,6 @@ def mathcontest_process_results(ground_truth: str, llm_answer: str, question_tex
 
 
 def extract_answer(statement, letter):
-
     pattern = r"\\textbf{\(([A-E])\)\s?}(.*?)(?:\\qquad|\$)"
     matches = re.findall(pattern, statement)
     answers = {match[0]: match[1].strip() for match in matches}
diff --git a/eval/chat_benchmarks/LiveBench/livebench/process_results/reasoning/spatial/utils.py b/eval/chat_benchmarks/LiveBench/livebench/process_results/reasoning/spatial/utils.py
index 902ba004..95e06866 100644
--- a/eval/chat_benchmarks/LiveBench/livebench/process_results/reasoning/spatial/utils.py
+++ b/eval/chat_benchmarks/LiveBench/livebench/process_results/reasoning/spatial/utils.py
@@ -3,7 +3,6 @@
 
 
 def spatial_process_results(ground_truth: str, llm_answer: str, debug=False) -> int:
-
     word_to_number = {
         "zero": "0",
         "one": "1",
diff --git a/eval/chat_benchmarks/LiveBench/livebench/process_results/reasoning/web_of_lies_v2/utils.py b/eval/chat_benchmarks/LiveBench/livebench/process_results/reasoning/web_of_lies_v2/utils.py
index d94f738c..9d8d085e 100644
--- a/eval/chat_benchmarks/LiveBench/livebench/process_results/reasoning/web_of_lies_v2/utils.py
+++ b/eval/chat_benchmarks/LiveBench/livebench/process_results/reasoning/web_of_lies_v2/utils.py
@@ -4,7 +4,6 @@
 
 
 def web_of_lies_process_results(ground_truth: str, llm_answer: str, debug=False) -> int:
-
     score = 0
     parsed_answer = None
     # pull out words in bold
diff --git a/eval/chat_benchmarks/LiveBench/livebench/process_results/writing/connections/utils.py b/eval/chat_benchmarks/LiveBench/livebench/process_results/writing/connections/utils.py
index 5c2af0c9..d568099f 100644
--- a/eval/chat_benchmarks/LiveBench/livebench/process_results/writing/connections/utils.py
+++ b/eval/chat_benchmarks/LiveBench/livebench/process_results/writing/connections/utils.py
@@ -13,7 +13,6 @@ def group_words(words):
 
 
 def connections_process_results_old(ground_truth: str, llm_answer: str, debug=False) -> int:
-
     # pull out words in bold
     bold_words = re.findall(r"\*\*(.*?)\*\*", llm_answer.replace("\n", ""))
 
@@ -28,7 +27,6 @@ def connections_process_results_old(ground_truth: str, llm_answer: str, debug=Fa
     ground_truth_groups = group_words(ground_truth.split(","))
     max_score = 0
     for output_groups in list(map(group_words, bold_words)):
-
         correct_groups = 0
         for ground_truth_group in ground_truth_groups:
             for output_group in output_groups:
@@ -46,7 +44,6 @@ def connections_process_results_old(ground_truth: str, llm_answer: str, debug=Fa
 
 
 def connections_process_results(ground_truth: str, llm_answer: str, debug=False) -> int:
-
     # extract text from <solution></solution> tags
     solution_matches = re.findall(r"<solution>(.*?)<\/solution>", llm_answer)
     if len(solution_matches) == 0:
diff --git a/eval/chat_benchmarks/LiveBench/livebench/process_results/writing/typos/utils.py b/eval/chat_benchmarks/LiveBench/livebench/process_results/writing/typos/utils.py
index 1522ab9e..80c80927 100644
--- a/eval/chat_benchmarks/LiveBench/livebench/process_results/writing/typos/utils.py
+++ b/eval/chat_benchmarks/LiveBench/livebench/process_results/writing/typos/utils.py
@@ -9,13 +9,11 @@ def extract_answer(llm_answer):
 
 
 def typos_process_results(ground_truth: str, llm_answer: str, debug=False) -> int:
-
     llm_answer = " ".join(list(filter(None, llm_answer.split("\n"))))
 
     llm_answer = extract_answer(llm_answer)
 
     if debug and ground_truth not in llm_answer:
-
         a = ground_truth
         b = llm_answer
         m = difflib.SequenceMatcher(a=a, b=b)
diff --git a/eval/chat_benchmarks/LiveBench/livebench/scripts/code_question_to_csv.py b/eval/chat_benchmarks/LiveBench/livebench/scripts/code_question_to_csv.py
index da95eeda..c4ae41aa 100644
--- a/eval/chat_benchmarks/LiveBench/livebench/scripts/code_question_to_csv.py
+++ b/eval/chat_benchmarks/LiveBench/livebench/scripts/code_question_to_csv.py
@@ -15,7 +15,6 @@ def jsonl_to_csv(input_filename, output_filename, task):
         open(input_filename, "r", encoding="utf-8") as jsonl_file,
         open(output_filename, "w", encoding="utf-8", newline="") as csv_file,
     ):
-
         # Define the CSV writer and write the header
         csv_writer = csv.writer(csv_file)
         header = ["question_id", "citation", "prompt"]
diff --git a/eval/chat_benchmarks/LiveBench/livebench/show_livebench_result.py b/eval/chat_benchmarks/LiveBench/livebench/show_livebench_result.py
index 53b63a57..5e1fb920 100644
--- a/eval/chat_benchmarks/LiveBench/livebench/show_livebench_result.py
+++ b/eval/chat_benchmarks/LiveBench/livebench/show_livebench_result.py
@@ -13,7 +13,6 @@
 
 
 def display_result_single(args):
-
     if args.livebench_release_option not in LIVE_BENCH_RELEASES:
         raise ValueError(f"Bad release {args.livebench_release_option}.")
     print(f"Using release {args.livebench_release_option}")
diff --git a/eval/chat_benchmarks/MBPP/human_eval/evaluation.py b/eval/chat_benchmarks/MBPP/human_eval/evaluation.py
index ba7e3475..ffe2befa 100644
--- a/eval/chat_benchmarks/MBPP/human_eval/evaluation.py
+++ b/eval/chat_benchmarks/MBPP/human_eval/evaluation.py
@@ -225,7 +225,6 @@ def evaluate_functional_correctness(
     sample_jsonl = stream_jsonl_all(input_file)
 
     with ThreadPoolExecutor(max_workers=n_workers) as executor:
-
         futures = []
         completion_id = Counter()
         n_samples = 0
diff --git a/eval/chat_benchmarks/MBPP/human_eval/execution.py b/eval/chat_benchmarks/MBPP/human_eval/execution.py
index 2898c759..46040375 100644
--- a/eval/chat_benchmarks/MBPP/human_eval/execution.py
+++ b/eval/chat_benchmarks/MBPP/human_eval/execution.py
@@ -38,7 +38,6 @@ def unsafe_execute(tmp_dir):
         random_id = random.randint(1, 100000)
         if "python" in language_type.lower():
             with create_tempdir():
-
                 # These system calls are needed when cleaning up tempdir.
                 import os
                 import shutil
@@ -493,7 +492,6 @@ def unsafe_execute(tmp_dir):
 
             # 0 means success
             if returned_val_compilation == 0:
-
                 # Execution pipeline
                 cargo_test: str = "cargo test --bin " + file_prefix + " --message-format json >> " + log_path
                 returned_val_execution = os.system(cargo_test)
diff --git a/eval/chat_benchmarks/MBPP/utils/dataset.py b/eval/chat_benchmarks/MBPP/utils/dataset.py
index 966974c5..0d8baa8f 100644
--- a/eval/chat_benchmarks/MBPP/utils/dataset.py
+++ b/eval/chat_benchmarks/MBPP/utils/dataset.py
@@ -4,7 +4,6 @@
 
 
 class MBPPDataset:
-
     def __init__(self, root, samplenum=1):
         """
         root: 数据文件的根目录
diff --git a/eval/chat_benchmarks/MBPPPlus/mbpp_plus/evaluation.py b/eval/chat_benchmarks/MBPPPlus/mbpp_plus/evaluation.py
index 148b4ea5..7d9e36fe 100644
--- a/eval/chat_benchmarks/MBPPPlus/mbpp_plus/evaluation.py
+++ b/eval/chat_benchmarks/MBPPPlus/mbpp_plus/evaluation.py
@@ -226,7 +226,6 @@ def evaluate_functional_correctness(
     sample_jsonl = stream_jsonl_all(input_file)
 
     with ThreadPoolExecutor(max_workers=n_workers) as executor:
-
         futures = []
         completion_id = Counter()
         n_samples = 0
diff --git a/eval/chat_benchmarks/MBPPPlus/mbpp_plus/execution.py b/eval/chat_benchmarks/MBPPPlus/mbpp_plus/execution.py
index 2898c759..46040375 100644
--- a/eval/chat_benchmarks/MBPPPlus/mbpp_plus/execution.py
+++ b/eval/chat_benchmarks/MBPPPlus/mbpp_plus/execution.py
@@ -38,7 +38,6 @@ def unsafe_execute(tmp_dir):
         random_id = random.randint(1, 100000)
         if "python" in language_type.lower():
             with create_tempdir():
-
                 # These system calls are needed when cleaning up tempdir.
                 import os
                 import shutil
@@ -493,7 +492,6 @@ def unsafe_execute(tmp_dir):
 
             # 0 means success
             if returned_val_compilation == 0:
-
                 # Execution pipeline
                 cargo_test: str = "cargo test --bin " + file_prefix + " --message-format json >> " + log_path
                 returned_val_execution = os.system(cargo_test)
diff --git a/eval/chat_benchmarks/MBPPPlus/utils/dataset.py b/eval/chat_benchmarks/MBPPPlus/utils/dataset.py
index 47759193..5019d1cb 100644
--- a/eval/chat_benchmarks/MBPPPlus/utils/dataset.py
+++ b/eval/chat_benchmarks/MBPPPlus/utils/dataset.py
@@ -4,7 +4,6 @@
 
 
 class MBPPPlusDataset:
-
     def __init__(self, root, sample_num=1, language="python", issft=False):
         """
         root: the path to the MBPPPlus dataset
diff --git a/eval/chat_benchmarks/MixEval/mix_eval/prompts/evaluation_prompts.py b/eval/chat_benchmarks/MixEval/mix_eval/prompts/evaluation_prompts.py
index 3d597753..c7ad3014 100644
--- a/eval/chat_benchmarks/MixEval/mix_eval/prompts/evaluation_prompts.py
+++ b/eval/chat_benchmarks/MixEval/mix_eval/prompts/evaluation_prompts.py
@@ -170,7 +170,6 @@ def construct_prompt_freeform(entry):
 
 
 if __name__ == "__main__":
-
     # mp_input = {'context': "How to check your Facebook feed", 'prompt': "Which solution is correct?", 'options': ["Log in to Facebook. Click on the bell shaped button at the top right of your Facebook home window.", "Log in to Facebook. Click on the bell shaped button at the top left of your Facebook home window."]}
     ff_input = {
         "context": "According to some sources 363 civilians were killed in Kavadarci, 230 in Negotino and 40 in Vatasha.",
diff --git a/eval/chat_benchmarks/MultiPLE/multiple/containerized_eval.py b/eval/chat_benchmarks/MultiPLE/multiple/containerized_eval.py
index 5b519340..fd26b6ef 100644
--- a/eval/chat_benchmarks/MultiPLE/multiple/containerized_eval.py
+++ b/eval/chat_benchmarks/MultiPLE/multiple/containerized_eval.py
@@ -87,7 +87,6 @@ def eval_string_script(language: str, program: str, tmpdir):
         suffix=file_ext,
         delete=True,
     ) as f:
-
         f.write(program.encode("utf-8"))
         f.flush()
         result = eval_script(Path(f.name))
diff --git a/eval/chat_benchmarks/MultiPLE/multiple/eval_java.py b/eval/chat_benchmarks/MultiPLE/multiple/eval_java.py
index 51eb64e0..601de3db 100644
--- a/eval/chat_benchmarks/MultiPLE/multiple/eval_java.py
+++ b/eval/chat_benchmarks/MultiPLE/multiple/eval_java.py
@@ -14,7 +14,6 @@
 
 
 def eval_script(path: Path):
-
     sys_env = os.environ.copy()
     javatuples_path = Path("/usr/multiple/javatuples-1.2.jar")
 
diff --git a/eval/chat_benchmarks/MultiPLE/multiple/evaluation.py b/eval/chat_benchmarks/MultiPLE/multiple/evaluation.py
index 78ed7b9f..e565e718 100644
--- a/eval/chat_benchmarks/MultiPLE/multiple/evaluation.py
+++ b/eval/chat_benchmarks/MultiPLE/multiple/evaluation.py
@@ -253,7 +253,6 @@ def evaluate_functional_correctness(
     sample_jsonl = stream_jsonl_all(input_file)
 
     with ThreadPoolExecutor(max_workers=n_workers) as executor:
-
         futures = []
         completion_id = Counter()
         n_samples = 0
diff --git a/eval/chat_benchmarks/MultiPLE/utils.py b/eval/chat_benchmarks/MultiPLE/utils.py
index 7bf38bff..0a05c8f1 100644
--- a/eval/chat_benchmarks/MultiPLE/utils.py
+++ b/eval/chat_benchmarks/MultiPLE/utils.py
@@ -86,7 +86,6 @@
 
 
 def get_function_name(question: str, lang: str):
-
     if question.startswith("<?php"):
         question = question[5:]
 
diff --git a/eval/chat_benchmarks/RepoBench/data/data/utils.py b/eval/chat_benchmarks/RepoBench/data/data/utils.py
index e5c3d926..dd298e9b 100644
--- a/eval/chat_benchmarks/RepoBench/data/data/utils.py
+++ b/eval/chat_benchmarks/RepoBench/data/data/utils.py
@@ -30,7 +30,6 @@ def construct_prompt(data: dict, language: str = "python", tokenizer=None, max_t
 
     # if we assign the tokenizer and the max_token_nums, we will truncate the cross-file prompt to meet the constraint
     if tokenizer is not None and max_token_nums is not None:
-
         cross_file_prompt_token_nums = len(tokenizer.encode(cross_file_prompt))
         in_file_prompt_token_nums = len(tokenizer.encode(in_file_prompt))
 
diff --git a/eval/chat_benchmarks/RepoBench/data/utils.py b/eval/chat_benchmarks/RepoBench/data/utils.py
index e5c3d926..dd298e9b 100644
--- a/eval/chat_benchmarks/RepoBench/data/utils.py
+++ b/eval/chat_benchmarks/RepoBench/data/utils.py
@@ -30,7 +30,6 @@ def construct_prompt(data: dict, language: str = "python", tokenizer=None, max_t
 
     # if we assign the tokenizer and the max_token_nums, we will truncate the cross-file prompt to meet the constraint
     if tokenizer is not None and max_token_nums is not None:
-
         cross_file_prompt_token_nums = len(tokenizer.encode(cross_file_prompt))
         in_file_prompt_token_nums = len(tokenizer.encode(in_file_prompt))
 
diff --git a/eval/chat_benchmarks/RepoBench/eval.py b/eval/chat_benchmarks/RepoBench/eval.py
index cb5ea3fa..f57dabfd 100644
--- a/eval/chat_benchmarks/RepoBench/eval.py
+++ b/eval/chat_benchmarks/RepoBench/eval.py
@@ -8,7 +8,6 @@ def eval(
     path="results/deepseek-coder-1.3b-base-python",
     language="python",  # to calculate codebleu, we need to specify the language
 ):
-
     total_data_points = 0
     total_em_model, total_es_model, total_cb_model = 0, 0, 0
 
@@ -22,7 +21,6 @@ def eval(
             continue
 
         with open(filepath, "r") as f:
-
             data = []
             for line in f:
                 entry = json.loads(line.strip())
diff --git a/eval/chat_benchmarks/RepoBench/run.py b/eval/chat_benchmarks/RepoBench/run.py
index df18da3d..7aab1c85 100644
--- a/eval/chat_benchmarks/RepoBench/run.py
+++ b/eval/chat_benchmarks/RepoBench/run.py
@@ -150,7 +150,6 @@ def main(
     batch_size: int = 1,
     res_dir: str = "./results",
 ):
-
     # Load the dataset
     dataset = load_dataset(dataset_name, ignore_verifications=True)
 
diff --git a/eval/chat_benchmarks/WildBench/leaderboard/data_dir/_create_tables.py b/eval/chat_benchmarks/WildBench/leaderboard/data_dir/_create_tables.py
index ccbe3c96..7fb2c3b8 100644
--- a/eval/chat_benchmarks/WildBench/leaderboard/data_dir/_create_tables.py
+++ b/eval/chat_benchmarks/WildBench/leaderboard/data_dir/_create_tables.py
@@ -25,7 +25,6 @@
 task_mapping = {}
 wb_data = load_dataset("allenai/WildBench", "v2", split="test")
 for item in wb_data:
-
     tags = [item["primary_tag"]] + item["secondary_tags"]
     task_mapping[item["id"]] = []
     for tag in tags:
diff --git a/eval/chat_benchmarks/WildBench/src/eval.py b/eval/chat_benchmarks/WildBench/src/eval.py
index 6acd21a9..7bd4b81e 100644
--- a/eval/chat_benchmarks/WildBench/src/eval.py
+++ b/eval/chat_benchmarks/WildBench/src/eval.py
@@ -310,7 +310,6 @@ def shorten(text, K=-1):
 
 
 def placeholder_generation(args, candidates, references, histories, last_queries, checklists):
-
     with open(args.eval_template) as f:
         eval_template = f.read()
         print(f"Loaded the eval_template from {args.eval_template}")
diff --git a/eval/chat_benchmarks/WildBench/src/fastchat_conversation.py b/eval/chat_benchmarks/WildBench/src/fastchat_conversation.py
index d7becfc6..715d28bd 100644
--- a/eval/chat_benchmarks/WildBench/src/fastchat_conversation.py
+++ b/eval/chat_benchmarks/WildBench/src/fastchat_conversation.py
@@ -1487,7 +1487,6 @@ def get_conv_template(name: str) -> Conversation:
 
 
 if __name__ == "__main__":
-
     print("-- Vicuna template --")
     conv = get_conv_template("vicuna_v1.1")
     conv.append_message(conv.roles[0], "Hello!")
diff --git a/eval/chat_benchmarks/WildBench/src/hf_models.py b/eval/chat_benchmarks/WildBench/src/hf_models.py
index 16654dab..2efdc141 100644
--- a/eval/chat_benchmarks/WildBench/src/hf_models.py
+++ b/eval/chat_benchmarks/WildBench/src/hf_models.py
@@ -85,7 +85,6 @@ def infer_generate(self, input_data):
 
 
 class DecoderOnlyModelManager(ModelManager):
-
     def __init__(self, model_path, model_name, cache_dir=None, bf16=False, int8=False, bnb4=False, gptq=False):
         super().__init__(model_path, model_name)
         self.cache_dir = cache_dir
@@ -192,7 +191,6 @@ def load_model(self, device_str="cuda:0"):
         print("model device:", self.model.device)
 
     def infer_generate(self, input_data, args={}, device=None, remarks=None, pure_input_data=None):
-
         if not device:
             device = self.model.device
         if type(args) is dict:
diff --git a/eval/chat_benchmarks/WildBench/src/view_wb_eval.py b/eval/chat_benchmarks/WildBench/src/view_wb_eval.py
index a93daf2f..b446132d 100644
--- a/eval/chat_benchmarks/WildBench/src/view_wb_eval.py
+++ b/eval/chat_benchmarks/WildBench/src/view_wb_eval.py
@@ -26,7 +26,6 @@
 task_mapping = {}
 wb_data = load_dataset("allenai/WildBench", "v2", split="test")
 for item in wb_data:
-
     tags = [item["primary_tag"]] + item["secondary_tags"]
     task_mapping[item["id"]] = []
     for tag in tags:
diff --git a/eval/chat_benchmarks/alpaca_eval/docs/format_export_leaderboards.py b/eval/chat_benchmarks/alpaca_eval/docs/format_export_leaderboards.py
index 9b483b36..364e595a 100644
--- a/eval/chat_benchmarks/alpaca_eval/docs/format_export_leaderboards.py
+++ b/eval/chat_benchmarks/alpaca_eval/docs/format_export_leaderboards.py
@@ -41,9 +41,9 @@
 
         file_outputs = RESULTS_DIR / informal_name / "model_outputs.json"
         if file_outputs.is_file():
-            df.loc[idx, "samples"] = (
-                f"https://github.com/tatsu-lab/alpaca_eval/blob/main/results/{informal_name}/model_outputs.json"
-            )
+            df.loc[
+                idx, "samples"
+            ] = f"https://github.com/tatsu-lab/alpaca_eval/blob/main/results/{informal_name}/model_outputs.json"
 
     # if "length_controlled_winrate" never nan then we can use it as the main metric
     if "length_controlled_winrate" in cols_to_keep and df["length_controlled_winrate"].notna().all():
diff --git a/eval/chat_benchmarks/zeroeval/src/evaluation/crux_eval.py b/eval/chat_benchmarks/zeroeval/src/evaluation/crux_eval.py
index 5fcf946f..8500fb1d 100644
--- a/eval/chat_benchmarks/zeroeval/src/evaluation/crux_eval.py
+++ b/eval/chat_benchmarks/zeroeval/src/evaluation/crux_eval.py
@@ -146,7 +146,6 @@ def gen_results(run_name_folders):
 
 
 if __name__ == "__main__":
-
     data_name = "crux"  # by default if there is no sys.argv[1]
     if len(sys.argv) > 1:
         data_name = sys.argv[1]
diff --git a/eval/chat_benchmarks/zeroeval/src/evaluation/math_eval.py b/eval/chat_benchmarks/zeroeval/src/evaluation/math_eval.py
index fbd7c0c1..4914312d 100644
--- a/eval/chat_benchmarks/zeroeval/src/evaluation/math_eval.py
+++ b/eval/chat_benchmarks/zeroeval/src/evaluation/math_eval.py
@@ -196,7 +196,6 @@ def gen_results(run_name_folders):
 
 
 if __name__ == "__main__":
-
     data_name = sys.argv[1] if len(sys.argv) > 1 else "math-l5"
     if len(sys.argv) > 1:
         data_name = sys.argv[1]
diff --git a/eval/chat_benchmarks/zeroeval/src/fastchat_conversation.py b/eval/chat_benchmarks/zeroeval/src/fastchat_conversation.py
index 49327947..1dfab968 100644
--- a/eval/chat_benchmarks/zeroeval/src/fastchat_conversation.py
+++ b/eval/chat_benchmarks/zeroeval/src/fastchat_conversation.py
@@ -1487,7 +1487,6 @@ def get_conv_template(name: str) -> Conversation:
 
 
 if __name__ == "__main__":
-
     print("-- Vicuna template --")
     conv = get_conv_template("vicuna_v1.1")
     conv.append_message(conv.roles[0], "Hello!")
diff --git a/eval/chat_benchmarks/zeroeval/src/hf_models.py b/eval/chat_benchmarks/zeroeval/src/hf_models.py
index a666440d..9fca544d 100644
--- a/eval/chat_benchmarks/zeroeval/src/hf_models.py
+++ b/eval/chat_benchmarks/zeroeval/src/hf_models.py
@@ -85,7 +85,6 @@ def infer_generate(self, input_data):
 
 
 class DecoderOnlyModelManager(ModelManager):
-
     def __init__(
         self,
         model_path,
@@ -214,7 +213,6 @@ def load_model(self, device_str="cuda:0"):
         print("model device:", self.model.device)
 
     def _adapt_with_prefix(self, input_data, pure_input_data, n=3, args=None):
-
         if self.adapt_ckpt == "fixed":
             decoded_outputs = [["The answer is: "] for _ in range(len(input_data))]
         # print(input_data_clean)
@@ -248,7 +246,6 @@ def _adapt_with_prefix(self, input_data, pure_input_data, n=3, args=None):
         return prefixes, input_data_with_prefixes
 
     def infer_generate(self, input_data, args={}, device=None, remarks=None, pure_input_data=None):
-
         if self.adapt_mode in ["prefix", "retrieve+prefix"]:
             prefixes, input_data = self._adapt_with_prefix(input_data, pure_input_data, args=args)
 
@@ -366,7 +363,6 @@ def completion_with_backoff(**kwargs):
 
 
 class OpenAIModelManager(ModelManager):
-
     def __init__(self, model_name):
         super().__init__(model_name, model_name)
 
diff --git a/eval/chat_benchmarks/zeroeval/src/unified_infer.py b/eval/chat_benchmarks/zeroeval/src/unified_infer.py
index fdd7e74e..aaed79e9 100644
--- a/eval/chat_benchmarks/zeroeval/src/unified_infer.py
+++ b/eval/chat_benchmarks/zeroeval/src/unified_infer.py
@@ -240,7 +240,6 @@ def sanitize_args(args):
     todo_inputs = model_inputs[num_skipped:]
 
     if args.engine == "vllm":
-
         sampling_params = SamplingParams(
             top_p=args.top_p,
             temperature=args.temperature,
diff --git a/eval/eval.py b/eval/eval.py
index 2e703480..b8563ee5 100644
--- a/eval/eval.py
+++ b/eval/eval.py
@@ -536,7 +536,9 @@ def add_results_metadata(results: Dict, batch_sizes_list: List[int], args: argpa
         "model": (
             args.model
             if isinstance(args.model, str)
-            else args.model.config._name_or_path if hasattr(args.model, "config") else type(args.model).__name__
+            else args.model.config._name_or_path
+            if hasattr(args.model, "config")
+            else type(args.model).__name__
         ),
         "model_args": args.model_args,
         "tasks": args.tasks,

From 25bc1227576e588788109aa4050fee597f5a6933 Mon Sep 17 00:00:00 2001
From: jean-mercat <jean.mercat@tri.global>
Date: Mon, 2 Jun 2025 18:51:18 -0700
Subject: [PATCH 4/6] hand formatted last issue

---
 eval/chat_benchmarks/HMMT/matharena/grader.py | 166 ++++++++++--------
 1 file changed, 95 insertions(+), 71 deletions(-)

diff --git a/eval/chat_benchmarks/HMMT/matharena/grader.py b/eval/chat_benchmarks/HMMT/matharena/grader.py
index aaa90c83..de454e20 100644
--- a/eval/chat_benchmarks/HMMT/matharena/grader.py
+++ b/eval/chat_benchmarks/HMMT/matharena/grader.py
@@ -13,33 +13,46 @@
 from matharena.parser import parse_grading, WarningType
 
 
-
 def similar(a, b):
     return SequenceMatcher(None, a, b).ratio() > 0.8  # Allow minor formatting differences
 
+
 def clean_string_to_json(text: str) -> str:
-    text = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL)
+    text = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL)
     text = re.sub(r"```json\n(.*?)\n```", r"\1", text, flags=re.DOTALL)
     text = text.replace("`", "")
     return text
 
+
 def format_grading_scheme(scheme, problem_id):
     formatted_str = ""
-    if scheme['problem_idx'] != problem_id:
-        raise ValueError(f'Incorrect schema given for problem {problem_id}')
+    if scheme["problem_idx"] != problem_id:
+        raise ValueError(f"Incorrect schema given for problem {problem_id}")
     total_points = 0
-    for category in scheme['grading_scheme']:
-        total_points += category['points']
-        formatted_str += f'Category: {category['title']}\nAvailable points: {category['points']}\nDescription: {category['desc']}\n\n'
-    
-    if total_points != scheme['points']:
-        raise ValueError(f'Total points in schema for problem {problem_id} totals {total_points}, but should be {scheme['points']}')
-    
+    for category in scheme["grading_scheme"]:
+        total_points += category["points"]
+        formatted_str += f"Category: {category['title']}\n"
+        formatted_str += f"Available points: {category['points']}\n"
+        formatted_str += f"Description: {category['desc']}\n\n"
+
+    if total_points != scheme["points"]:
+        raise ValueError(
+            f"Total points in schema for problem {problem_id} totals {total_points}, but should be {scheme['points']}"
+        )
+
     return formatted_str
 
-def run_grader(grader_config, solver_config_path, competition, skip_existing=False, 
-               output_folder="outputs", grading_folder="autogrades", 
-               competition_config_folder="competition_configs", autograding_config_path="configs/autograding/config.yaml"):
+
+def run_grader(
+    grader_config,
+    solver_config_path,
+    competition,
+    skip_existing=False,
+    output_folder="outputs",
+    grading_folder="autogrades",
+    competition_config_folder="competition_configs",
+    autograding_config_path="configs/autograding/config.yaml",
+):
     model = grader_config["model"]
     n = grader_config["n"]
     api = grader_config["api"]
@@ -82,7 +95,7 @@ def run_grader(grader_config, solver_config_path, competition, skip_existing=Fal
     marking_schemas = {}
 
     all_messages_per_problem = {i: [] for i in range(len(problems))}
-    all_evals_per_problem_per_solution = {i : {} for i in range(len(problems))}
+    all_evals_per_problem_per_solution = {i: {} for i in range(len(problems))}
 
     for i, problem in enumerate(problems):
         problem_id = problem["problem_idx"]
@@ -92,37 +105,43 @@ def run_grader(grader_config, solver_config_path, competition, skip_existing=Fal
             raise ValueError(f"Could not find the solutions for {problem_id} in {output_dir}")
         else:
             data_file = json.load(open(output_file))
-            problem['anon_id'] = data_file['anonymous_id']
+            problem["anon_id"] = data_file["anonymous_id"]
             messages = data_file["messages"]
             all_evals_per_problem_per_solution[i] = {i: [] for i in range(n_evals)}
-            messages = [
-                messages_one for messages_one in messages if len(messages_one[-1]["content"]) > 0
-            ]
+            messages = [messages_one for messages_one in messages if len(messages_one[-1]["content"]) > 0]
             all_messages_per_problem[i] = messages
 
         marking_schema = format_grading_scheme(problem, problem_id)
-        marking_schemas[i] = problem['grading_scheme']
+        marking_schemas[i] = problem["grading_scheme"]
 
         for j in range(n_evals):
-            auto_grading_file = os.path.join(autograder_dir,f"{problem_id}/{problem['anon_id']}_{grader_config['model'].split('/')[-1]}-{j}.json")
-            
+            auto_grading_file = os.path.join(
+                autograder_dir, f"{problem_id}/{problem['anon_id']}_{grader_config['model'].split('/')[-1]}-{j}.json"
+            )
+
             if skip_existing and os.path.exists(auto_grading_file):
                 data_file = json.load(open(auto_grading_file))
-                messages = [messages_one['raw'] for messages_one in data_file]
+                messages = [messages_one["raw"] for messages_one in data_file]
                 all_evals_per_problem_per_solution[i][j] = messages
                 if len(all_evals_per_problem_per_solution[i][j]) == n:
-                    calculate_grading_results(problem, autograder_dir, 
-                                            all_evals_per_problem_per_solution[i][j], marking_schemas[i],
-                                            i, j, grader_model_name=grader_config['model'].split('/')[-1])
+                    calculate_grading_results(
+                        problem,
+                        autograder_dir,
+                        all_evals_per_problem_per_solution[i][j],
+                        marking_schemas[i],
+                        i,
+                        j,
+                        grader_model_name=grader_config["model"].split("/")[-1],
+                    )
                 continue
             for _, message in enumerate(messages):
                 problem_statement = problem["problem"]
                 grading_prompt = prompt_template.format(
-                    problem_statement=problem_statement, 
-                    marking_schema=marking_schema, 
-                    correct_solution=problem['sample_solution'],  
-                    example_grading=problem['sample_grading'],
-                    solution=message if skip_existing and os.path.exists(auto_grading_file) else message[-1]["content"]
+                    problem_statement=problem_statement,
+                    marking_schema=marking_schema,
+                    correct_solution=problem["sample_solution"],
+                    example_grading=problem["sample_grading"],
+                    solution=message if skip_existing and os.path.exists(auto_grading_file) else message[-1]["content"],
                 )
                 batch_idx_to_problem_idx[len(batch_prompts)] = (i, j)
                 batch_prompts.append((grading_prompt, None))
@@ -131,11 +150,7 @@ def run_grader(grader_config, solver_config_path, competition, skip_existing=Fal
 
     if len(batch_prompts) == 0:
         return
-    api = APIQuery(
-        model=model, 
-        api=api,
-        **kwargs
-    )
+    api = APIQuery(model=model, api=api, **kwargs)
 
     cot_solver = CoTSolver(
         querier=api,
@@ -144,25 +159,33 @@ def run_grader(grader_config, solver_config_path, competition, skip_existing=Fal
     for idx, messages, _ in cot_solver.solve(batch_prompts):
         problem_idx, grader_idx = batch_idx_to_problem_idx[idx]
         problem = problems[problem_idx]
-        all_evals_per_problem_per_solution[problem_idx][grader_idx].append(messages[-1]['content'])
+        all_evals_per_problem_per_solution[problem_idx][grader_idx].append(messages[-1]["content"])
         # check if the whole problem is finished
         if len(all_evals_per_problem_per_solution[problem_idx][grader_idx]) == n:
-            calculate_grading_results(problem, autograder_dir, 
-                                      all_evals_per_problem_per_solution[problem_idx][grader_idx], marking_schemas[problem_idx],
-                                      problem_idx, grader_idx, grader_model_name=grader_config['model'].split('/')[-1])
-
-def calculate_grading_results(problem, output_dir, gradings_per_solution, marking_schema, 
-                              problem_idx, grader_idx, grader_model_name):
+            calculate_grading_results(
+                problem,
+                autograder_dir,
+                all_evals_per_problem_per_solution[problem_idx][grader_idx],
+                marking_schemas[problem_idx],
+                problem_idx,
+                grader_idx,
+                grader_model_name=grader_config["model"].split("/")[-1],
+            )
+
+
+def calculate_grading_results(
+    problem, output_dir, gradings_per_solution, marking_schema, problem_idx, grader_idx, grader_model_name
+):
     problem_id = problem["problem_idx"]
     anon_id = problem["anon_id"]
-    
+
     output_file = os.path.join(output_dir, f"{problem_id}/{anon_id}_{grader_model_name}-{grader_idx}.json")
-    os.makedirs(f'{output_dir}/{problem_id}', exist_ok=True)
+    os.makedirs(f"{output_dir}/{problem_id}", exist_ok=True)
 
     outputs = [{} for _ in gradings_per_solution]
 
     for i, message in enumerate(gradings_per_solution):
-        outputs[i]['raw'] = message
+        outputs[i]["raw"] = message
         warning = WarningType.NONE
         parsed_grading = {}
         try:
@@ -172,36 +195,40 @@ def calculate_grading_results(problem, output_dir, gradings_per_solution, markin
                 parsed_grading = json5.loads(clean_string_to_json(message), strict=False)
             except Exception:
                 parsed_grading = parse_grading(message)
-            if not 'points' in parsed_grading:
-                logger.error(f'Final points were not generated for grader {grader_idx} of {problem_idx}:\n {message}')
-                warning = max(warning,WarningType.MAJOR)
-            if not 'details' in parsed_grading:
-                if not 'scheme' in parsed_grading:
-                    logger.error(f'Not scoring details found for grader {grader_idx} of {problem_idx}:\n {message}')
-                    warning = max(warning,WarningType.MAJOR)
+            if not "points" in parsed_grading:
+                logger.error(f"Final points were not generated for grader {grader_idx} of {problem_idx}:\n {message}")
+                warning = max(warning, WarningType.MAJOR)
+            if not "details" in parsed_grading:
+                if not "scheme" in parsed_grading:
+                    logger.error(f"Not scoring details found for grader {grader_idx} of {problem_idx}:\n {message}")
+                    warning = max(warning, WarningType.MAJOR)
                 else:
-                    parsed_grading['details'] = parsed_grading['scheme']
-            elif len(parsed_grading['details']) != len(marking_schema):
-                logger.error(f'Mismatch between marking schema lengths')
-                warning = max(warning,WarningType.MAJOR)
+                    parsed_grading["details"] = parsed_grading["scheme"]
+            elif len(parsed_grading["details"]) != len(marking_schema):
+                logger.error(f"Mismatch between marking schema lengths")
+                warning = max(warning, WarningType.MAJOR)
             else:
-                if anon_id == 'ecddbb':
+                if anon_id == "ecddbb":
                     breakpoint()
                 final_points = 0
-                for (given, expected) in zip(parsed_grading["details"], marking_schema):
+                for given, expected in zip(parsed_grading["details"], marking_schema):
                     if not similar(given["title"], expected["title"]):
                         logger.error(f"Title mismatch: '{given['title']}' vs '{expected['title']}'")
                         warning = max(warning, WarningType.MAJOR)
                     elif given["points"] > expected["points"]:
-                        logger.warning(f"Warning: Given points ({given['points']}) exceed max allowed ({expected['points']}) for category '{given['title']}'")
+                        logger.warning(
+                            f"Warning: Given points ({given['points']}) exceed max allowed ({expected['points']}) for category '{given['title']}'"
+                        )
                         warning = max(warning, WarningType.MINOR)
                         given["points"] = expected["points"]
                     elif given["points"] < 0:
-                        logger.warning(f"Warning: Given points ({given['points']}) are negative for category '{given['title']}'")
+                        logger.warning(
+                            f"Warning: Given points ({given['points']}) are negative for category '{given['title']}'"
+                        )
                         warning = max(warning, WarningType.MINOR)
                         given["points"] = 0
 
-                    given["title"] = expected["title"] 
+                    given["title"] = expected["title"]
                     final_points += given["points"]
                 parsed_grading["points"] = final_points
 
@@ -211,17 +238,14 @@ def calculate_grading_results(problem, output_dir, gradings_per_solution, markin
             parsed_grading = {
                 "points": 0,
                 "details": [
-                    {
-                        "title": item['title'],
-                        "points": 0,
-                        "desc": "The grading could not be parsed."
-                    } for item in marking_schema
-                ]
+                    {"title": item["title"], "points": 0, "desc": "The grading could not be parsed."}
+                    for item in marking_schema
+                ],
             }
 
-        outputs[i]['warning'] = warning.value
+        outputs[i]["warning"] = warning.value
         for k in parsed_grading:
             outputs[i][k] = parsed_grading[k]
-    
+
     with open(output_file, "w") as f:
-        json.dump(outputs, f)
\ No newline at end of file
+        json.dump(outputs, f)

From 52526d1e6026f67dc98fc87af87cfd4ef553f333 Mon Sep 17 00:00:00 2001
From: jean-mercat <jean.mercat@tri.global>
Date: Tue, 3 Jun 2025 08:12:48 -0700
Subject: [PATCH 5/6] Restore the default max_tokens when None is passed

---
 eval/chat_benchmarks/LiveCodeBench/eval_instruct.py | 2 +-
 eval/chat_benchmarks/MBPP/eval_instruct.py          | 2 +-
 eval/chat_benchmarks/MTBench/eval_instruct.py       | 2 +-
 eval/chat_benchmarks/MultiPLE/eval_instruct.py      | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/eval/chat_benchmarks/LiveCodeBench/eval_instruct.py b/eval/chat_benchmarks/LiveCodeBench/eval_instruct.py
index ac07773c..5c36e5bf 100644
--- a/eval/chat_benchmarks/LiveCodeBench/eval_instruct.py
+++ b/eval/chat_benchmarks/LiveCodeBench/eval_instruct.py
@@ -66,7 +66,7 @@ def __init__(
         """
         super().__init__(logger=logger, system_instruction=system_instruction)
         self.debug = debug
-        self.max_new_tokens = max_tokens
+        self.max_new_tokens = max_tokens or 32768
         self.seed = seed
         self.n_repeat = 6
 
diff --git a/eval/chat_benchmarks/MBPP/eval_instruct.py b/eval/chat_benchmarks/MBPP/eval_instruct.py
index 882f2a69..6f559a68 100644
--- a/eval/chat_benchmarks/MBPP/eval_instruct.py
+++ b/eval/chat_benchmarks/MBPP/eval_instruct.py
@@ -45,7 +45,7 @@ def __init__(
         """
         super().__init__(logger=logger, system_instruction=system_instruction)
         self.data_dir = data_dir
-        self.max_tokens = max_tokens
+        self.max_tokens = max_tokens or 512
         self.num_examples = num_examples
         self.start_idx = start_idx
         self.end_idx = end_idx
diff --git a/eval/chat_benchmarks/MTBench/eval_instruct.py b/eval/chat_benchmarks/MTBench/eval_instruct.py
index d075c900..266d3b06 100644
--- a/eval/chat_benchmarks/MTBench/eval_instruct.py
+++ b/eval/chat_benchmarks/MTBench/eval_instruct.py
@@ -93,7 +93,7 @@ def __init__(
             print(f"Warning: Overwriting config.judge_model = {annotator_model} ")
             config.judge_model = annotator_model
         self.config = config or MTBenchConfig(judge_model=annotator_model)
-        self.config.max_new_token = max_tokens
+        self.config.max_new_token = max_tokens or 1024
         self.debug = debug
 
         # Setup paths
diff --git a/eval/chat_benchmarks/MultiPLE/eval_instruct.py b/eval/chat_benchmarks/MultiPLE/eval_instruct.py
index 744fae11..0e17afbe 100644
--- a/eval/chat_benchmarks/MultiPLE/eval_instruct.py
+++ b/eval/chat_benchmarks/MultiPLE/eval_instruct.py
@@ -115,7 +115,7 @@ def __init__(
         super().__init__(logger)
         self.languages = languages
         self.data_dir = data_dir
-        self.max_tokens = max_tokens
+        self.max_tokens = max_tokens or 1024
         self.num_workers = num_workers
         self.timeout = timeout
         self.debug = debug

From 96a38892506b0f0a5c0c3049151955ffb2cee38b Mon Sep 17 00:00:00 2001
From: jean-mercat <jean.mercat@tri.global>
Date: Tue, 3 Jun 2025 08:49:44 -0700
Subject: [PATCH 6/6] By default, don't pass None arguments to benchmark init
 (so that the benchmark's own default value is used instead of None). This
 avoids special handling of the max_tokens argument

---
 eval/chat_benchmarks/AIME24/eval_instruct.py    |  6 ++----
 eval/chat_benchmarks/AIME25/eval_instruct.py    |  2 +-
 eval/chat_benchmarks/AIW/eval_instruct.py       |  4 ++--
 eval/chat_benchmarks/AMC23/eval_instruct.py     |  4 ++--
 .../BigCodeBench/eval_instruct.py               |  4 ++--
 eval/chat_benchmarks/CodeElo/eval_instruct.py   |  6 ++----
 .../chat_benchmarks/CodeForces/eval_instruct.py |  4 ++--
 eval/chat_benchmarks/CruxEval/eval_instruct.py  |  4 ++--
 .../GPQADiamond/eval_instruct.py                |  4 ++--
 eval/chat_benchmarks/HLE/eval_instruct.py       |  6 ++----
 eval/chat_benchmarks/HMMT/eval_instruct.py      |  6 ++----
 eval/chat_benchmarks/HumanEval/eval_instruct.py |  4 ++--
 .../HumanEvalPlus/eval_instruct.py              |  4 ++--
 eval/chat_benchmarks/IFEval/eval_instruct.py    |  4 ++--
 eval/chat_benchmarks/JEEBench/eval_instruct.py  |  6 ++----
 eval/chat_benchmarks/LiveBench/eval_instruct.py |  4 ++--
 .../LiveCodeBench/eval_instruct.py              |  4 ++--
 .../LiveCodeBenchv5/eval_instruct.py            |  6 ++----
 eval/chat_benchmarks/MATH500/eval_instruct.py   |  6 ++----
 eval/chat_benchmarks/MBPP/eval_instruct.py      |  4 ++--
 eval/chat_benchmarks/MBPPPlus/eval_instruct.py  |  4 ++--
 eval/chat_benchmarks/MTBench/eval_instruct.py   |  4 ++--
 .../MTBench/fastchat/protocol/api_protocol.py   |  4 ++--
 .../fastchat/protocol/openai_api_protocol.py    |  4 ++--
 .../MTBench/fastchat/serve/api_provider.py      |  4 ++--
 .../MTBench/fastchat/serve/openai_api_server.py |  2 +-
 eval/chat_benchmarks/MultiPLE/eval_instruct.py  |  2 +-
 eval/chat_benchmarks/WildBench/eval_instruct.py |  4 ++--
 .../src/alpaca_eval/decoders/cohere.py          |  2 +-
 eval/task.py                                    | 17 ++++++++++++-----
 30 files changed, 66 insertions(+), 73 deletions(-)

diff --git a/eval/chat_benchmarks/AIME24/eval_instruct.py b/eval/chat_benchmarks/AIME24/eval_instruct.py
index 7cbe5701..2a96b88c 100644
--- a/eval/chat_benchmarks/AIME24/eval_instruct.py
+++ b/eval/chat_benchmarks/AIME24/eval_instruct.py
@@ -27,7 +27,7 @@ def __init__(
         data_file: str = "eval/chat_benchmarks/AIME24/data/aime24.json",
         debug: bool = False,
         seed: List[int] = [0, 1234, 1234, 1234],
-        max_tokens: Optional[int] = 32768,
+        max_tokens: int = 32768,
         logger: Optional[logging.Logger] = None,
         system_instruction: Optional[str] = None,
     ):
@@ -44,9 +44,7 @@ def __init__(
         super().__init__(logger=logger, system_instruction=system_instruction)
         self.data_file = data_file
         self.debug = debug
-        self.max_new_tokens = (
-            max_tokens if max_tokens is not None else 32768
-        )  # set higher to avoid truncation for reasoning models
+        self.max_new_tokens = max_tokens
         self.seed = seed
         self.n_repeat = 10
 
diff --git a/eval/chat_benchmarks/AIME25/eval_instruct.py b/eval/chat_benchmarks/AIME25/eval_instruct.py
index 08d339bd..9bfeaf02 100644
--- a/eval/chat_benchmarks/AIME25/eval_instruct.py
+++ b/eval/chat_benchmarks/AIME25/eval_instruct.py
@@ -43,7 +43,7 @@ def __init__(
         super().__init__(logger=logger, system_instruction=system_instruction)
         self.data_file = data_file
         self.debug = debug
-        self.max_new_tokens = max_tokens if max_tokens is not None else 32768
+        self.max_new_tokens = max_tokens
         self.seed = seed
         self.n_repeat = 10
 
diff --git a/eval/chat_benchmarks/AIW/eval_instruct.py b/eval/chat_benchmarks/AIW/eval_instruct.py
index 96d8b04d..c3b54c86 100644
--- a/eval/chat_benchmarks/AIW/eval_instruct.py
+++ b/eval/chat_benchmarks/AIW/eval_instruct.py
@@ -23,7 +23,7 @@ def __init__(
         data_file: str = "eval/chat_benchmarks/AIW/data/aiw_data.json",
         debug: bool = False,
         seed: List[int] = [0, 1234, 1234, 1234],
-        max_tokens: Optional[int] = 32768,
+        max_tokens: int = 32768,
         logger: Optional[logging.Logger] = None,
         system_instruction: Optional[str] = None,
         n_trials: int = 100,  # Run 100 trials
@@ -41,7 +41,7 @@ def __init__(
         super().__init__(logger=logger, system_instruction=system_instruction)
         self.data_file = data_file
         self.debug = debug
-        self.max_new_tokens = max_tokens if max_tokens is not None else 32768
+        self.max_new_tokens = max_tokens
         self.seed = seed
         self.n_trials = n_trials
 
diff --git a/eval/chat_benchmarks/AMC23/eval_instruct.py b/eval/chat_benchmarks/AMC23/eval_instruct.py
index 24f88e21..094c77b7 100644
--- a/eval/chat_benchmarks/AMC23/eval_instruct.py
+++ b/eval/chat_benchmarks/AMC23/eval_instruct.py
@@ -29,7 +29,7 @@ def __init__(
         data_file: str = "eval/chat_benchmarks/AMC23/data/amc23.json",
         debug: bool = False,
         seed: List[int] = [0, 1234, 1234, 1234],
-        max_tokens: Optional[int] = 32768,
+        max_tokens: int = 32768,
         logger: Optional[logging.Logger] = None,
         system_instruction: Optional[str] = None,
     ):
@@ -47,7 +47,7 @@ def __init__(
         self.data_file = data_file
         self.debug = debug
         self.seed = seed
-        self.max_new_tokens = max_tokens if max_tokens is not None else 32768
+        self.max_new_tokens = max_tokens
         self.n_repeat = 10
 
     def generate_responses(self, model: LM) -> Dict[str, Any]:
diff --git a/eval/chat_benchmarks/BigCodeBench/eval_instruct.py b/eval/chat_benchmarks/BigCodeBench/eval_instruct.py
index 8b0b58f4..5b56647d 100644
--- a/eval/chat_benchmarks/BigCodeBench/eval_instruct.py
+++ b/eval/chat_benchmarks/BigCodeBench/eval_instruct.py
@@ -70,7 +70,7 @@ def __init__(
         self,
         language: str = "python",
         data_dir: str = BIGCODEBENCH_PATH,
-        max_tokens: Optional[int] = 1280,
+        max_tokens: int = 1280,
         num_workers: int = 32,
         timeout: float = 120,
         debug: bool = False,
@@ -98,7 +98,7 @@ def __init__(
         self.language = language
         os.makedirs(data_dir, exist_ok=True)
         self.data_dir = data_dir
-        self.max_tokens = max_tokens if max_tokens is not None else 1280
+        self.max_tokens = max_tokens
         self.num_workers = num_workers
         self.timeout = timeout
         self.debug = debug
diff --git a/eval/chat_benchmarks/CodeElo/eval_instruct.py b/eval/chat_benchmarks/CodeElo/eval_instruct.py
index a561e836..eae33254 100644
--- a/eval/chat_benchmarks/CodeElo/eval_instruct.py
+++ b/eval/chat_benchmarks/CodeElo/eval_instruct.py
@@ -48,7 +48,7 @@ def __init__(
         self,
         debug: bool = False,
         seed: List[int] = [0, 1234, 1234, 1234],
-        max_tokens: Optional[int] = None,
+        max_tokens: int = 32768,
         logger: Optional[logging.Logger] = None,
         system_instruction: Optional[str] = None,
     ):
@@ -63,9 +63,7 @@ def __init__(
         """
         super().__init__(logger=logger, system_instruction=system_instruction)
         self.debug = debug
-        self.max_new_tokens = (
-            max_tokens if max_tokens is not None else 32768
-        )  # set higher to avoid truncation for reasoning models
+        self.max_new_tokens = max_tokens
         self.seed = seed
         self.n_repeat = 3
         self.filter_interaction_questions = True
diff --git a/eval/chat_benchmarks/CodeForces/eval_instruct.py b/eval/chat_benchmarks/CodeForces/eval_instruct.py
index 30243392..0c92392e 100644
--- a/eval/chat_benchmarks/CodeForces/eval_instruct.py
+++ b/eval/chat_benchmarks/CodeForces/eval_instruct.py
@@ -47,7 +47,7 @@ def __init__(
         self,
         debug: bool = False,
         seed: List[int] = [0, 1234, 1234, 1234],
-        max_tokens: Optional[int] = None,
+        max_tokens: int = 32768,
         logger: Optional[logging.Logger] = None,
         system_instruction: Optional[str] = None,
     ):
@@ -62,7 +62,7 @@ def __init__(
         """
         super().__init__(logger=logger, system_instruction=system_instruction)
         self.debug = debug
-        self.max_new_tokens = max_tokens if max_tokens is not None else 32768
+        self.max_new_tokens = max_tokens
         self.seed = seed
         self.n_repeat = 3
         self.filter_interaction_questions = True
diff --git a/eval/chat_benchmarks/CruxEval/eval_instruct.py b/eval/chat_benchmarks/CruxEval/eval_instruct.py
index 7580e254..4049df1f 100644
--- a/eval/chat_benchmarks/CruxEval/eval_instruct.py
+++ b/eval/chat_benchmarks/CruxEval/eval_instruct.py
@@ -132,7 +132,7 @@ class CruxEvalBenchmark(BaseBenchmark):
     def __init__(
         self,
         data_dir: str = CruxEval_PATH,
-        max_tokens: Optional[int] = 2048,
+        max_tokens: int = 2048,
         num_workers: int = 32,
         timeout: float = 120,
         debug: bool = False,
@@ -155,7 +155,7 @@ def __init__(
         self.language = "python"
         os.makedirs(data_dir, exist_ok=True)
         self.data_dir = data_dir
-        self.max_tokens = max_tokens if max_tokens is not None else 2048
+        self.max_tokens = max_tokens
         self.num_workers = num_workers
         self.timeout = timeout
         self.debug = debug
diff --git a/eval/chat_benchmarks/GPQADiamond/eval_instruct.py b/eval/chat_benchmarks/GPQADiamond/eval_instruct.py
index 46288c69..e49cbd37 100644
--- a/eval/chat_benchmarks/GPQADiamond/eval_instruct.py
+++ b/eval/chat_benchmarks/GPQADiamond/eval_instruct.py
@@ -35,7 +35,7 @@ def __init__(
         self,
         debug: bool = False,
         seed: List[int] = [0, 1234, 1234, 1234],
-        max_tokens: Optional[int] = 32768,
+        max_tokens: int = 32768,
         logger: Optional[logging.Logger] = None,
         system_instruction: Optional[str] = None,
     ):
@@ -51,7 +51,7 @@ def __init__(
         self.dataset_name = "Idavidrein/gpqa"
         self.debug = debug
         self.seed = seed
-        self.max_new_tokens = max_tokens if max_tokens is not None else 32768
+        self.max_new_tokens = max_tokens
         self.n_repeat = 3
 
     def generate_responses(self, model: LM) -> Dict[str, Any]:
diff --git a/eval/chat_benchmarks/HLE/eval_instruct.py b/eval/chat_benchmarks/HLE/eval_instruct.py
index 8df8d9a6..275668ab 100644
--- a/eval/chat_benchmarks/HLE/eval_instruct.py
+++ b/eval/chat_benchmarks/HLE/eval_instruct.py
@@ -63,7 +63,7 @@ def __init__(
         self,
         debug: bool = False,
         seed: List[int] = [0, 1234, 1234, 1234],
-        max_tokens: Optional[int] = None,
+        max_tokens: int = 32768,
         logger: Optional[logging.Logger] = None,
         system_instruction: Optional[str] = None,
     ):
@@ -77,9 +77,7 @@ def __init__(
         """
         super().__init__(logger=logger, system_instruction=system_instruction)
         self.debug = debug
-        self.max_new_tokens = (
-            max_tokens if max_tokens is not None else 32768
-        )  # set higher to avoid truncation for reasoning models
+        self.max_new_tokens = max_tokens
         self.seed = seed
         self.n_repeat = 3
 
diff --git a/eval/chat_benchmarks/HMMT/eval_instruct.py b/eval/chat_benchmarks/HMMT/eval_instruct.py
index 32b46dfb..22f76801 100644
--- a/eval/chat_benchmarks/HMMT/eval_instruct.py
+++ b/eval/chat_benchmarks/HMMT/eval_instruct.py
@@ -29,7 +29,7 @@ def __init__(
         self,
         dataset_name: str = "MathArena/hmmt_feb_2025",
         debug: bool = False,
-        max_tokens: Optional[int] = None,
+        max_tokens: int = 32768,
         seed: List[int] = [0, 1234, 1234, 1234],
         logger: Optional[logging.Logger] = None,
         system_instruction: Optional[str] = None,
@@ -47,9 +47,7 @@ def __init__(
         super().__init__(logger=logger, system_instruction=system_instruction)
         self.dataset_name = dataset_name
         self.debug = debug
-        self.max_new_tokens = (
-            max_tokens if max_tokens is not None else 32768
-        )  # set higher to avoid truncation for reasoning models
+        self.max_new_tokens = max_tokens
         self.seed = seed
         self.n_repeat = 10
 
diff --git a/eval/chat_benchmarks/HumanEval/eval_instruct.py b/eval/chat_benchmarks/HumanEval/eval_instruct.py
index fb5c54e9..9b54653f 100644
--- a/eval/chat_benchmarks/HumanEval/eval_instruct.py
+++ b/eval/chat_benchmarks/HumanEval/eval_instruct.py
@@ -22,7 +22,7 @@ def __init__(
         self,
         languages: List[str] = ["python", "sh"],
         data_dir: str = "eval/chat_benchmarks/HumanEval/data",
-        max_tokens: Optional[int] = 1024,
+        max_tokens: int = 1024,
         num_workers: int = 8,
         timeout: float = 3.0,
         debug: bool = False,
@@ -45,7 +45,7 @@ def __init__(
         super().__init__(logger=logger, system_instruction=system_instruction)
         self.languages = languages
         self.data_dir = data_dir
-        self.max_tokens = max_tokens if max_tokens is not None else 1024
+        self.max_tokens = max_tokens
         self.num_workers = num_workers
         self.timeout = timeout
         self.debug = debug
diff --git a/eval/chat_benchmarks/HumanEvalPlus/eval_instruct.py b/eval/chat_benchmarks/HumanEvalPlus/eval_instruct.py
index f63fd7b2..d12b31a7 100644
--- a/eval/chat_benchmarks/HumanEvalPlus/eval_instruct.py
+++ b/eval/chat_benchmarks/HumanEvalPlus/eval_instruct.py
@@ -22,7 +22,7 @@ def __init__(
         self,
         languages: List[str] = ["python"],
         data_dir: str = "eval/chat_benchmarks/HumanEvalPlus/data",
-        max_tokens: Optional[int] = 1024,
+        max_tokens: int = 1024,
         num_workers: int = 8,
         timeout: float = 3.0,
         debug: bool = False,
@@ -45,7 +45,7 @@ def __init__(
         super().__init__(logger=logger, system_instruction=system_instruction)
         self.languages = languages
         self.data_dir = data_dir
-        self.max_tokens = max_tokens if max_tokens is not None else 1024
+        self.max_tokens = max_tokens
         self.num_workers = num_workers
         self.timeout = timeout
         self.debug = debug
diff --git a/eval/chat_benchmarks/IFEval/eval_instruct.py b/eval/chat_benchmarks/IFEval/eval_instruct.py
index 79be036f..907a8019 100644
--- a/eval/chat_benchmarks/IFEval/eval_instruct.py
+++ b/eval/chat_benchmarks/IFEval/eval_instruct.py
@@ -18,7 +18,7 @@ def __init__(
         start_idx: int = 10,
         end_idx: int = 510,
         debug: bool = False,
-        max_tokens: Optional[int] = 512,
+        max_tokens: int = 512,
         logger: Optional[logging.Logger] = None,
         system_instruction: Optional[str] = None,
     ):
@@ -37,7 +37,7 @@ def __init__(
         """
         super().__init__(logger=logger, system_instruction=system_instruction)
         self.data_dir = data_dir
-        self.max_tokens = max_tokens if max_tokens is not None else 512
+        self.max_tokens = max_tokens
         self.num_examples = num_examples
         self.start_idx = start_idx
         self.end_idx = end_idx
diff --git a/eval/chat_benchmarks/JEEBench/eval_instruct.py b/eval/chat_benchmarks/JEEBench/eval_instruct.py
index 5ba53541..05881a19 100644
--- a/eval/chat_benchmarks/JEEBench/eval_instruct.py
+++ b/eval/chat_benchmarks/JEEBench/eval_instruct.py
@@ -78,7 +78,7 @@ def __init__(
         self,
         debug: bool = False,
         seed: List[int] = [0, 1234, 1234, 1234],
-        max_tokens: Optional[int] = 32768,
+        max_tokens: int = 32768,
         logger: Optional[logging.Logger] = None,
         system_instruction: Optional[str] = None,
     ):
@@ -92,9 +92,7 @@ def __init__(
         """
         super().__init__(logger=logger, system_instruction=system_instruction)
         self.debug = debug
-        self.max_new_tokens = (
-            max_tokens if max_tokens is not None else 32768
-        )  # set higher to avoid truncation for reasoning models
+        self.max_new_tokens = max_tokens
         self.seed = seed
         self.n_repeat = 3
 
diff --git a/eval/chat_benchmarks/LiveBench/eval_instruct.py b/eval/chat_benchmarks/LiveBench/eval_instruct.py
index 3e4e0339..760b4a2b 100644
--- a/eval/chat_benchmarks/LiveBench/eval_instruct.py
+++ b/eval/chat_benchmarks/LiveBench/eval_instruct.py
@@ -47,7 +47,7 @@ def __init__(
         release_date: str = "2024-08-31",
         remove_existing_file: bool = True,
         seed: List[int] = [0, 1234, 1234, 1234],
-        max_tokens: Optional[int] = 4096,
+        max_tokens: int = 4096,
         logger: Optional[logging.Logger] = None,
         system_instruction: Optional[str] = None,
     ):
@@ -72,7 +72,7 @@ def __init__(
             self.release_date = "2024-06-24"
             self.num_workers = 1
         else:
-            self.max_tokens = max_tokens if max_tokens is not None else 4096
+            self.max_tokens = max_tokens
         self.temperature = temperature
         self.num_choices = num_choices
         self.all_release_dates = ["2024-07-26", "2024-06-24", "2024-08-31", "2024-11-25"]
diff --git a/eval/chat_benchmarks/LiveCodeBench/eval_instruct.py b/eval/chat_benchmarks/LiveCodeBench/eval_instruct.py
index 5c36e5bf..19dcec42 100644
--- a/eval/chat_benchmarks/LiveCodeBench/eval_instruct.py
+++ b/eval/chat_benchmarks/LiveCodeBench/eval_instruct.py
@@ -51,7 +51,7 @@ def __init__(
         self,
         debug: bool = False,
         seed: List[int] = [0, 1234, 1234, 1234],
-        max_tokens: Optional[int] = 32768,
+        max_tokens: int = 32768,
         logger: Optional[logging.Logger] = None,
         system_instruction: Optional[str] = None,
     ):
@@ -66,7 +66,7 @@ def __init__(
         """
         super().__init__(logger=logger, system_instruction=system_instruction)
         self.debug = debug
-        self.max_new_tokens = max_tokens or 32768
+        self.max_new_tokens = max_tokens
         self.seed = seed
         self.n_repeat = 6
 
diff --git a/eval/chat_benchmarks/LiveCodeBenchv5/eval_instruct.py b/eval/chat_benchmarks/LiveCodeBenchv5/eval_instruct.py
index 9ae5ee56..e1cc5c75 100644
--- a/eval/chat_benchmarks/LiveCodeBenchv5/eval_instruct.py
+++ b/eval/chat_benchmarks/LiveCodeBenchv5/eval_instruct.py
@@ -47,7 +47,7 @@ def __init__(
         self,
         debug: bool = False,
         seed: List[int] = [0, 1234, 1234, 1234],
-        max_tokens: Optional[int] = None,
+        max_tokens: int = 32768,
         logger: Optional[logging.Logger] = None,
         system_instruction: Optional[str] = None,
     ):
@@ -62,9 +62,7 @@ def __init__(
         """
         super().__init__(logger=logger, system_instruction=system_instruction)
         self.debug = debug
-        self.max_new_tokens = (
-            max_tokens if max_tokens is not None else 32768
-        )  # set higher to avoid truncation for reasoning models
+        self.max_new_tokens = max_tokens
         self.seed = seed
         self.n_repeat = 3
 
diff --git a/eval/chat_benchmarks/MATH500/eval_instruct.py b/eval/chat_benchmarks/MATH500/eval_instruct.py
index f082ce0f..eec964f0 100644
--- a/eval/chat_benchmarks/MATH500/eval_instruct.py
+++ b/eval/chat_benchmarks/MATH500/eval_instruct.py
@@ -27,7 +27,7 @@ def __init__(
         data_file: str = "eval/chat_benchmarks/MATH500/data/math500.jsonl",
         debug: bool = False,
         seed: List[int] = [0, 1234, 1234, 1234],
-        max_tokens: Optional[int] = 32768,
+        max_tokens: int = 32768,
         logger: Optional[logging.Logger] = None,
         system_instruction: Optional[str] = None,
     ):
@@ -45,9 +45,7 @@ def __init__(
         self.data_file = data_file
         self.debug = debug
         self.seed = seed
-        self.max_new_tokens = (
-            max_tokens if max_tokens is not None else 32768
-        )  # set higher to avoid truncation for reasoning models
+        self.max_new_tokens = max_tokens
 
     def generate_responses(self, model: LM) -> Dict[str, Any]:
         """
diff --git a/eval/chat_benchmarks/MBPP/eval_instruct.py b/eval/chat_benchmarks/MBPP/eval_instruct.py
index 6f559a68..0fcb8a3b 100644
--- a/eval/chat_benchmarks/MBPP/eval_instruct.py
+++ b/eval/chat_benchmarks/MBPP/eval_instruct.py
@@ -26,7 +26,7 @@ def __init__(
         start_idx: int = 10,
         end_idx: int = 510,
         debug: bool = False,
-        max_tokens: Optional[int] = 512,
+        max_tokens: int = 512,
         logger: Optional[logging.Logger] = None,
         system_instruction: Optional[str] = None,
     ):
@@ -45,7 +45,7 @@ def __init__(
         """
         super().__init__(logger=logger, system_instruction=system_instruction)
         self.data_dir = data_dir
-        self.max_tokens = max_tokens or 512
+        self.max_tokens = max_tokens
         self.num_examples = num_examples
         self.start_idx = start_idx
         self.end_idx = end_idx
diff --git a/eval/chat_benchmarks/MBPPPlus/eval_instruct.py b/eval/chat_benchmarks/MBPPPlus/eval_instruct.py
index d56d689b..6094bc17 100644
--- a/eval/chat_benchmarks/MBPPPlus/eval_instruct.py
+++ b/eval/chat_benchmarks/MBPPPlus/eval_instruct.py
@@ -25,7 +25,7 @@ def __init__(
         num_workers: int = 8,
         timeout: float = 3.0,
         debug: bool = False,
-        max_tokens: Optional[int] = 1024,
+        max_tokens: int = 1024,
         logger: Optional[logging.Logger] = None,
         system_instruction: Optional[str] = None,
     ):
@@ -43,7 +43,7 @@ def __init__(
         """
         super().__init__(logger=logger, system_instruction=system_instruction)
         self.data_dir = data_dir
-        self.max_tokens = max_tokens if max_tokens is not None else 1024
+        self.max_tokens = max_tokens
         self.num_workers = num_workers
         self.timeout = timeout
         self.debug = debug
diff --git a/eval/chat_benchmarks/MTBench/eval_instruct.py b/eval/chat_benchmarks/MTBench/eval_instruct.py
index 266d3b06..9a1e4f79 100644
--- a/eval/chat_benchmarks/MTBench/eval_instruct.py
+++ b/eval/chat_benchmarks/MTBench/eval_instruct.py
@@ -71,7 +71,7 @@ def __init__(
         config: Optional[MTBenchConfig] = None,
         debug: bool = False,
         annotator_model: str = "gpt-4o-mini-2024-07-18",
-        max_tokens: Optional[int] = 1024,
+        max_tokens: int = 1024,
         logger: Optional[logging.Logger] = None,
         system_instruction: Optional[str] = None,
     ):
@@ -93,7 +93,7 @@ def __init__(
             print(f"Warning: Overwriting config.judge_model = {annotator_model} ")
             config.judge_model = annotator_model
         self.config = config or MTBenchConfig(judge_model=annotator_model)
-        self.config.max_new_token = max_tokens or 1024
+        self.config.max_new_token = max_tokens
         self.debug = debug
 
         # Setup paths
diff --git a/eval/chat_benchmarks/MTBench/fastchat/protocol/api_protocol.py b/eval/chat_benchmarks/MTBench/fastchat/protocol/api_protocol.py
index 2dc99449..589e78cc 100644
--- a/eval/chat_benchmarks/MTBench/fastchat/protocol/api_protocol.py
+++ b/eval/chat_benchmarks/MTBench/fastchat/protocol/api_protocol.py
@@ -55,7 +55,7 @@ class APIChatCompletionRequest(BaseModel):
     top_p: Optional[float] = 1.0
     top_k: Optional[int] = -1
     n: Optional[int] = 1
-    max_tokens: Optional[int] = None
+    max_tokens: int = 1024
     stop: Optional[Union[str, List[str]]] = None
     stream: Optional[bool] = False
     user: Optional[str] = None
@@ -129,7 +129,7 @@ class CompletionRequest(BaseModel):
     suffix: Optional[str] = None
     temperature: Optional[float] = 0.7
     n: Optional[int] = 1
-    max_tokens: Optional[int] = 16
+    max_tokens: int = 16
     stop: Optional[Union[str, List[str]]] = None
     stream: Optional[bool] = False
     top_p: Optional[float] = 1.0
diff --git a/eval/chat_benchmarks/MTBench/fastchat/protocol/openai_api_protocol.py b/eval/chat_benchmarks/MTBench/fastchat/protocol/openai_api_protocol.py
index bb50a5ef..38713ed5 100644
--- a/eval/chat_benchmarks/MTBench/fastchat/protocol/openai_api_protocol.py
+++ b/eval/chat_benchmarks/MTBench/fastchat/protocol/openai_api_protocol.py
@@ -66,7 +66,7 @@ class ChatCompletionRequest(BaseModel):
     top_p: Optional[float] = 1.0
     top_k: Optional[int] = -1
     n: Optional[int] = 1
-    max_tokens: Optional[int] = None
+    max_tokens: int = 1024
     stop: Optional[Union[str, List[str]]] = None
     stream: Optional[bool] = False
     presence_penalty: Optional[float] = 0.0
@@ -154,7 +154,7 @@ class CompletionRequest(BaseModel):
     suffix: Optional[str] = None
     temperature: Optional[float] = 0.7
     n: Optional[int] = 1
-    max_tokens: Optional[int] = 16
+    max_tokens: int = 16
     stop: Optional[Union[str, List[str]]] = None
     stream: Optional[bool] = False
     top_p: Optional[float] = 1.0
diff --git a/eval/chat_benchmarks/MTBench/fastchat/serve/api_provider.py b/eval/chat_benchmarks/MTBench/fastchat/serve/api_provider.py
index ebf7f25a..07184521 100644
--- a/eval/chat_benchmarks/MTBench/fastchat/serve/api_provider.py
+++ b/eval/chat_benchmarks/MTBench/fastchat/serve/api_provider.py
@@ -957,7 +957,7 @@ def cohere_api_stream_iter(
     messages: list,
     temperature: Optional[float] = None,  # The SDK or API handles None for all parameters following
     top_p: Optional[float] = None,
-    max_new_tokens: Optional[int] = None,
+    max_new_tokens: int = 1024,
     api_key: Optional[str] = None,  # default is env var CO_API_KEY
     api_base: Optional[str] = None,
 ):
@@ -1084,7 +1084,7 @@ def reka_api_stream_iter(
     messages: list,
     temperature: Optional[float] = None,  # The SDK or API handles None for all parameters following
     top_p: Optional[float] = None,
-    max_new_tokens: Optional[int] = None,
+    max_new_tokens: int = 1024,
     api_key: Optional[str] = None,  # default is env var CO_API_KEY
     api_base: Optional[str] = None,
 ):
diff --git a/eval/chat_benchmarks/MTBench/fastchat/serve/openai_api_server.py b/eval/chat_benchmarks/MTBench/fastchat/serve/openai_api_server.py
index 86e63cd1..9b5e58d5 100644
--- a/eval/chat_benchmarks/MTBench/fastchat/serve/openai_api_server.py
+++ b/eval/chat_benchmarks/MTBench/fastchat/serve/openai_api_server.py
@@ -266,7 +266,7 @@ async def get_gen_params(
     top_k: Optional[int],
     presence_penalty: Optional[float],
     frequency_penalty: Optional[float],
-    max_tokens: Optional[int],
+    max_tokens: int,
     echo: Optional[bool],
     logprobs: Optional[int] = None,
     stop: Optional[Union[str, List[str]]],
diff --git a/eval/chat_benchmarks/MultiPLE/eval_instruct.py b/eval/chat_benchmarks/MultiPLE/eval_instruct.py
index 0e17afbe..744fae11 100644
--- a/eval/chat_benchmarks/MultiPLE/eval_instruct.py
+++ b/eval/chat_benchmarks/MultiPLE/eval_instruct.py
@@ -115,7 +115,7 @@ def __init__(
         super().__init__(logger)
         self.languages = languages
         self.data_dir = data_dir
-        self.max_tokens = max_tokens or 1024
+        self.max_tokens = max_tokens
         self.num_workers = num_workers
         self.timeout = timeout
         self.debug = debug
diff --git a/eval/chat_benchmarks/WildBench/eval_instruct.py b/eval/chat_benchmarks/WildBench/eval_instruct.py
index 60c8384e..cfe73a36 100644
--- a/eval/chat_benchmarks/WildBench/eval_instruct.py
+++ b/eval/chat_benchmarks/WildBench/eval_instruct.py
@@ -76,7 +76,7 @@ def __init__(
         config: Optional[WildBenchConfig] = None,
         annotator_model: str = "gpt-4o-mini-2024-07-18",
         debug: bool = False,
-        max_tokens: Optional[int] = 1024,
+        max_tokens: int = 1024,
         logger: Optional[logging.Logger] = None,
         system_instruction: Optional[str] = None,
     ):
@@ -96,7 +96,7 @@ def __init__(
             config.model = annotator_model
         self.config = config or WildBenchConfig(model=annotator_model)
         self.debug = debug
-        self.max_new_tokens = max_tokens if max_tokens is not None else 1024
+        self.max_new_tokens = max_tokens
 
         # Task category mapping
         self.task_group_mapping = {
diff --git a/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/decoders/cohere.py b/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/decoders/cohere.py
index c26c822d..dfe60aac 100644
--- a/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/decoders/cohere.py
+++ b/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/decoders/cohere.py
@@ -73,7 +73,7 @@ def cohere_completions(
 def _cohere_completion_helper(
     prompt: str,
     cohere_api_keys: Optional[Sequence[str]] = (constants.COHERE_API_KEY,),
-    max_tokens: Optional[int] = 1000,
+    max_tokens: int = 1000,
     temperature: Optional[float] = 0.7,
     max_tries=5,
     **kwargs,
diff --git a/eval/task.py b/eval/task.py
index 42e77824..70962115 100644
--- a/eval/task.py
+++ b/eval/task.py
@@ -234,13 +234,20 @@ def _register_benchmark(self, name: str, benchmark_class: Type[BaseBenchmark]):
             valid_kwargs = {}
 
             # Only pass kwargs that the benchmark's __init__ accepts
+            # Filter out None values to let benchmarks use their default values
             for param_name, param in init_params.items():
                 if param_name in self.benchmark_kwargs:
-                    valid_kwargs[param_name] = self.benchmark_kwargs[param_name]
-                    self.logger.debug(f"Passing {param_name} to {name} benchmark")
-
-            # Ensure system_instruction is passed if available
-            if "system_instruction" in self.benchmark_kwargs:
+                    value = self.benchmark_kwargs[param_name]
+                    # Only pass the argument if it's not None, so benchmarks can use defaults
+                    if value is not None:
+                        valid_kwargs[param_name] = value
+                        self.logger.debug(f"Passing {param_name}={value} to {name} benchmark")
+
+            # Ensure system_instruction is passed if available and not None
+            if (
+                "system_instruction" in self.benchmark_kwargs
+                and self.benchmark_kwargs["system_instruction"] is not None
+            ):
                 valid_kwargs["system_instruction"] = self.benchmark_kwargs["system_instruction"]
 
             instance = benchmark_class(**valid_kwargs)