diff --git a/eval/chat_benchmarks/AIME24/eval_instruct.py b/eval/chat_benchmarks/AIME24/eval_instruct.py
index fda951b9..41b7ea5d 100644
--- a/eval/chat_benchmarks/AIME24/eval_instruct.py
+++ b/eval/chat_benchmarks/AIME24/eval_instruct.py
@@ -61,9 +61,8 @@ def generate_responses(self, model: LM) -> Dict[str, Any]:
         examples = self.load_questions()
         # Prepare instances for model
         all_outputs = []
-
+        all_instances = []
         for i in range(self.n_repeat):
-            all_instances = []
             seed = [s + i for s in self.seed]
 
             for idx, example in enumerate(examples):
@@ -98,10 +97,13 @@ def generate_responses(self, model: LM) -> Dict[str, Any]:
                 all_instances.append(instance)
 
-            # Generate model responses
-            self.logger.info("Generating responses for AIME24...")
-            outputs = self.compute(model, all_instances)
-            all_outputs.append(outputs)
+        # Generate model responses
+        self.logger.info("Generating responses for AIME24...")
+        all_outputs = self.compute(model, all_instances)
+
+        all_outputs = [
+            all_outputs[i: i + len(examples)] for i in range(0, len(examples) * self.n_repeat, len(examples))
+        ]
 
         # Return None early for non-primary ranks
         if model.rank != 0:
             return None
diff --git a/eval/chat_benchmarks/AIME25/eval_instruct.py b/eval/chat_benchmarks/AIME25/eval_instruct.py
index 9ed92bc6..58cbddcd 100644
--- a/eval/chat_benchmarks/AIME25/eval_instruct.py
+++ b/eval/chat_benchmarks/AIME25/eval_instruct.py
@@ -60,9 +60,8 @@ def generate_responses(self, model: LM) -> Dict[str, Any]:
         examples = self.load_questions()
         # Prepare instances for model
         all_outputs = []
-
+        all_instances = []
         for i in range(self.n_repeat):
-            all_instances = []
             seed = [s + i for s in self.seed]
 
             for idx, example in enumerate(examples):
@@ -97,10 +96,13 @@ def generate_responses(self, model: LM) -> Dict[str, Any]:
                 all_instances.append(instance)
 
-            # Generate model responses
-            self.logger.info("Generating responses for AIME25...")
-            outputs = self.compute(model, all_instances)
-            all_outputs.append(outputs)
+        # Generate model responses
+        self.logger.info("Generating responses for AIME25...")
+        all_outputs = self.compute(model, all_instances)
+
+        all_outputs = [
+            all_outputs[i: i + len(examples)] for i in range(0, len(examples) * self.n_repeat, len(examples))
+        ]
 
         # Return None early for non-primary ranks
         if model.rank != 0:
             return None
diff --git a/eval/chat_benchmarks/AMC23/eval_instruct.py b/eval/chat_benchmarks/AMC23/eval_instruct.py
index 0585b435..0ba076a0 100644
--- a/eval/chat_benchmarks/AMC23/eval_instruct.py
+++ b/eval/chat_benchmarks/AMC23/eval_instruct.py
@@ -72,9 +72,9 @@ def generate_responses(self, model: LM) -> Dict[str, Any]:
         model_name = model.model_args["model"]
         all_outputs = []
+        all_instances = []
         for i in range(self.n_repeat):
             seed = [s + i for s in self.seed]
-            all_instances = []
             for idx, example in enumerate(examples):
                 messages = [
                     {"role": "user", "content": PROMPT.format(problem=example["question"])},
                 ]
@@ -107,11 +107,13 @@ def generate_responses(self, model: LM) -> Dict[str, Any]:
 
                 all_instances.append(instance)
 
-            # Generate model responses
-            self.logger.info("Generating responses for AMC23...")
-            outputs = self.compute(model, all_instances)
-            all_outputs.append(outputs)
+        # Generate model responses
+        self.logger.info("Generating responses for AMC23...")
+        all_outputs = self.compute(model, all_instances)
+        all_outputs = [
+            all_outputs[i: i + len(examples)] for i in range(0, len(examples) * self.n_repeat, len(examples))
+        ]
 
         # Return None early for non-primary ranks
         if model.rank != 0:
             return None
diff --git a/eval/chat_benchmarks/CodeElo/eval_instruct.py b/eval/chat_benchmarks/CodeElo/eval_instruct.py
index 7f2ca4ef..9526e931 100644
--- a/eval/chat_benchmarks/CodeElo/eval_instruct.py
+++ b/eval/chat_benchmarks/CodeElo/eval_instruct.py
@@ -130,9 +130,8 @@ def make_html_problem(problem):
             return html_output
 
         instruction = """"You are a coding expert. Given a competition-level coding problem, you need to write a Python program to solve it. You may start by outlining your thought process. In the end, please provide the complete code in a code block enclosed with ``` ```. The code should take stdin as input and print the output."""
-
+        all_instances = []
         for i in range(self.n_repeat):
-            all_instances = []
             seed = [s + i for s in self.seed]
 
             for idx, example in enumerate(examples):
@@ -158,10 +157,13 @@ def make_html_problem(problem):
                 instance.repeat_idx = i
                 all_instances.append(instance)
 
-            # Generate model responses
-            self.logger.info("Generating responses for CodeElo...")
-            outputs = self.compute(model, all_instances)
-            all_outputs.append(outputs)
+        # Generate model responses
+        self.logger.info("Generating responses for CodeElo...")
+        all_outputs = self.compute(model, all_instances)
+
+        all_outputs = [
+            all_outputs[i: i + len(examples)] for i in range(0, len(examples) * self.n_repeat, len(examples))
+        ]
 
         # Return None early for non-primary ranks
         if model.rank != 0:
diff --git a/eval/chat_benchmarks/CodeForces/eval_instruct.py b/eval/chat_benchmarks/CodeForces/eval_instruct.py
index e355e6db..12839e8a 100644
--- a/eval/chat_benchmarks/CodeForces/eval_instruct.py
+++ b/eval/chat_benchmarks/CodeForces/eval_instruct.py
@@ -123,8 +123,8 @@ def make_html_problem(problem):
         instruction = """You are a coding expert. Given a competition-level coding problem, you need to write a Python program to solve it. You may start by outlining your thought process. In the end, please provide the complete code in a code block enclosed with ``` ```. The code should take stdin as input and print the output. Your program should be a Python function generated from the given prompt.
 Simply call the function after the definition."""
 
+        all_instances = []
         for i in range(self.n_repeat):
-            all_instances = []
             seed = [s + i for s in self.seed]
 
             for idx, example in enumerate(examples):
@@ -150,10 +150,13 @@ def make_html_problem(problem):
                 instance.repeat_idx = i
                 all_instances.append(instance)
 
-            # Generate model responses
-            self.logger.info("Generating responses for CodeForces...")
-            outputs = self.compute(model, all_instances)
-            all_outputs.append(outputs)
+        # Generate model responses
+        self.logger.info("Generating responses for CodeForces...")
+        all_outputs = self.compute(model, all_instances)
+
+        all_outputs = [
+            all_outputs[i: i + len(examples)] for i in range(0, len(examples) * self.n_repeat, len(examples))
+        ]
 
         # Return None early for non-primary ranks
         if model.rank != 0:
diff --git a/eval/chat_benchmarks/GPQADiamond/eval_instruct.py b/eval/chat_benchmarks/GPQADiamond/eval_instruct.py
index 9548ab9e..8c624ab6 100644
--- a/eval/chat_benchmarks/GPQADiamond/eval_instruct.py
+++ b/eval/chat_benchmarks/GPQADiamond/eval_instruct.py
@@ -77,9 +77,8 @@ def generate_responses(self, model: LM) -> Dict[str, Any]:
         model_name = model.model_args["model"]
 
         all_outputs = []
-
+        all_instances = []
         for i in range(self.n_repeat):
-            all_instances = []
             seed = [s + i for s in self.seed]
 
             for idx, example in enumerate(examples):
@@ -111,10 +110,13 @@ def generate_responses(self, model: LM) -> Dict[str, Any]:
                 instance.repeat_idx = i
                 all_instances.append(instance)
 
-            # Generate model responses
-            self.logger.info("Generating responses for GPQADiamond...")
-            outputs = self.compute(model, all_instances)
-            all_outputs.append(outputs)
+        # Generate model responses
+        self.logger.info("Generating responses for GPQADiamond...")
+        all_outputs = self.compute(model, all_instances)
+
+        all_outputs = [
+            all_outputs[i: i + len(examples)] for i in range(0, len(examples) * self.n_repeat, len(examples))
+        ]
 
         # Return None early for non-primary ranks
         if model.rank != 0:
diff --git a/eval/chat_benchmarks/HLE/eval_instruct.py b/eval/chat_benchmarks/HLE/eval_instruct.py
index adcb6e58..f7fb13e1 100644
--- a/eval/chat_benchmarks/HLE/eval_instruct.py
+++ b/eval/chat_benchmarks/HLE/eval_instruct.py
@@ -98,9 +98,8 @@ def generate_responses(self, model: LM) -> Dict[str, Any]:
 
         # Prepare instances for model
         all_outputs = []
-
+        all_instances = []
         for i in range(self.n_repeat):
-            all_instances = []
             seed = [s + i for s in self.seed]
 
             for idx, example in enumerate(examples):
@@ -133,10 +132,13 @@ def generate_responses(self, model: LM) -> Dict[str, Any]:
                 all_instances.append(instance)
 
-            # Generate model responses
-            self.logger.info("Generating responses for HLE...")
-            outputs = self.compute(model, all_instances)
-            all_outputs.append(outputs)
+        # Generate model responses
+        self.logger.info("Generating responses for HLE...")
+        all_outputs = self.compute(model, all_instances)
+
+        all_outputs = [
+            all_outputs[i: i + len(examples)] for i in range(0, len(examples) * self.n_repeat, len(examples))
+        ]
 
         # Return None early for non-primary ranks
         if model.rank != 0:
             return None
diff --git a/eval/chat_benchmarks/HMMT/eval_instruct.py b/eval/chat_benchmarks/HMMT/eval_instruct.py
index b0175bed..b08346ab 100644
--- a/eval/chat_benchmarks/HMMT/eval_instruct.py
+++ b/eval/chat_benchmarks/HMMT/eval_instruct.py
@@ -64,9 +64,8 @@ def generate_responses(self, model: LM) -> Dict[str, Any]:
         examples = self.load_questions()
         # Prepare instances for model
         all_outputs = []
-
+        all_instances = []
         for i in range(self.n_repeat):
-            all_instances = []
             seed = [s + i for s in self.seed]
 
             for idx, example in enumerate(examples):
@@ -101,10 +100,13 @@ def generate_responses(self, model: LM) -> Dict[str, Any]:
                 all_instances.append(instance)
 
-            # Generate model responses
-            self.logger.info("Generating responses for HMMT...")
-            outputs = self.compute(model, all_instances)
-            all_outputs.append(outputs)
+        # Generate model responses
+        self.logger.info("Generating responses for HMMT...")
+        all_outputs = self.compute(model, all_instances)
+
+        all_outputs = [
+            all_outputs[i: i + len(examples)] for i in range(0, len(examples) * self.n_repeat, len(examples))
+        ]
 
         # Return None early for non-primary ranks
         if model.rank != 0:
             return None
diff --git a/eval/chat_benchmarks/JEEBench/eval_instruct.py b/eval/chat_benchmarks/JEEBench/eval_instruct.py
index 2094acaf..eef52a54 100644
--- a/eval/chat_benchmarks/JEEBench/eval_instruct.py
+++ b/eval/chat_benchmarks/JEEBench/eval_instruct.py
@@ -118,9 +118,8 @@ def generate_responses(self, model: LM) -> Dict[str, Any]:
 
         # Prepare instances for model
         all_outputs = []
-
+        all_instances = []
         for i in range(self.n_repeat):
-            all_instances = []
             seed = [s + i for s in self.seed]
 
             for idx, example in enumerate(examples):
@@ -154,10 +153,13 @@ def generate_responses(self, model: LM) -> Dict[str, Any]:
                 all_instances.append(instance)
 
-            # Generate model responses
-            self.logger.info("Generating responses for JEEBench...")
-            outputs = self.compute(model, all_instances)
-            all_outputs.append(outputs)
+        # Generate model responses
+        self.logger.info("Generating responses for JEEBench...")
+        all_outputs = self.compute(model, all_instances)
+
+        all_outputs = [
+            all_outputs[i: i + len(examples)] for i in range(0, len(examples) * self.n_repeat, len(examples))
+        ]
 
         # Return None early for non-primary ranks
         if model.rank != 0:
             return None
diff --git a/eval/chat_benchmarks/LiveCodeBench/eval_instruct.py b/eval/chat_benchmarks/LiveCodeBench/eval_instruct.py
index e5c7255e..5aa4738d 100644
--- a/eval/chat_benchmarks/LiveCodeBench/eval_instruct.py
+++ b/eval/chat_benchmarks/LiveCodeBench/eval_instruct.py
@@ -85,9 +85,8 @@ def generate_responses(self, model: LM) -> Dict[str, Any]:
         examples = examples[:10]
 
         all_outputs = []
-
+        all_instances = []
         for i in range(self.n_repeat):
-            all_instances = []
             seed = [s + i for s in self.seed]
 
             for idx, example in enumerate(examples):
@@ -122,10 +121,13 @@ def generate_responses(self, model: LM) -> Dict[str, Any]:
                 instance.repeat_idx = i
                 all_instances.append(instance)
 
-            # Generate model responses
-            self.logger.info("Generating responses for LiveCodeBench...")
-            outputs = self.compute(model, all_instances)
-            all_outputs.append(outputs)
+        # Generate model responses
+        self.logger.info("Generating responses for LiveCodeBench...")
+        all_outputs = self.compute(model, all_instances)
+
+        all_outputs = [
+            all_outputs[i: i + len(examples)] for i in range(0, len(examples) * self.n_repeat, len(examples))
+        ]
 
         # Return None early for non-primary ranks
         if model.rank != 0:
diff --git a/eval/chat_benchmarks/LiveCodeBenchv5/eval_instruct.py b/eval/chat_benchmarks/LiveCodeBenchv5/eval_instruct.py
index 21a39e7d..b4fce775 100644
--- a/eval/chat_benchmarks/LiveCodeBenchv5/eval_instruct.py
+++ b/eval/chat_benchmarks/LiveCodeBenchv5/eval_instruct.py
@@ -81,9 +81,8 @@ def generate_responses(self, model: LM) -> Dict[str, Any]:
         examples = examples[:10]
 
         all_outputs = []
-
+        all_instances = []
         for i in range(self.n_repeat):
-            all_instances = []
             seed = [s + i for s in self.seed]
 
             for idx, example in enumerate(examples):
@@ -118,11 +117,13 @@ def generate_responses(self, model: LM) -> Dict[str, Any]:
                 instance.repeat_idx = i
                 all_instances.append(instance)
 
-            # Generate model responses
-            self.logger.info("Generating responses for LiveCodeBenchV5...")
-            outputs = self.compute(model, all_instances)
-            all_outputs.append(outputs)
+        # Generate model responses
+        self.logger.info("Generating responses for LiveCodeBenchV5...")
+        all_outputs = self.compute(model, all_instances)
+        all_outputs = [
+            all_outputs[i: i + len(examples)] for i in range(0, len(examples) * self.n_repeat, len(examples))
+        ]
 
         # Return None early for non-primary ranks
         if model.rank != 0:
             return None
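
Note (not part of the patch): every benchmark here now hoists all_instances out of the repeat loop, builds one flat list of n_repeat * len(examples) instances, issues a single compute() call, and slices the flat output list back into per-repeat groups. This only works if compute() returns exactly one output per instance, in the order the instances were appended (all of repeat 0 first, then repeat 1, and so on). Below is a minimal sketch of the re-chunking idiom under that ordering assumption; chunk_by_repeat, flat_outputs, and n_examples are illustrative stand-ins, not names from the patch.

    def chunk_by_repeat(flat_outputs, n_examples, n_repeat):
        """Split a flat, order-preserving output list into n_repeat lists of n_examples each."""
        # Guard: the grouping is only meaningful if the generation step preserved
        # instance order and returned one output per instance.
        assert len(flat_outputs) == n_examples * n_repeat
        return [
            flat_outputs[i : i + n_examples]
            for i in range(0, n_examples * n_repeat, n_examples)
        ]

    # 2 repeats over 3 examples -> [[0, 1, 2], [3, 4, 5]]
    print(chunk_by_repeat(list(range(6)), n_examples=3, n_repeat=2))

The slicing reproduces the old shape of all_outputs (a list of per-repeat output lists), so the downstream scoring code that indexes all_outputs[repeat][example] is unchanged.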